In [None]:
import pandas as pd
from itertools import combinations, count
from tqdm import tqdm
import os

import networkx as nx
import matplotlib.pyplot as plt
import numpy as np

from utils import get_faa, collect_family_pairs

In [None]:
# Display colour assigned to each PHROG functional category label.
phrog_palette = {
    "DNA, RNA and nucleotide metabolism": "red",
    "connector": "blue",
    "head and packaging": "green",
    "integration and excision": "pink",
    "lysis": "gray",
    "moron, auxiliary metabolic gene and host takeover": "brown",
    "other": "purple",
    "tail": "darkorange",
    "transcription regulation": "cyan",
    "unknown": "lightgray",
}

# analysis

In [None]:
# Header-less, tab-separated cluster table; the columns are labelled
# cluster_id and protein_id in that order.
cluster_tsv = pd.read_csv(
    'dereplicated_mmseqs_faa_cluster.tsv',
    sep='\t',
    header=None,
).set_axis(['cluster_id', 'protein_id'], axis=1)

In [None]:
# cluster_id -> list of the protein_ids that belong to that cluster
c_p = {
    cluster_id: list(members)
    for cluster_id, members in cluster_tsv.groupby('cluster_id')['protein_id']
}

In [None]:
# Reverse lookup of c_p: protein_id -> cluster_id.
# If a protein is listed under several clusters, the last one seen wins.
p_c = {}
for cluster_id, members in c_p.items():
    p_c.update(dict.fromkeys(members, cluster_id))

In [None]:
contig_dir = 'de_replicated_faas/'
# os.listdir returns entries in arbitrary order. Sort them so the contig
# processing order (and therefore any last-writer-wins updates and the row
# order of the collected pairs) is the same on every run.
contigs = sorted(os.listdir(contig_dir))

In [None]:
# Walk every contig file once and collect:
#   contig_protein_dict : contig file name -> protein IDs found in that file
#   all_proteins        : protein ID -> sequence
#   df1                 : all rows returned by collect_family_pairs, stacked
contig_protein_dict = {}
all_proteins = {}
pair_frames = []
for c in tqdm(contigs):
    c_path = os.path.join(contig_dir, c)
    contig_proteins, contig_proteins_sequences = get_faa(c_path)
    # Protein ID = first space-delimited token of the header, minus its first character.
    protein_ids = [x.split(' ')[0][1:] for x in contig_proteins]
    all_proteins.update(zip(protein_ids, contig_proteins_sequences))
    contig_protein_dict[c] = protein_ids
    # Raises KeyError if a protein is missing from the cluster table.
    contig_protein_clusters = [p_c[pid] for pid in protein_ids]
    # 1-based position of each protein within the contig file.
    protein_number = list(range(1, len(contig_proteins) + 1))
    contig_df = pd.DataFrame({
        'protein_name': contig_proteins,
        'query_id': contig_protein_clusters,
        'protein_id': protein_number,
    })
    contig_pairs = collect_family_pairs(contig_df=contig_df, neighborhood=1)
    pair_frames.append(contig_pairs)

# Concatenate once at the end: growing a DataFrame with pd.concat inside the
# loop copies all previous rows on every iteration.
if pair_frames:
    df1 = pd.concat(pair_frames, axis=0)
else:
    df1 = pd.DataFrame(columns=['p1', 'p2'])

### capsid clusters

In [None]:
# Protein IDs listed in mcp.faa (first header token with its first character removed).
capsid_headers, _ = get_faa('mcp.faa')
capsid_ids = [header.split(' ')[0][1:] for header in capsid_headers]
# Clusters containing at least one of those proteins; IDs absent from p_c are skipped.
capsid_clusters = {p_c[pid] for pid in capsid_ids if pid in p_c}

# Every cluster gets a label: 'capsid' if it is in capsid_clusters, otherwise ''.
capsid_cluster_dict = {
    cluster_id: ('capsid' if cluster_id in capsid_clusters else '')
    for cluster_id in c_p
}

## metadata information

In [None]:
# Per-cluster category predictions; later cells read the 'cluster' and
# 'predicted_category' columns of this table.
cluster_predictions = pd.read_csv('cluster_predicted_PHROG_category.csv')

In [None]:
# Translate each predicted category into its palette colour. A category that
# is not a key of phrog_palette raises KeyError.
cluster_predictions['color_map'] = [phrog_palette[x] for x in cluster_predictions['predicted_category']]
# cluster -> colour. Built directly from the two columns rather than by
# transposing the frame into one column per cluster and unwrapping
# single-element lists; if a cluster appears more than once the last row wins.
phrog_color_map = dict(zip(cluster_predictions['cluster'], cluster_predictions['color_map']))

# network construction

In [None]:
# Keep only the pairs whose two members are both known cluster IDs (keys of c_p).
p1_known = df1['p1'].isin(c_p.keys())
p2_known = df1['p2'].isin(c_p.keys())
pair_df = df1[p1_known & p2_known].copy()

In [None]:
# Count the rows for every (p1, p2) combination and lay the counts out as a
# table with p1 on the rows and p2 on the columns; combinations that never
# occur are filled with 0.
# NOTE(review): no `values=` is passed, so the shape of the result depends on
# which other columns collect_family_pairs returns — confirm pair_df's schema.
p_p = pair_df.pivot_table(index='p1', columns='p2', aggfunc=len).fillna(0)
# Remove the name from the column axis.
p_p = p_p.rename_axis(None, axis=1)
# NOTE(review): p_p_norm is just another reference to p_p; no normalization
# is applied in this cell.
p_p_norm = p_p

In [None]:
# Reshape the table to long form, one row per (p1, p2) entry, and label the
# three resulting columns.
df = (
    p_p_norm
    .stack()
    .reset_index()
    .set_axis(['p1', 'p2', 'weight'], axis=1)
)

In [None]:
# Discard the entries whose weight is zero.
df = df.loc[df['weight'].ne(0.0)]

In [None]:
# Minimum edge weight. The comparison is strict, so pairs with a weight of
# exactly `threshold` are excluded.
threshold = 1.0
threshold_df = df.loc[df['weight'].gt(threshold)]

In [None]:
## cluster node sizes
# cluster_id -> number of member proteins in that cluster
cluster_node_size = {}
for cluster_id, members in c_p.items():
    cluster_node_size[cluster_id] = len(members)

### construct graph

In [None]:
# Empty undirected graph.
# NOTE(review): G is reassigned by nx.from_pandas_edgelist in the next cell,
# so this placeholder looks redundant.
G = nx.Graph()

In [None]:
# One edge per row of threshold_df, carrying the row's 'weight' as an edge attribute.
G = nx.from_pandas_edgelist(threshold_df, source='p1', target='p2', edge_attr=['weight'])

# Node attribute name -> {node: value} mapping, applied in this order.
# Note that 'phrog_category' stores the palette colour from phrog_color_map,
# not the category label itself.
node_annotations = {
    'phrog_category': phrog_color_map,
    'cluster_size': cluster_node_size,
    'cluster_annotation': capsid_cluster_dict,
}
for attr_name, attr_values in node_annotations.items():
    nx.set_node_attributes(G, values=attr_values, name=attr_name)

In [None]:
## output graph for cytoscape visualization
# Serialises G to GraphML at the relative path 'network.graphml'.
nx.write_graphml(G, 'network.graphml')