In [1]:
import pandas as pd
import pathlib
import numpy as np
from Bio import SeqIO
from collections import Counter
import json

In [2]:
# Project locations, resolved relative to the current user's home directory.
home_dir = pathlib.Path.home() / 'Documents' / 'ms_thesis_ppi'
dataset_dir = home_dir / 'dataset'
in_file = dataset_dir / '9606_Q1.txt'
in_file_fasta = dataset_dir / '1590981466.fas.1.fasta'

# Tab-separated interaction table; every column is read as text.
df = pd.read_csv(in_file, sep="\t", header=0, dtype='unicode')
#df['UniprotName_A']

#df.pivot_table(index=['UniprotID_A'], aggfunc='size')

In [3]:
#df.pivot_table(index=['UniprotID_B'], aggfunc='size')

Collecting the unique UniProt IDs that occur in either of the two interactor columns:

In [4]:
# Flatten both interactor columns into one array, keep each ID once,
# and write the result one ID per line.
interactor_ids = df[['UniprotID_A', 'UniprotID_B']].values
uniprot_list = pd.unique(interactor_ids.ravel('K'))
np.savetxt("uniprot_list.txt", uniprot_list, fmt='%s')

Parsing the resultant fasta file after processing with CD-HIT (40% threshold)

In [5]:
# Take the second '|'-separated field of every record id
# (presumably the UniProt accession in "db|ACCESSION|name" headers -- confirm).
uniprot_id = [
    record.id.split('|')[1]
    for record in SeqIO.parse(in_file_fasta, "fasta")
]

print(len(uniprot_id))

13090


Creating a filtered dataframe that keeps only the interaction pairs in which both proteins are present in the CD-HIT-reduced FASTA file:

In [6]:
# Keep only the interactions whose two partners both appear in `uniprot_id`
# (the ids parsed from the FASTA file).
uniprot_A = df['UniprotID_A'].to_list()
uniprot_B = df['UniprotID_B'].to_list()
length = len(uniprot_A)

# Membership tests against a set are O(1); testing against the original list
# rescanned every id for every interaction row.
retained_ids = set(uniprot_id)

reduced_interaction = [
    (partner_a, partner_b)
    for partner_a, partner_b in zip(uniprot_A, uniprot_B)
    if partner_a in retained_ids and partner_b in retained_ids
]

positive_set = pd.DataFrame(reduced_interaction, columns=['int_A', 'int_B'])

In [7]:
# Persist the filtered pairs as plain text, one "int_A int_B" pair per line.
np.savetxt(fname="reduced_interaction_set.txt", X=positive_set, fmt='%s')

In [8]:
# Keep only the first listed interaction for each distinct int_A protein.
positive_set_r_l = positive_set.drop_duplicates(subset="int_A", keep="first")
positive_set_r_l

Unnamed: 0,int_A,int_B
0,A0A024R0Y4,Q8WWY3
1,A0A024R5S0,Q9NVV9
3,A0A024R6G0,P50222
4,A0A0C3SFZ9,A0A0S2Z5X4
5,A0A0C4DGV4,P50222
...,...,...
32043,P25233,Q9Y6B2
32047,Q03718,Q9Y6D9
32049,F5HEZ4,Q9Y6K9
32052,P17967,Q9Y6N9


In [41]:
# From the int_A-deduplicated pairs, keep only the first row for each int_B protein.
positive_set_r_r = positive_set_r_l.drop_duplicates(subset="int_B", keep="first")
positive_set_r_r

Unnamed: 0,int_A,int_B
0,A0A024R0Y4,Q8WWY3
1,A0A024R5S0,Q9NVV9
3,A0A024R6G0,P50222
4,A0A0C3SFZ9,A0A0S2Z5X4
6,A0A1B0GVM0,Q8N5M1
...,...,...
32017,P19508,Q9Y4B6
32021,Q5NHB5,Q9Y4I1
32035,P00974,Q9Y5Y6
32040,Q8CLD0,Q9Y657


In [42]:
# Sanity check: after drop_duplicates on int_B, no int_B value may repeat.
# (The original computed value_counts().max() here but discarded the result.)
assert positive_set_r_r['int_B'].is_unique, "int_B still contains duplicates"

# Distinct proteins that take part in the reduced interaction set.
prot_list = pd.unique(positive_set_r_r[['int_A', 'int_B']].values.ravel('K'))
print(len(prot_list))

5510


In [43]:
# Zero-initialised counter template with one key per id parsed from the FASTA file.
id_dict = {accession: 0 for accession in uniprot_id}
len(id_dict)


13090

In [12]:
# Count how often each protein in id_dict occurs in the full interaction
# table (either column), then store the counts as JSON.

occurrences = Counter(df["UniprotID_A"].to_list())
occurrences.update(df["UniprotID_B"].to_list())

# Ids that are not keys of id_dict are ignored; key order follows id_dict.
dict_freq = {
    accession: start + occurrences[accession]
    for accession, start in id_dict.items()
}

with open('bind_frequency.json', 'w') as out_handle:
    json.dump(dict_freq, out_handle)


In [15]:
# Same tally as above, restricted to the pairs left in positive_set_r_r.

dict_freq_r = id_dict.copy()
for column in ("int_A", "int_B"):
    for accession in positive_set_r_r[column]:
        # Only ids that are already keys of the template are counted.
        if accession in dict_freq_r:
            dict_freq_r[accession] += 1
        
#dict_freq_r

In [59]:
# Protein with the highest count in the reduced set, and that count.
# (Swap max for min to get the least frequent protein instead.)
maximum, highest_count = max(dict_freq_r.items(), key=lambda item: item[1])
print(highest_count)


2


Next task: remove redundant pairs such that:
1. Each protein appears in at most one pair (the first pair containing a protein is kept; every later pair that contains an already-used protein is dropped).
2. The reduced dataset obtained above is used as the input.

In [52]:
# Greedy selection: walk the reduced pairs in order and keep a pair only if
# neither partner has been used by an earlier kept pair.
filter_dict = id_dict.copy()   # protein id -> number of times it was used
final_list = []

for partner_A, partner_B in zip(positive_set_r_r['int_A'], positive_set_r_r['int_B']):
    if filter_dict[partner_A] == 0 and filter_dict[partner_B] == 0:
        # Mark both partners as used (a self-interaction marks the same id twice).
        filter_dict[partner_A] += 1
        filter_dict[partner_B] += 1
        final_list.append([partner_A, partner_B])

# Number of retained pairs; derived from the list instead of a manual counter.
i = len(final_list)
print(i)

# Show the last retained pair without hard-coding its position, so the cell
# keeps working when the input data (and therefore the pair count) changes.
final_list[-1] if final_list else None

2550


['Q8CLD0', 'Q9Y657']

In [65]:
# Column labels must be an ordered sequence: the original passed a set, whose
# iteration order is undefined and could silently swap the int_A/int_B labels.
final_frame = pd.DataFrame(final_list, columns=['int_A', 'int_B'])
final_frame.to_csv("c3_data_positive.csv", index=False)

'int_A,int_B\r\nA0A024R0Y4,Q8WWY3\r\nA0A024R5S0,Q9NVV9\r\nA0A024R6G0,P50222\r\nA0A0C3SFZ9,A0A0S2Z5X4\r\nA0A1B0GVM0,Q8N5M1\r\nA0AVI4,Q96H12\r\nA0AVK6,Q8IUH5\r\nA0AVN2,O95751\r\nA0JLT2,Q6IAN0\r\nA0JNW5,Q8ZAF0\r\nA0MZ66,A0MZ66\r\nA1DRY3,P62136\r\nA1E959,Q9Y6K9\r\nA1L0T0,P54253\r\nA1L162,P01375\r\nA1L190,Q9UL45\r\nA1L3X0,Q96EV8\r\nA1L4F5,A1L4F5\r\nA1L4G7,A1L4G7\r\nA1L4H1,Q7Z6G3\r\nA1L4K1,Q9H0A9\r\nA1X283,P48023\r\nA1YPR0,Q9H6L5\r\nA2A2Z9,H9XIJ5\r\nA2IDD5,Q8D194\r\nA2RU00,O76083\r\nA2RU48,Q9HCM9\r\nA2RUB6,Q99IB8\r\nA2RUS2,Q8D1J6\r\nA2VCK2,Q8N9N5\r\nA2VEC9,Q96AY4\r\nA3KMH1,Q96KG7\r\nA3KN83,Q8D0V0\r\nA4D1E9,Q08379\r\nA4D1P6,P42858\r\nA4D1W7,Q12841\r\nA4D2P6,Q9UHR4\r\nA4FU01,O15198\r\nA4UGR9,P20929\r\nA5D8T8,Q6UY14\r\nA5D8V6,Q9UGN5\r\nA5D8V7,O43463\r\nA5PKW4,Q8WVB3\r\nA5PLN9,Q8CKW7\r\nA5YKK6,Q9NZN8\r\nA6H8Y1,O60885\r\nA6NC98,Q5NI33\r\nA6NHR9,Q8ZEC9\r\nA6NI79,Q9NYL2\r\nA6NIH7,Q7Z494\r\nA6NJ69,Q13387\r\nA6NK53,O00555\r\nA6NM11,Q81SN0\r\nA6NN06,Q99750\r\nA6PVS8,O43639\r\nA7E1X5,Q5NET6\r\nA7E2V4,P

The following function prints the tree hierarchy of everything below a given directory:

In [3]:
def tree(directory):
    """Print every file and folder below `directory`, indented by depth.

    Args:
        directory: a `pathlib.Path` (it must provide `rglob`) used as the root.

    Returns:
        None. The listing is written to standard output: first the root
        itself, then all descendants in sorted order, each indented by four
        spaces per level below the root.
    """
    print(f'+ {directory}')
    entries = sorted(directory.rglob('*'))
    for entry in entries:
        levels = len(entry.relative_to(directory).parts)
        print('    ' * levels + f'-> {entry.name}')
        
# tree(home_dir)