This notebook is for combining the protein lists. As I've already combined Jorge's and my files, I'm just going to add Dokyun's. 

In [150]:
import pandas as pd
import csv
import requests
import numpy as np
import os
import sys

ModuleNotFoundError: No module named 'pdbecif'

In [137]:
na = pd.read_csv('./source_protein_lists/autoinhibited_proteins_no_structures_dokyun_na.csv')

act = pd.read_csv('./source_protein_lists/autoinhibited_proteins_no_autoinhibited_structures_dokyun_na.csv')

aut = pd.read_csv('./project_pipeline/data/proteins.tsv', sep='\t')

clas = pd.read_csv('./project_pipeline/data/classified_files_3.tsv', sep='\t')

Let's look at the intersection of these dataframes to make sure there's no overlap

In [138]:
na_pro = na['UNIPROT_AC'].unique()

act_pro = act['UNIPROT_AC'].unique()

common = np.intersect1d(na_pro, act_pro)

print(len(common))

0


Let's look at the intersection of the no-structures dataframe with my original protein list.

In [139]:
aut_pro = aut['uniprot'].unique()

common2 = np.intersect1d(na_pro, aut_pro)

common3 = np.intersect1d(act_pro, aut_pro)

print('The autoinhibitory and no structure dataframes have this many proteins in common: ' + str(len(common2)))

print('The autoinhibitory and active only dataframes have this many proteins in common: ' + str(len(common3)))

The autoinhibitory and no structure dataframes have this many proteins in common: 0
The autoinhibitory and active only dataframes have this many proteins in common: 126


In [140]:
clas_pro = clas['uniprot'].unique()

common4 = np.intersect1d(act_pro, clas_pro)

print('The classified dataframe and the active only dataframe have this many proteins in common: ' + str(len(common4)))

The classified dataframe and the active only dataframe have this many proteins in common: 24


I need to clean up the domain region specifications within the no structures dataframe.

In [141]:
na.head(10)

Unnamed: 0,UNIPROT_AC,Autoinhibitory element,Target element,Relief mechanism
0,Q7XEK4,2-68 (N-terminal autoinhibitory region),439-812 (ATPase domain),Partner binding(Calmodulin)
1,Q12774,1093-1100 (Autoinhibitory helix),1174-1358 (DH domain),PTM (Phosphorylation of autoinhibitory helix)
2,A0A044RE18,42-116 (Prodomain),159-447 (Kexin/furin catalytic domain),Cleavage (Cleavage of prodomain at the RRKR mo...
3,Q39253,1-36 (N-terminal regulatory region),56-62 (7 amino acids within the first quarter ...,PTM (Phosphorylation of Ser25); Partner bindin...
4,Q5VT25,658-930 (Coiled-coil domains),77-343 (Protein kinase domain),Ligand binding (phorbol 12-myristate 13-acetat...
5,O15078,1-580 (N-terminal autoinhibitory region),1966-2479 (C-terminal autoinhibitory region),Partner binding (CP110 binding to membrane-bin...
6,O15078,1966-2479 (C-terminal autoinhibitory region),1-580 (N-terminal autoinhibitory region),Partner binding (CP110 binding to membrane-bin...
7,Q9JK25,1258-1314 (C-terminal zinc knuckle domains),60-278 (CAP-Gly domain),Others (Binding of CLIP-170 to microtubules)
8,Q8K382,527-547 (C-terminal autoinhibitory region),1-406 (DENN domain),PTM (Akt-mediated phosphorylation in autoinhib...
9,P48608,114-369 (DID domain),59-241 (GBD domain),Partner binding (Binding of GTP-bound Rho prot...


In [142]:
conv = na.rename(columns={'UNIPROT_AC': 'uniprot', 'Autoinhibitory element': 'region_1', 'Target element': 'region_2'})

# Clean up each of the region columns to only include the sequence boundaries
conv['region_1'] = conv['region_1'].str.split('(').str[0]
conv['region_2'] = conv['region_2'].str.split('(').str[0]

conv.head()

Unnamed: 0,uniprot,region_1,region_2,Relief mechanism
0,Q7XEK4,2-68,439-812,Partner binding(Calmodulin)
1,Q12774,1093-1100,1174-1358,PTM (Phosphorylation of autoinhibitory helix)
2,A0A044RE18,42-116,159-447,Cleavage (Cleavage of prodomain at the RRKR mo...
3,Q39253,1-36,56-62,PTM (Phosphorylation of Ser25); Partner bindin...
4,Q5VT25,658-930,77-343,Ligand binding (phorbol 12-myristate 13-acetat...


In [143]:
print(len(conv))

conv = conv.dropna(subset=['region_1', 'region_2'])

print(len(conv))

101
100


I need to merge together any rows that have the same UniProt ID.

In [144]:
# Merge rows with the same uniprot
# Unfortunately, some of these have region_1 repeated in region_2, and some of them repeat the same region multiple times. So I have to account for this.

conv = conv.groupby('uniprot').agg({'region_1': ', '.join, 'region_2': ', '.join}).reset_index()

conv.head(20)

Unnamed: 0,uniprot,region_1,region_2
0,A0A044RE18,42-116,159-447
1,A0A377U130,1-56,108-477
2,A0A544CH04,112-139,1-101
3,A1X283,368-427,5-129
4,A2A5C2,"1-42 , 417-480","334-417 , 334-417"
5,A9QM74,1-92,102-495
6,B1MTB0,186-215,8-163
7,C8ZE58,604-741,855-886
8,G4SLH0,16206-16264,15934-16189
9,O15078,"1-580 , 1966-2479","1966-2479 , 1-580"


In [145]:
# Define a function to check for whether region_1 and region_2 are the exact same or whether the sequence bounds have been repeated.

def check_regions(df):

    for idx, row in df.iterrows():
        reg1 = sorted(set(row['region_1'].split(',')))
        reg2 = sorted(set(row['region_2'].split(',')))

        reg1 = [r.strip() for r in reg1]
        reg2 = [r.strip() for r in reg2]

        if reg1 == reg2:
            regs = list(reg1)
            df.at[idx, 'region_1'] = regs[0]
            df.at[idx, 'region_2'] = regs[1]

        else:
            df.at[idx, 'region_1'] = ', '.join(reg1)
            df.at[idx, 'region_2'] = ', '.join(reg2)

    return df

def retrieve_fn(df, fp):

    # Get files present in the directory
    files = os.listdir(fp)
    files = [f for f in files if '.cif' in f]


    for idx, row in df.iterrows():
        uniprot = row['uniprot']

        # Find file with uniprot in the name
        fn = [f for f in files if uniprot in f]

        if len(fn) == 0:
            df.drop(idx, inplace=True)

        else:
            df.at[idx, 'af_filename'] = str(fn[0])

    return df

In [146]:
fixed = check_regions(conv)

fixed.head(20)

Unnamed: 0,uniprot,region_1,region_2
0,A0A044RE18,42-116,159-447
1,A0A377U130,1-56,108-477
2,A0A544CH04,112-139,1-101
3,A1X283,368-427,5-129
4,A2A5C2,"417-480, 1-42","334-417, 334-417"
5,A9QM74,1-92,102-495
6,B1MTB0,186-215,8-163
7,C8ZE58,604-741,855-886
8,G4SLH0,16206-16264,15934-16189
9,O15078,"1966-2479, 1-580","1-580, 1966-2479"


In [147]:
# Add the filenames by going through the folder
fp = './project_pipeline/data/input/Alphafold_cif'

fixed = retrieve_fn(fixed, fp)

# Remove whitespce at start of regions
# fixed['region_1'] = fixed['region_1'].str.strip()
# fixed['region_2'] = fixed['region_2'].str.strip()

fixed.head()

Unnamed: 0,uniprot,region_1,region_2,pred_fn
0,A0A044RE18,42-116,159-447,AF-A0A044RE18-F1-model_v4.cif
1,A0A377U130,1-56,108-477,AF-A0A377U130-F1-model_v4.cif
2,A0A544CH04,112-139,1-101,AF-A0A544CH04-F1-model_v4.cif
3,A1X283,368-427,5-129,AF-A1X283-F1-model_v4.cif
4,A2A5C2,"417-480, 1-42","334-417, 334-417",AF-A2A5C2-F1-model_v4.cif


In [148]:
# Save to csv
fixed.to_csv('./project_pipeline/data/autoinhibited_proteins_no_structures_dokyun.csv', index=False)