In [4]:
from matplotlib.ticker import FuncFormatter
from statannotations.Annotator import Annotator
from scipy.stats import mannwhitneyu
from pandas.plotting import table
import matplotlib.pyplot as plt
import dataframe_image as dfi
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import numpy as np
import scipy as sp
import scienceplots

In [5]:
# Read the two tab-separated inputs: the classified structure table and the
# AlphaFold interface table.
cl, afinf = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        './project_pipeline/data/classified_files.tsv',
        './project_pipeline/data/alphafold_interface.tsv',
    )
)

afinf.head()

Unnamed: 0,uniprot,region_1,region_2,af_filename,interacting_residue_pairs,interface_residues,number_interface_residues,region_1 search,region_2 search
0,P29476,815-870,757-949,F-P29476-F1-model_v3.cif,"[(834, 837), (847, 850), (850, 854), (845, 901...","{758, 759, 760, 761, 762, 787, 788, 789, 791, ...",93.0,"[815, 816, 817, 818, 819, 820, 821, 822, 823, ...","[757, 758, 759, 760, 761, 762, 763, 764, 765, ..."
1,P18031,353-387,2-277,F-P18031-F1-model_v3.cif,,,,"[353, 354, 355, 356, 357, 358, 359, 360, 361, ...","[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1..."
2,P00533,"166-309,482-618",712-979,F-P00533-F1-model_v3.cif,,,,"[512, 513, 514, 515, 516, 517, 518, 519, 520, ...","[712, 713, 714, 715, 716, 717, 718, 719, 720, ..."
3,P04637,364-393,102-292,F-P04637-F1-model_v3.cif,,,,"[364, 365, 366, 367, 368, 369, 370, 371, 372, ...","[102, 103, 104, 105, 106, 107, 108, 109, 110, ..."
4,O14757,391-476,9-265,F-O14757-F1-model_v3.cif,"[(205, 449), (96, 452), (93, 452), (207, 420),...","{134, 13, 14, 17, 419, 420, 422, 442, 447, 448...",28.0,"[391, 392, 393, 394, 395, 396, 397, 398, 399, ...","[9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20..."


In [6]:
# Label every AlphaFold entry from its interface-residue count.
def _af_conformation_label(n_interface_residues):
    """Return 'Closed' for a positive interface-residue count, otherwise 'Open'.

    A missing (NaN) count compares False against 0 and therefore yields 'Open'.
    """
    if n_interface_residues > 0:
        return 'Closed'
    return 'Open'


afinf['af_conformation'] = afinf['number_interface_residues'].apply(_af_conformation_label)

In [7]:
# Pick one exemplar row per protein (uniprot): the row with the lowest 2_comp.
#
# GroupBy.first() is deliberately NOT used here. It returns the first non-null
# value of each column independently, so a missing field in the best row
# (e.g. `state`) would be silently filled in from a different structure of the
# same protein. Keeping the first *whole row* after sorting guarantees that
# every field of an exemplar comes from one and the same row of `cl`.
exem = (
    cl.dropna(subset=['uniprot'])               # groupby also ignored rows without a key
      .sort_values('2_comp', kind='mergesort')  # stable sort; NaN 2_comp values go last
      .drop_duplicates(subset='uniprot', keep='first')
      .sort_values('uniprot')                   # same row order the groupby produced
      .reset_index(drop=True)
)

# groupby(..., as_index=False) placed the key column first; keep that layout.
exem = exem[['uniprot'] + [col for col in exem.columns if col != 'uniprot']]

print(exem['conformation'].value_counts())

Closed    116
Open       12
Name: conformation, dtype: int64


In [8]:
# Attach the AlphaFold conformation label to each exemplar row. The left join
# keeps every exemplar even when no AlphaFold entry matches. Afterwards the
# organism/date/notes columns are dropped.
af_conf = afinf.loc[:, ['uniprot', 'af_conformation']]
merge = (
    pd.merge(exem, af_conf, how='left', on='uniprot')
    .drop(columns=['organism', 'date', 'notes'])
)

merge.head()

Unnamed: 0,uniprot,pdb,region_1,region_2,complex_rmsd,percent_region_1,percent_region_2,2_aligned,2_comp,state,conformation,af_conformation
0,A0A0R4I961,6gtv,155-279,3-147,1.45,100.0,100.0,1.673,1.135,Active,Closed,Closed
1,B5XAZ0,7bfl,116-120,1-115,2.591,100.0,79.130435,2.76,0.807,Autoinhibited,Closed,Closed
2,D2AJU0,6lol,57-228,251-545,3.539,100.0,81.694915,2.829,7.86,Autoinhibited,Closed,Closed
3,O00571,7liu,132-181,211-575,2.074,94.0,100.0,2.105,1.984,Autoinhibited,Closed,Closed
4,O08722,3g5b,541-687,"688-828,853-942",0.922,99.319728,99.5671,0.959,0.927,Autoinhibited,Closed,Closed


In [9]:
# Reduce to distinct (uniprot, af_conformation) pairs, then count how many
# of them carry each AlphaFold label.
uni_only = (
    merge.loc[:, ['uniprot', 'af_conformation']]
    .drop_duplicates()
    .reset_index(drop=True)
)
vals = uni_only.loc[:, 'af_conformation'].value_counts()
vals

Closed    115
Open       13
Name: af_conformation, dtype: int64

In [10]:
# Count exemplar rows per `state` label. value_counts() leaves out missing
# values by default, so rows without a state are not tallied.
states = merge.loc[:, 'state'].value_counts()
states

Autoinhibited    67
Active           30
Name: state, dtype: int64

In [11]:
# Count exemplar rows per `conformation` label.
confs = merge.loc[:, 'conformation'].value_counts()
confs

Closed    116
Open       12
Name: conformation, dtype: int64

In [12]:
# Count exemplar rows per AlphaFold-derived `af_conformation` label.
af_confs = merge.loc[:, 'af_conformation'].value_counts()
af_confs

Closed    115
Open       13
Name: af_conformation, dtype: int64

In [13]:
# Contingency table: `state` (rows) x `conformation` (columns).
# Rows with a missing value in either column are left out, because groupby
# drops NaN keys by default.
# fill_value=0 fills unobserved combinations during the pivot itself, so the
# counts stay integers instead of being upcast to float by NaN + fillna(0).
s_v_c = merge.groupby(['state', 'conformation']).size().unstack(fill_value=0)
s_v_c

conformation,Closed,Open
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Active,25,5
Autoinhibited,64,3


In [14]:
# Contingency table: `conformation` (rows) x `af_conformation` (columns).
# fill_value=0 fills unobserved combinations during the pivot itself, so the
# counts stay integers instead of being upcast to float by NaN + fillna(0).
c_v_ac = merge.groupby(['conformation', 'af_conformation']).size().unstack(fill_value=0)
c_v_ac

af_conformation,Closed,Open
conformation,Unnamed: 1_level_1,Unnamed: 2_level_1
Closed,110,6
Open,5,7


In [15]:
# Contingency table: `state` (rows) x `af_conformation` (columns).
# Rows with a missing value in either column are left out, because groupby
# drops NaN keys by default.
# fill_value=0 fills unobserved combinations during the pivot itself, so the
# counts stay integers instead of being upcast to float by NaN + fillna(0).
s_v_ac = merge.groupby(['state', 'af_conformation']).size().unstack(fill_value=0)
s_v_ac

af_conformation,Closed,Open
state,Unnamed: 1_level_1,Unnamed: 2_level_1
Active,28,2
Autoinhibited,63,4


In [16]:
# Three-way table: (`state`, `af_conformation`) pairs as rows, `conformation`
# as columns.
# Some combinations are never observed, so a plain unstack() inserts NaN and
# the subsequent fillna(0) leaves the counts as floats (25.0, 3.0, ...).
# Passing fill_value=0 to unstack() fills those cells during the pivot and
# keeps the counts as integers.
s_v_ac_v_c = (
    merge.groupby(['state', 'af_conformation', 'conformation'])
    .size()
    .unstack(fill_value=0)
)
s_v_ac_v_c

Unnamed: 0_level_0,conformation,Closed,Open
state,af_conformation,Unnamed: 2_level_1,Unnamed: 3_level_1
Active,Closed,25.0,3.0
Active,Open,0.0,2.0
Autoinhibited,Closed,62.0,1.0
Autoinhibited,Open,2.0,2.0
