The aim of this notebook is to determine whether any InterPro categories correlate with better performance.

In [None]:
from matplotlib.ticker import FuncFormatter
from scipy.stats import mannwhitneyu, pearsonr
from pandas.plotting import table
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import numpy as np
import scipy as sp

In [None]:
# Load the InterPro table (tab-separated) and remove the columns listed below.
dropped_columns = [
    'region_1',
    'region_2',
    '2_aligned',
    'organism',
    'date',
    'notes',
    'percent_region_1',
    'percent_region_2',
]
df = pd.read_csv('./project_pipeline/data/interpro.tsv', sep='\t').drop(columns=dropped_columns)

df.head()

In [None]:
# Reshape to long format: every non-identifier (InterPro) column is stacked into a
# single 'value' column, and 'interpro' records which column each value came from.
id_columns = ['uniprot', 'pdb', 'complex_rmsd', '2_comp', 'conformation', 'state']

cat = (
    df.melt(id_vars=id_columns, var_name='interpro', value_name='value')
      .rename(columns={'2_comp': 'IMAE'})
)
cat.head()

In [68]:
# Do any of these categories have better IMAE (originally `2_comp`) values than others?
# We'll run a MANOVA test to compare the different categories,
# starting with per-category summary statistics.

def print_full(x):
    """Print a pandas object with row truncation disabled.

    The ``display.max_rows`` option is lifted only for the duration of the
    ``print`` call. ``pd.option_context`` restores whatever value was active
    beforehand (not merely the pandas default), and does so even if printing
    raises.

    Parameters
    ----------
    x : pandas.DataFrame or pandas.Series
        Object to print in full.
    """
    # None means "no row limit", so every row is shown regardless of len(x).
    with pd.option_context('display.max_rows', None):
        print(x)

# Summary statistics for each distinct entry of the melted 'value' column,
# printed without row truncation.
by_value = cat.groupby('value')
group = by_value.describe()
print_full(group)

                                                   complex_rmsd              \
                                                          count        mean   
value                                                                         
3'5'-cyclic nucleotide phosphodiesterase                    4.0    2.111250   
3'5'-cyclic nucleotide phosphodiesterase, catal...          2.0    1.117000   
3'5'-cyclic nucleotide phosphodiesterase, catal...          2.0    3.105500   
3'5'-cyclic nucleotide phosphodiesterase, catal...          2.0    1.117000   
3'5'-cyclic nucleotide phosphodiesterase, conse...          4.0    2.111250   
ABC1 atypical kinase-like domain, ADCK3-like do...          4.0    2.645750   
ATP-dependent RNA helicase DEAD-box, conserved ...          6.0    7.737167   
Acidic leucine-rich nuclear phosphoprotein 32               3.0    1.249333   
Actinin-type actin-binding domain, conserved site           3.0   46.279667   
Adaptor protein Cbl                                 

In [None]:
# Multicollinearity check: compute a variance inflation factor (VIF) per predictor.
X = cat[['complex_rmsd', 'IMAE', 'state', 'conformation']]
# Convert categorical variables to dummy variables.
# dtype=float matters: pandas >= 2.0 emits bool dummies by default, and bool columns
# next to float columns turn the design matrix into an object array, which the
# regressions run inside variance_inflation_factor fail on.
X = pd.get_dummies(X, drop_first=True, dtype=float)

# Check for NaNs or infinite values
if X.isnull().values.any():
    X = X.fillna(0)
    print("Filled NaNs with 0.")

# Ensure there are no infinite values
X = X.replace([np.inf, -np.inf], 0)

# Check for constant columns
constant_columns = [col for col in X.columns if X[col].nunique() == 1]
if constant_columns:
    print("Constant columns found:", constant_columns)
    X = X.drop(columns=constant_columns)

# Build the numeric design matrix once instead of re-deriving it for every column.
# NOTE(review): variance_inflation_factor uses exactly the columns it is given, so no
# intercept column is part of these regressions — confirm that is intended.
exog = X.to_numpy(dtype=float)

vif_data = pd.DataFrame({
    'feature': X.columns,
    'VIF': [variance_inflation_factor(exog, i) for i in range(exog.shape[1])],
})

print(vif_data)

In [None]:
# Split the long-format table into the melted InterPro values (X) and the
# per-row measurement/label columns (Y).
# NOTE(review): this rebinds X, which the VIF cell uses for its design matrix, and the
# MANOVA cell passes `cat` straight to the formula API — X and Y look unused in the
# visible cells; confirm they are still needed.
X = cat.loc[:, ['value']]
Y = cat.loc[:, ['complex_rmsd', 'IMAE', 'conformation', 'state']]

In [None]:
# Multivariate test of whether the dependent variables differ across 'value' groups.
# NOTE(review): `state` and `conformation` are dummy-encoded as categoricals in the
# VIF cell; MANOVA dependent variables are normally continuous — confirm they belong
# on the left-hand side of the formula.
manova_formula = 'complex_rmsd + IMAE + state + conformation ~ value'
manova = MANOVA.from_formula(manova_formula, data=cat)

result = manova.mv_test()
print(result)