In [1]:
import re
import numpy as np
import pandas as pd
import statistics
import math

### Per-residue projection is possible with this record set...

In [2]:
# MSA data
# Load the position-ranking table as a structured array: `names=True` takes the
# field names from the header line and `dtype=None` lets numpy infer a type
# for each column individually.
msa_data = np.genfromtxt(
    r"../pos_ranking_nostar.dat",
    names=True,
    dtype=None,
    encoding="utf-8",
)

# Keep only the "pos_scr" field; later cells index into this array by position.
msa_scores = msa_data["pos_scr"]

# Display how many positions were loaded as a quick sanity check.
len(msa_scores)

291

In [3]:
# Read the interaction table and keep only its first row as a plain dict that
# maps each column label to that row's cell value. In the displayed output the
# values are strings that look like lists, e.g. '[6, 7, 6, 6, 9]'.
df = pd.read_csv(r"../int_in_all_struct.csv")
interaction_rows = df.to_dict(orient="records")
per_residue_interactions = interaction_rows[0]
per_residue_interactions

{'2': '[6, 7, 6, 6, 9]',
 '3': '[7, 9, 8, 7, 7]',
 '4': '[9, 12, 9, 13, 7]',
 '5': '[11, 11, 11, 11, 11]',
 '6': '[6, 6, 6, 6, 9, 6]',
 '8': '[9, 11, 11, 7, 8, 7]',
 '9': '[15, 11, 12, 16, 15, 12]',
 '10': '[8, 7, 7, 7, 6, 8]',
 '11': '[6, 7, 6, 7, 5, 6]',
 '12': '[12, 11, 12, 12, 9, 12]',
 '13': '[12, 11, 11, 13, 11, 13]',
 '14': '[5, 5, 5, 5, 3, 6]',
 '15': '[4, 7, 4, 3, 4, 4]',
 '16': '[7, 15, 14, 7, 9, 9]',
 '17': '[6, 8, 7, 7, 8, 7]',
 '18': '[8, 9, 8, 7, 11, 7]',
 '19': '[13, 15, 13, 13, 11, 10]',
 '20': '[12, 12, 13, 13, 12, 13]',
 '21': '[12, 12, 13, 13, 12, 13]',
 '22': '[17, 16, 17, 17, 14, 17]',
 '23': '[17, 17, 18, 14, 15, 16]',
 '24': '[14, 15, 14, 18, 13, 13]',
 '25': '[13, 17, 15, 16, 15, 14]',
 '26': '[13, 12, 13, 13, 10, 13]',
 '27': '[10, 14, 11, 11, 9, 13]',
 '28': '[5, 6, 5, 6, 7, 7]',
 '29': '[6, 5, 4, 5, 6, 5]',
 '30': '[3, 5, 4, 5, 4, 3]',
 '31': '[6, 3, 5, 4, 3, 5]',
 '32': '[5, 6, 7, 6, 6, 5]',
 '33': '[7, 4, 11, 7, 3, 7]',
 '35': '[5, 4, 5, 6, 7, 4]',
 '36': '

In [4]:
# Coefficient of variation (stdev / mean) of the interaction counts recorded
# for each residue. Dividing by the mean makes the spread comparable between
# residues that have very different typical interaction counts.
# (Another option could be the Gini inequality coefficient.)

# Residues with fewer than this many interaction counts are left out.
MIN_COUNTS_PER_RESIDUE = 6

# NOTE: the variable name (including its spelling) is kept as-is because a
# later cell reads `coeff_variantions`.
coeff_variantions = {}
for msa_residue, interaction_counts_string in per_residue_interactions.items():
    # The counts arrive as text such as "[6, 7, 6, 6, 9]". Turn brackets and
    # commas into whitespace and split on any whitespace, so "[6,7]" is not
    # merged into 67 and "[]" yields an empty list instead of failing on
    # int(""). The pattern is a raw string so the escaped brackets are not
    # treated as (invalid) string escape sequences.
    cleaned = re.sub(r"[\[\],]", " ", interaction_counts_string)
    interaction_counts = [int(count) for count in cleaned.split()]

    # Focus the analysis only on residues with enough counts.
    if len(interaction_counts) < MIN_COUNTS_PER_RESIDUE:
        continue

    mean = statistics.mean(interaction_counts)
    if mean == 0:
        # stdev / mean is undefined when the mean is zero; skip the residue
        # rather than aborting the whole cell with ZeroDivisionError.
        continue

    stdev = statistics.stdev(interaction_counts)
    coeff_variantions[msa_residue] = round(stdev / mean, 3)

coeff_variantions

{'6': 0.188,
 '8': 0.208,
 '9': 0.154,
 '10': 0.105,
 '11': 0.122,
 '12': 0.107,
 '13': 0.083,
 '14': 0.203,
 '15': 0.315,
 '16': 0.343,
 '17': 0.105,
 '18': 0.181,
 '19': 0.141,
 '20': 0.044,
 '21': 0.044,
 '22': 0.074,
 '23': 0.091,
 '24': 0.129,
 '25': 0.094,
 '26': 0.098,
 '27': 0.164,
 '28': 0.149,
 '29': 0.146,
 '30': 0.224,
 '31': 0.279,
 '32': 0.129,
 '33': 0.432,
 '35': 0.226,
 '36': 0.146,
 '37': 0.184,
 '38': 0.06,
 '39': 0.216,
 '40': 0.082,
 '41': 0.25,
 '42': 0.132,
 '43': 0.119,
 '44': 0.105,
 '45': 0.143,
 '46': 0.113,
 '47': 0.129,
 '48': 0.084,
 '49': 0.069,
 '50': 0.072,
 '51': 0.099,
 '52': 0.106,
 '53': 0.091,
 '54': 0.118,
 '55': 0.129,
 '56': 0.112,
 '57': 0.136,
 '58': 0.091,
 '59': 0.306,
 '60': 0.16,
 '61': 0.173,
 '62': 0.465,
 '63': 0.274,
 '64': 0.316,
 '67': 0.097,
 '68': 0.066,
 '69': 0.052,
 '70': 0.106,
 '71': 0.046,
 '72': 0.107,
 '73': 0.079,
 '74': 0.174,
 '75': 0.485,
 '76': 0.442,
 '77': 0.313,
 '78': 0.167,
 '81': 0.296,
 '83': 0.128,
 '84': 0.11,

In [5]:
# Pair every residue's coefficient of variation with an MSA score, giving
# {residue label: [coefficient of variation, MSA score]}.
#
# NOTE(review): the score is looked up at position int(residue) + 1. The
# original comment only says "0 indexed"; if the residue labels are 1-based and
# `msa_scores` is 0-based, the offset would be expected to be -1 instead.
# Confirm the numbering convention of both files before trusting the pairing.
combined_variance_msa = {
    residue_label: [variation, msa_scores[int(residue_label) + 1]]
    for residue_label, variation in coeff_variantions.items()
}
combined_variance_msa

{'6': [0.188, 0.59],
 '8': [0.208, 0.48],
 '9': [0.154, 0.6],
 '10': [0.105, 0.77],
 '11': [0.122, 0.76],
 '12': [0.107, 0.56],
 '13': [0.083, 0.5],
 '14': [0.203, 0.43],
 '15': [0.315, 0.56],
 '16': [0.343, 0.75],
 '17': [0.105, 0.38],
 '18': [0.181, 0.71],
 '19': [0.141, 0.96],
 '20': [0.044, 0.78],
 '21': [0.044, 0.59],
 '22': [0.074, 0.65],
 '23': [0.091, 0.65],
 '24': [0.129, 0.75],
 '25': [0.094, 0.69],
 '26': [0.098, 0.48],
 '27': [0.164, 0.75],
 '28': [0.149, 0.73],
 '29': [0.146, 0.68],
 '30': [0.224, 0.45],
 '31': [0.279, 0.67],
 '32': [0.129, 0.0],
 '33': [0.432, 0.48],
 '35': [0.226, 0.51],
 '36': [0.146, 0.55],
 '37': [0.184, 0.64],
 '38': [0.06, 0.55],
 '39': [0.216, 0.73],
 '40': [0.082, 0.9],
 '41': [0.25, 0.58],
 '42': [0.132, 0.33],
 '43': [0.119, 0.37],
 '44': [0.105, 0.89],
 '45': [0.143, 0.68],
 '46': [0.113, 0.42],
 '47': [0.129, 0.86],
 '48': [0.084, 0.54],
 '49': [0.069, 0.51],
 '50': [0.072, 0.41],
 '51': [0.099, 0.58],
 '52': [0.106, 0.71],
 '53': [0.091, 0.47

In [6]:
# Build a table with one row per residue. The dict maps residue -> [cv, msa],
# so the frame is first created with residues as columns and then transposed
# to turn them into the row index.
msa_variance_df = pd.DataFrame(combined_variance_msa).transpose()

# Label the two value columns in the order they were stored in each list.
msa_variance_df.columns = ["Coeff of Variation", "MSA value"]

In [7]:
# Pull the MSA column out as a plain Python list.
msa_values = [value for value in msa_variance_df["MSA value"]]

# Log transform left disabled. NOTE(review): math.log raises ValueError for 0,
# and the paired data shown earlier contains an MSA value of 0.0 (residue '32'),
# which is presumably why this line was switched off.
#msa_variance_df["log MSA value"] = [ math.log(x) for x in msa_values ]

In [8]:
# Pairwise Pearson correlation (pandas' default method, spelled out here)
# between the coefficient of variation and the MSA value.
msa_variance_df.corr(method="pearson")

Unnamed: 0,Coeff of Variation,MSA value
Coeff of Variation,1.0,-0.174066
MSA value,-0.174066,1.0


In [9]:
# plotly.express is imported in this cell because it is only needed for this
# figure.
import plotly.express as px

# Scatter of per-residue interaction variability (y) against MSA value (x),
# one point per row of msa_variance_df.
fig = px.scatter(
    msa_variance_df,
    x="MSA value",
    y="Coeff of Variation",
)
fig.show()