In [5]:
import pandas as pd
import numpy as np
from biopandas.pdb import PandasPdb
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from collections import defaultdict
import pickle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GroupKFold, KFold, RepeatedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import *
from sklearn.kernel_ridge import KernelRidge


import matplotlib.pyplot as plt
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# deprecated the old names (they are gone in later releases). Prefer the new
# name when this Matplotlib knows it, otherwise fall back to the legacy name.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
import seaborn as sns

In [9]:
# https://biopython.org/docs/1.75/api/Bio.SeqUtils.ProtParam.html
def get_protein_features(sequences):
    """Compute Biopython ``ProteinAnalysis`` descriptors for each sequence.

    Parameters
    ----------
    sequences : iterable of str
        Amino-acid sequences; each one is passed to ``ProteinAnalysis``.

    Returns
    -------
    collections.defaultdict(list)
        Maps a feature name to a list with one value per input sequence.
        Keys: 'len', 'weight', 'gravy', 'aromaticity', 'instability_index',
        'isoelectric_point', 'molar_extinction_coefficient' (mean of the
        values returned by ``molar_extinction_coefficient()``), 'helix',
        'turn', 'sheet' (elements 0, 1, 2 of
        ``secondary_structure_fraction()``), plus 'log_len' and 'log_weight'
        (natural log of 'len' and 'weight').
    """
    features = defaultdict(list)
    for sequence in sequences:
        pa = ProteinAnalysis(sequence)
        # Collect this sequence's values first, in the column order we want.
        # (``pa.flexibility()`` is intentionally not included.)
        row = {
            'len': len(sequence),
            'weight': pa.molecular_weight(),
            'gravy': pa.gravy(),
            'aromaticity': pa.aromaticity(),
            'instability_index': pa.instability_index(),
            'isoelectric_point': pa.isoelectric_point(),
            'molar_extinction_coefficient': np.mean(pa.molar_extinction_coefficient()),
        }
        helix, turn, sheet = pa.secondary_structure_fraction()
        row['helix'] = helix
        row['turn'] = turn
        row['sheet'] = sheet
        for name, value in row.items():
            features[name].append(value)

    # Log-transformed copies of the two size-like features.
    for base in ('len', 'weight'):
        features['log_' + base] = list(np.log(features[base]))
    return features

def get_protein_chain(df, proteins_path='Data/proteins.pkl'):
    """Pick one usable protein (id and chain) for every row of ``df``.

    Each ``protein_group`` cell is split on ';' into candidate protein ids.
    The first candidate that is present in the pickled protein table and
    whose 'chain' contains no '?' is selected for that row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a 'protein_group' column of ';'-separated id strings.
    proteins_path : str, optional
        Path to the pickled table that maps a protein id to a record with a
        'chain' entry. Defaults to 'Data/proteins.pkl'.

    Returns
    -------
    (ids, chains) : tuple of two lists
        Both aligned with the rows of ``df``. For a row without any usable
        candidate, both the id and the chain are ``None``.
    """
    # NOTE(review): pickle.load can execute arbitrary code -- only point this
    # at a trusted file.
    with open(proteins_path, 'rb') as handle:
        proteins = pickle.load(handle)

    ids, chains = [], []
    for group in df['protein_group'].values:
        pid, chain = None, None
        for protein_id in group.split(';'):
            if protein_id not in proteins:
                continue
            candidate = proteins[protein_id]['chain']
            if '?' in candidate:
                # Chain has unknown residues; try the next candidate.
                continue
            pid, chain = protein_id, candidate
            break
        # Append the *selected* id (None when nothing matched), not the last
        # candidate that happened to be iterated.
        ids.append(pid)
        chains.append(chain)
    return ids, chains

In [10]:
# Load the raw table and derive the per-peptide summary columns: the mean of
# the two runs, a copy of each run, and the square root of their absolute
# difference. The two source columns are dropped afterwards.
df = (
    pd.read_csv('Data/reproducibility2.csv')
    .assign(
        peptide_mean=lambda d: (d['peptide_mean.deanna'] + d['peptide_mean.karin']) / 2,
        peptide_mean_1=lambda d: d['peptide_mean.deanna'],
        peptide_mean_2=lambda d: d['peptide_mean.karin'],
        peptide_mean_diff=lambda d: np.sqrt((d['peptide_mean_1'] - d['peptide_mean_2']).abs()),
    )
    .drop(columns=['peptide_mean.deanna', 'peptide_mean.karin'])
)

# Peptide-level sequence descriptors, stored under a 'peptide_' prefix.
peptide_features = get_protein_features(df['peptide'].values)
df = df.assign(**{'peptide_' + name: values for name, values in peptide_features.items()})

# Attach the selected protein id/chain, then drop every row that still has a
# missing value in any column before computing protein-level descriptors.
ids, chains = get_protein_chain(df)
df = df.assign(protein_name=ids, protein=chains).dropna(axis=0)
protein_features = get_protein_features(df['protein'].values)
df = df.drop(columns=['protein_group', 'protein_group_names'])
df = df.assign(**{'protein_' + name: values for name, values in protein_features.items()})

# Correct reproducibility by subtracting the mean reproducibility of the
# protein each peptide was assigned to.
df2 = df.groupby('protein_name')[['reproducibility']].mean()
protein_means = df2.to_dict()['reproducibility']
df['reproducibility_corrected'] = df['reproducibility'] - df['protein_name'].map(protein_means)

# Order columns alphabetically; saving to CSV is currently disabled.
df = df.reindex(columns=sorted(df.columns))
# df.to_csv('Data/reproducibility.csv', index=False)
print(df.columns)
print(df.shape)
df.head()

Index(['peptide', 'peptide_aromaticity', 'peptide_gravy', 'peptide_helix',
       'peptide_instability_index', 'peptide_isoelectric_point', 'peptide_len',
       'peptide_log_len', 'peptide_log_weight', 'peptide_mean',
       'peptide_mean_1', 'peptide_mean_2', 'peptide_mean_diff',
       'peptide_molar_extinction_coefficient', 'peptide_sheet', 'peptide_turn',
       'peptide_weight', 'protein', 'protein_aromaticity', 'protein_gravy',
       'protein_helix', 'protein_instability_index',
       'protein_isoelectric_point', 'protein_len', 'protein_log_len',
       'protein_log_weight', 'protein_molar_extinction_coefficient',
       'protein_name', 'protein_sheet', 'protein_turn', 'protein_weight',
       'reproducibility', 'reproducibility_corrected'],
      dtype='object')
(2911, 33)


Unnamed: 0,peptide,peptide_aromaticity,peptide_gravy,peptide_helix,peptide_instability_index,peptide_isoelectric_point,peptide_len,peptide_log_len,peptide_log_weight,peptide_mean,...,protein_len,protein_log_len,protein_log_weight,protein_molar_extinction_coefficient,protein_name,protein_sheet,protein_turn,protein_weight,reproducibility,reproducibility_corrected
0,AAAATGTIFTFR,0.166667,0.858333,0.25,5.3,9.79502,12,2.484907,7.111823,20.783651,...,356,5.874931,10.599246,25900.0,P05154,0.261236,0.233146,40104.5947,0.602647,0.13178
1,AAAIQTMSLDAER,0.0,0.069231,0.153846,43.030769,4.370373,13,2.564949,7.227325,22.568058,...,2124,7.661056,12.374699,336410.0,O75326,0.229755,0.266478,236735.5285,0.225532,-0.148407
2,AACAQLNDFLQEYGTQGCQV,0.1,-0.115,0.25,33.83,4.050028,20,2.995732,7.677565,23.491978,...,1564,7.355002,12.056308,171207.5,P0C0L4,0.273018,0.243606,172182.0885,0.344353,-0.159834
3,AADDTWEPFASGK,0.153846,-0.815385,0.153846,2.676923,4.050028,13,2.564949,7.240249,29.672498,...,820,6.709304,11.41756,142140.0,P02766,0.245122,0.215854,90904.068,0.288063,-0.022109
4,AADHDVGSELPPEGVLGALLR,0.0,0.1,0.285714,42.357143,4.308305,21,3.044522,7.657439,26.325119,...,37,3.610918,8.346772,5500.0,Q9UHG2,0.432432,0.135135,4216.5498,0.396802,0.087727
