# README
The purpose of this notebook is to process the output from FoldX (For variants from Genebass). the results are in different folders, we need to 
1: for each folder, read in the file starts with 'Dif_'(this contains the delta delta G) and the file starts with 'individual_list'(this contains the correspondence information between mutation and ddG) 
2: merge the mutation info and its corresponding ddG 
3: generate a dataframe and merge with Genebass data
4: output as csv file for further performance calculation

# Import packages

In [4]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

import matplotlib.pyplot as plt

# Processing

In [3]:
# diff_energy = pd.read_csv('/Users/joannahench/Downloads/foldx5MacStd.tar_/Genebass/P0DMS8/Dif_P0DMS8.fxout',skiprows=8,sep='\t')
# diff_energy

Unnamed: 0,Pdb,total energy,Backbone Hbond,Sidechain Hbond,Van der Waals,Electrostatics,Solvation Polar,Solvation Hydrophobic,Van der Waals clashes,entropy sidechain,...,cis_bond,torsional clash,backbone clash,helix dipole,water bridge,disulfide,electrostatic kon,partial covalent bonds,energy Ionisation,Entropy Complex
0,P0DMS8_1.pdb,1.767660,3.669020e-01,0.366902,0.033370,0.000000e+00,-0.097597,-0.027027,-2.276860e-01,-0.359329,...,0,0.000000e+00,0.000000e+00,0.000000e+00,0,0,0,0,0.000000e+00,0
1,P0DMS8_2.pdb,0.499373,4.648880e-01,0.464888,-0.395977,0.000000e+00,0.431325,-0.557309,-3.552710e-15,-0.252917,...,0,-5.641750e-02,0.000000e+00,0.000000e+00,0,0,0,0,0.000000e+00,0
2,P0DMS8_3.pdb,-0.100068,0.000000e+00,0.000000,0.009785,-1.279170e-01,-0.021035,0.009190,0.000000e+00,0.006955,...,0,-3.072050e-03,9.625120e-04,0.000000e+00,0,0,0,0,0.000000e+00,0
3,P0DMS8_4.pdb,0.140177,1.136870e-13,0.000000,-0.301008,4.700510e-09,0.432898,-0.365992,8.986840e-10,0.086637,...,0,8.620080e-07,1.143670e-01,0.000000e+00,0,0,0,0,0.000000e+00,0
4,P0DMS8_5.pdb,0.456306,-1.136870e-13,0.000000,-0.113270,-1.360380e-01,0.282557,-0.062291,-8.986840e-10,-0.030825,...,0,1.563000e-02,5.684340e-14,-1.110220e-16,0,0,0,0,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,P0DMS8_133.pdb,1.207220,2.565350e-03,0.691597,0.529359,-8.824570e-03,-0.409693,0.828372,-1.671740e-02,-0.296771,...,0,1.196270e-02,3.996690e-02,5.420850e-04,0,0,0,0,2.347640e-09,0
133,P0DMS8_134.pdb,-0.393174,-1.057390e-01,0.000000,-0.482357,0.000000e+00,0.265349,-0.768084,2.394650e-02,0.238254,...,0,3.265550e-02,3.535000e-02,0.000000e+00,0,0,0,0,0.000000e+00,0
134,P0DMS8_135.pdb,0.082216,0.000000e+00,0.000000,-0.070076,0.000000e+00,0.059072,-0.161681,-2.348460e-05,0.034729,...,0,9.599970e-04,4.908390e-03,0.000000e+00,0,0,0,0,0.000000e+00,0
135,P0DMS8_136.pdb,0.319334,1.443000e-05,0.000013,-0.176965,1.483690e-02,0.696527,-0.321895,6.142420e-03,0.107890,...,0,6.313570e-03,4.107710e-02,1.905380e-01,0,0,0,0,-2.347640e-09,0


In [69]:
# define the directory containing the protein folders
directory = '/Users/joannahench/Downloads/foldx5MacStd.tar_/Genebass'

# create an empty list to store the final data
final_data = []

# loop over each protein folder
for protein_folder in os.listdir(directory):

    # check if the item in the directory is a folder
    if os.path.isdir(os.path.join(directory, protein_folder)):


        # read in the Dif_ file as a csv, skipping the first 8 rows
        dif_file = pd.read_csv(os.path.join(directory, protein_folder, 'Dif_'+str(protein_folder)+'.fxout'), skiprows=8,sep = '\t')
        

        # read in the individual_list file as a csv with semicolon separator
        individual_list_file = pd.read_csv(os.path.join(directory, protein_folder, 'individual_list_'+str(protein_folder)+'.txt'), sep=';',header=None)


        # create a new dataframe with three columns: protein, total energy, and individual data
        new_data = pd.DataFrame({'Uniprot': protein_folder, 'FoldX': dif_file['total energy'], 'mutation data': individual_list_file[0]})

        # append the new data to the final data list
        final_data.append(new_data)


# concatenate all the new dataframes into a single dataframe
final_dataframe = pd.concat(final_data)

In [70]:
# split the 'mutation data' into 'mutation_from','mutation_to','mutation_pos'
final_dataframe['mutation_from'] = final_dataframe['mutation data'].str[0]
final_dataframe['mutation_to'] = final_dataframe['mutation data'].str[-1]
final_dataframe['Protein_position'] = final_dataframe['mutation data'].str[2:-1].astype(int)


In [71]:
final_dataframe.reset_index(drop=True)
final_dataframe

Unnamed: 0,Uniprot,FoldX,mutation data,mutation_from,mutation_to,Protein_position
0,P51681,0.406824,RA60S,R,S,60
1,P51681,0.065373,RA223Q,R,Q,223
2,P51681,-0.274196,AA335V,A,V,335
3,P51681,0.344865,DA2N,D,N,2
4,P51681,-0.292315,QA4E,Q,E,4
...,...,...,...,...,...,...
455,P21917,-0.603688,AA414V,A,V,414
456,P21917,0.802561,LA415V,L,V,415
457,P21917,0.555752,LA415R,L,R,415
458,P21917,-0.089390,RA416H,R,H,416


In [36]:
# read in all gene names
gene_list_raw =pd.read_csv("230323_EST_ENSG_GENE_new.csv")
gene_list_raw

Unnamed: 0,index,ENST,ENSG,protein,gene,Uniprot
0,0,ENST00000646641,ENSG00000267534,s1pr2_human,S1PR2,O95136
1,121,ENST00000547270,ENSG00000257138,t2r38_human,TAS2R38,P59533
2,219,ENST00000390675,ENSG00000256436,t2r31_human,TAS2R31,P59538
3,344,ENST00000539585,ENSG00000256188,t2r30_human,TAS2R30,P59541
4,483,ENST00000538986,ENSG00000255837,t2r20_human,TAS2R20,P59543
...,...,...,...,...,...,...
395,64029,ENST00000510937,ENSG00000226306,npy6r_human,NPY6R,Q99463
396,64030,ENST00000641193,ENSG00000279301,o2t11_human,OR2T11,Q8NH01
397,64031,ENST00000641732,ENSG00000172146,or1a1_human,OR1A1,Q9P1Q5
398,64032,ENST00000328890,ENSG00000183024,or1g1_human,OR1G1,P47890


In [72]:
final_dataframe = final_dataframe.merge(gene_list_raw[['protein','gene','Uniprot']],on='Uniprot')
final_dataframe = final_dataframe.rename(columns={'gene':'SYMBOL'})
final_dataframe

Unnamed: 0,Uniprot,FoldX,mutation data,mutation_from,mutation_to,Protein_position,protein,SYMBOL
0,P51681,0.406824,RA60S,R,S,60,ccr5_human,CCR5
1,P51681,0.065373,RA223Q,R,Q,223,ccr5_human,CCR5
2,P51681,-0.274196,AA335V,A,V,335,ccr5_human,CCR5
3,P51681,0.344865,DA2N,D,N,2,ccr5_human,CCR5
4,P51681,-0.292315,QA4E,Q,E,4,ccr5_human,CCR5
...,...,...,...,...,...,...,...,...
15479,P21917,-0.603688,AA414V,A,V,414,drd4_human,DRD4
15480,P21917,0.802561,LA415V,L,V,415,drd4_human,DRD4
15481,P21917,0.555752,LA415R,L,R,415,drd4_human,DRD4
15482,P21917,-0.089390,RA416H,R,H,416,drd4_human,DRD4


In [39]:
# read in Genebass data
icd10_df = pd.read_csv('VEPwithGB_icd10_0323.csv')
icd10_df['mutation_from'] = icd10_df['Amino_acids'].str.split('/',expand=True)[0]
icd10_df['mutation_to'] = icd10_df['Amino_acids'].str.split('/',expand=True)[1]
icd10_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Location,Allele,Protein_position,SYMBOL,Amino_acids,locus,SIFT,PolyPhen,BayesDel_addAF_score,BayesDel_noAF_score,...,AC,AF,BETA,SE,AF.Cases,AF.Controls,Pvalue,vcf,mutation_from,mutation_to
0,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,1907,0.002415,0.431260,0.53696,0.003472,0.002413,0.421890,11 113412715 . T C,K,E
1,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,1907,0.002415,-0.246980,0.61625,0.001845,0.002416,0.688580,11 113412715 . T C,K,E
2,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,1907,0.002415,1.305800,1.07300,0.005556,0.002413,0.223580,11 113412715 . T C,K,E
3,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,1907,0.002415,-0.141640,0.27161,0.002068,0.002417,0.602020,11 113412715 . T C,K,E
4,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,1907,0.002415,0.071904,0.52037,0.002584,0.002415,0.890100,11 113412715 . T C,K,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654733,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,3,0.000004,-1.007700,7.82990,0.000000,0.000004,0.897600,X 114907411 . T C,V,A
654734,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,3,0.000004,5.129800,2.33040,0.000049,0.000003,0.027721,X 114907411 . T C,V,A
654735,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,3,0.000004,-1.015900,4.91580,0.000000,0.000004,0.836280,X 114907411 . T C,V,A
654736,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,3,0.000004,-1.022800,3.18950,0.000000,0.000004,0.748460,X 114907411 . T C,V,A


In [73]:
fx_df = pd.merge(icd10_df,final_dataframe,on=['mutation_from','Protein_position','mutation_to','SYMBOL'])
fx_df

Unnamed: 0,Location,Allele,Protein_position,SYMBOL,Amino_acids,locus,SIFT,PolyPhen,BayesDel_addAF_score,BayesDel_noAF_score,...,AF.Cases,AF.Controls,Pvalue,vcf,mutation_from,mutation_to,Uniprot,FoldX,mutation data,protein
0,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,0.003472,0.002413,0.421890,11 113412715 . T C,K,E,P14416,-0.466725,KA327E,drd2_human
1,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,0.001845,0.002416,0.688580,11 113412715 . T C,K,E,P14416,-0.466725,KA327E,drd2_human
2,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,0.005556,0.002413,0.223580,11 113412715 . T C,K,E,P14416,-0.466725,KA327E,drd2_human
3,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,0.002068,0.002417,0.602020,11 113412715 . T C,K,E,P14416,-0.466725,KA327E,drd2_human
4,11:113412715-113412715,C,327,DRD2,K/E,chr11:113412715,tolerated(0.27),benign(0.031),-0.260005,-0.140988,...,0.002584,0.002415,0.890100,11 113412715 . T C,K,E,P14416,-0.466725,KA327E,drd2_human
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
654418,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,0.000000,0.000004,0.897600,X 114907411 . T C,V,A,P28335,0.020199,VA458A,5ht2c_human
654419,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,0.000049,0.000003,0.027721,X 114907411 . T C,V,A,P28335,0.020199,VA458A,5ht2c_human
654420,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,0.000000,0.000004,0.836280,X 114907411 . T C,V,A,P28335,0.020199,VA458A,5ht2c_human
654421,X:114907411-114907411,C,458,HTR2C,V/A,chrX:114907411,deleterious(0),benign(0.309),-0.00911951,-0.250876,...,0.000000,0.000004,0.748460,X 114907411 . T C,V,A,P28335,0.020199,VA458A,5ht2c_human


In [75]:
fx_df.to_csv('Genebass_with_FoldX_df.csv',index=False)