# README
The purpose of this notebook is to process the output from FoldX (For variants from GPCRdb). the results are in different folders, we need to 
1: for each folder, read in the file starts with 'Dif_'(this contains the delta delta G) and the file starts with 'individual_list'(this contains the correspondence information between mutation and ddG) 
2: merge the mutation info and its corresponding ddG 
3: generate a dataframe and merge with Genebass data
4: output as csv file for further performance calculation

# Import packages

In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns

import matplotlib.pyplot as plt

# Processing

In [3]:
# diff_energy = pd.read_csv('/Users/joannahench/Downloads/foldx5MacStd.tar_/GPCRdb/P0DMS8/Dif_P0DMS8.fxout',skiprows=8,sep='\t')
# diff_energy

Unnamed: 0,Pdb,total energy,Backbone Hbond,Sidechain Hbond,Van der Waals,Electrostatics,Solvation Polar,Solvation Hydrophobic,Van der Waals clashes,entropy sidechain,...,cis_bond,torsional clash,backbone clash,helix dipole,water bridge,disulfide,electrostatic kon,partial covalent bonds,energy Ionisation,Entropy Complex
0,P0DMS8_1.pdb,1.767660,3.669020e-01,0.366902,0.033370,0.000000e+00,-0.097597,-0.027027,-2.276860e-01,-0.359329,...,0,0.000000e+00,0.000000e+00,0.000000e+00,0,0,0,0,0.000000e+00,0
1,P0DMS8_2.pdb,0.499373,4.648880e-01,0.464888,-0.395977,0.000000e+00,0.431325,-0.557309,-3.552710e-15,-0.252917,...,0,-5.641750e-02,0.000000e+00,0.000000e+00,0,0,0,0,0.000000e+00,0
2,P0DMS8_3.pdb,-0.100068,0.000000e+00,0.000000,0.009785,-1.279170e-01,-0.021035,0.009190,0.000000e+00,0.006955,...,0,-3.072050e-03,9.625120e-04,0.000000e+00,0,0,0,0,0.000000e+00,0
3,P0DMS8_4.pdb,0.140177,1.136870e-13,0.000000,-0.301008,4.700510e-09,0.432898,-0.365992,8.986840e-10,0.086637,...,0,8.620080e-07,1.143670e-01,0.000000e+00,0,0,0,0,0.000000e+00,0
4,P0DMS8_5.pdb,0.456306,-1.136870e-13,0.000000,-0.113270,-1.360380e-01,0.282557,-0.062291,-8.986840e-10,-0.030825,...,0,1.563000e-02,5.684340e-14,-1.110220e-16,0,0,0,0,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,P0DMS8_133.pdb,1.207220,2.565350e-03,0.691597,0.529359,-8.824570e-03,-0.409693,0.828372,-1.671740e-02,-0.296771,...,0,1.196270e-02,3.996690e-02,5.420850e-04,0,0,0,0,2.347640e-09,0
133,P0DMS8_134.pdb,-0.393174,-1.057390e-01,0.000000,-0.482357,0.000000e+00,0.265349,-0.768084,2.394650e-02,0.238254,...,0,3.265550e-02,3.535000e-02,0.000000e+00,0,0,0,0,0.000000e+00,0
134,P0DMS8_135.pdb,0.082216,0.000000e+00,0.000000,-0.070076,0.000000e+00,0.059072,-0.161681,-2.348460e-05,0.034729,...,0,9.599970e-04,4.908390e-03,0.000000e+00,0,0,0,0,0.000000e+00,0
135,P0DMS8_136.pdb,0.319334,1.443000e-05,0.000013,-0.176965,1.483690e-02,0.696527,-0.321895,6.142420e-03,0.107890,...,0,6.313570e-03,4.107710e-02,1.905380e-01,0,0,0,0,-2.347640e-09,0


In [10]:
# define the directory containing the protein folders
directory = '/Users/joannahench/Downloads/foldx5MacStd.tar_/GPCRdb'

# create an empty list to store the final data
final_data = []

# loop over each protein folder
for protein_folder in os.listdir(directory):

    # check if the item in the directory is a folder
    if os.path.isdir(os.path.join(directory, protein_folder)):


        # read in the Dif_ file as a csv, skipping the first 8 rows
        dif_file = pd.read_csv(os.path.join(directory, protein_folder, 'Dif_'+str(protein_folder)+'.fxout'), skiprows=8,sep = '\t')
        

        # read in the individual_list file as a csv with semicolon separator
        individual_list_file = pd.read_csv(os.path.join(directory, protein_folder, 'individual_list_'+str(protein_folder)+'.txt'), sep=';',header=None)


        # create a new dataframe with three columns: protein, total energy, and individual data
        new_data = pd.DataFrame({'Uniprot': protein_folder, 'FoldX': dif_file['total energy'], 'mutation data': individual_list_file[0]})

        # append the new data to the final data list
        final_data.append(new_data)


# concatenate all the new dataframes into a single dataframe
final_dataframe = pd.concat(final_data)

In [11]:
# split the 'mutation data' into 'mutation_from','mutation_to','mutation_pos'
final_dataframe['mutation_from'] = final_dataframe['mutation data'].str[0]
final_dataframe['mutation_to'] = final_dataframe['mutation data'].str[-1]
final_dataframe['mutation_pos'] = final_dataframe['mutation data'].str[2:-1].astype(int)


In [12]:
final_dataframe.reset_index(drop=True)
final_dataframe

Unnamed: 0,Uniprot,FoldX,mutation data,mutation_from,mutation_to,mutation_pos
0,P51681,0.283922,QA4A,Q,A,4
1,P51681,-0.284497,VA5A,V,A,5
2,P51681,-0.068612,SA6A,S,A,6
3,P51681,0.188599,SA6P,S,P,6
4,P51681,-0.143673,SA7A,S,A,7
...,...,...,...,...,...,...
2,P41231,0.686619,RA177A,R,A,177
3,P41231,0.795388,RA180A,R,A,180
4,P41231,0.612146,RA194H,R,H,194
5,P41231,4.240870,YA198A,Y,A,198


In [13]:
# read in all gene names
gene_list_raw =pd.read_csv("230323_EST_ENSG_GENE_new.csv")
gene_list_raw

Unnamed: 0,index,ENST,ENSG,protein,gene,Uniprot
0,0,ENST00000646641,ENSG00000267534,s1pr2_human,S1PR2,O95136
1,121,ENST00000547270,ENSG00000257138,t2r38_human,TAS2R38,P59533
2,219,ENST00000390675,ENSG00000256436,t2r31_human,TAS2R31,P59538
3,344,ENST00000539585,ENSG00000256188,t2r30_human,TAS2R30,P59541
4,483,ENST00000538986,ENSG00000255837,t2r20_human,TAS2R20,P59543
...,...,...,...,...,...,...
395,64029,ENST00000510937,ENSG00000226306,npy6r_human,NPY6R,Q99463
396,64030,ENST00000641193,ENSG00000279301,o2t11_human,OR2T11,Q8NH01
397,64031,ENST00000641732,ENSG00000172146,or1a1_human,OR1A1,Q9P1Q5
398,64032,ENST00000328890,ENSG00000183024,or1g1_human,OR1G1,P47890


In [14]:
final_dataframe = final_dataframe.merge(gene_list_raw[['protein','gene','Uniprot']],on='Uniprot')
final_dataframe = final_dataframe.rename(columns={'gene':'SYMBOL'})
final_dataframe

Unnamed: 0,Uniprot,FoldX,mutation data,mutation_from,mutation_to,mutation_pos,protein,SYMBOL
0,P51681,0.283922,QA4A,Q,A,4,ccr5_human,CCR5
1,P51681,-0.284497,VA5A,V,A,5,ccr5_human,CCR5
2,P51681,-0.068612,SA6A,S,A,6,ccr5_human,CCR5
3,P51681,0.188599,SA6P,S,P,6,ccr5_human,CCR5
4,P51681,-0.143673,SA7A,S,A,7,ccr5_human,CCR5
...,...,...,...,...,...,...,...,...
2480,P41231,0.686619,RA177A,R,A,177,p2ry2_human,P2RY2
2481,P41231,0.795388,RA180A,R,A,180,p2ry2_human,P2RY2
2482,P41231,0.612146,RA194H,R,H,194,p2ry2_human,P2RY2
2483,P41231,4.240870,YA198A,Y,A,198,p2ry2_human,P2RY2


In [25]:
# read in GPCRdb data
gpcr_missense = pd.read_csv('gpcr_filtered_mutants_0217.csv')
gpcr_missense

Unnamed: 0.1,Unnamed: 0,reference,protein,mutation_pos,mutation_from,mutation_to,ligand_name,ligand_idtype,ligand_id,ligand_class,...,exp_mu_effect_value,exp_fold_change,exp_mu_effect_qual,exp_mu_effect_ligand_prop,exp_mu_ligand_ref,opt_receptor_expression,opt_basal_activity,opt_gain_of_activity,opt_ligand_emax,opt_agonist
0,0,8903934,hrh1_human,194,T,A,(S)-cetirizine,ChEMBL Compound ID,CHEMBL1334217,Binding - unknown pharmacological activity,...,8.000,-10.000,,,[3H]-mepyramine (radioligand),0.0,0.0,,0.0,
1,1,15033376,hrh1_human,432,F,A,[3H]-mepyramine,ChEMBL Compound ID,CHEMBL511,Binding - unknown pharmacological activity,...,30.000,23.077,,,[3H]-mepyramine (radioligand),0.0,0.0,,0.0,
2,2,8198587,hrh1_human,198,N,A,[3H]-mepyramine,ChEMBL Compound ID,CHEMBL511,Binding - unknown pharmacological activity,...,0.790,1.362,,,[3H]-mepyramine (radioligand),102.6,0.0,,0.0,
3,3,17959710,hrh1_human,420,I,E,Histamine,ChEMBL Compound ID,CHEMBL90,Binding - unknown pharmacological activity,...,6.700,-5.000,,,[3H]-mepyramine (radioligand),9.1,0.0,,0.0,
4,5,15626750,hrh1_human,433,I,V,Histamine,ChEMBL Compound ID,CHEMBL90,Binding - unknown pharmacological activity,...,4.500,-1.585,,,[3H]-mepyramine (radioligand),81.8,0.0,,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12017,455,10.1021/jm020211+,aa3r_human,243,W,A,Compound39,SMILES,CCCC1C(=C(CC)N=C(c2ccccc2)C1(CCC)C([O-])=O)C(=...,Inverse agonist,...,2150.000,9.034,,,I-AB-MECA,0.0,0.0,=,0.0,
12018,456,10.1124/mol.63.5.1021,aa3r_human,243,W,A,DU124183,PubChem CID,10426659,Allosteric inverse agonist,...,3.600,3.000,,,I-AB-MECA,0.0,0.0,=,0.0,
12019,458,10.1021/jm050968b,aa3r_human,272,H,D,CID85116573,PubChem CID,85116573,Full agonist,...,0.850,-11.236,,,I-AB-MECA,0.0,0.0,=,0.0,
12020,459,10.1124/mol.63.5.1021,aa3r_human,244,L,A,DU124183,PubChem CID,10426659,Allosteric inverse agonist,...,2.100,1.750,,,I-AB-MECA,0.0,0.0,=,0.0,


In [26]:
fx_df = pd.merge(gpcr_missense,final_dataframe,on=['mutation_pos','mutation_from','mutation_to','protein'])
fx_df

Unnamed: 0.1,Unnamed: 0,reference,protein,mutation_pos,mutation_from,mutation_to,ligand_name,ligand_idtype,ligand_id,ligand_class,...,exp_mu_ligand_ref,opt_receptor_expression,opt_basal_activity,opt_gain_of_activity,opt_ligand_emax,opt_agonist,Uniprot,FoldX,mutation data,SYMBOL
0,0,8903934,hrh1_human,194,T,A,(S)-cetirizine,ChEMBL Compound ID,CHEMBL1334217,Binding - unknown pharmacological activity,...,[3H]-mepyramine (radioligand),0.0,0.0,,0.0,,P35367,-0.271773,TA194A,HRH1
1,8,8198587,hrh1_human,194,T,A,2-methylhistamine,ChEMBL Compound ID,CHEMBL12620,Binding - unknown pharmacological activity,...,[3H]-mepyramine (radioligand),126.0,0.0,,0.0,,P35367,-0.271773,TA194A,HRH1
2,12,11809864,hrh1_human,194,T,A,(S)-cetirizine,ChEMBL Compound ID,CHEMBL1334217,Binding - unknown pharmacological activity,...,[3H]-mepyramine (radioligand),24.4,0.0,,0.0,,P35367,-0.271773,TA194A,HRH1
3,19,8093027,hrh1_human,194,T,A,[3H]-mepyramine,ChEMBL Compound ID,CHEMBL511,Binding - unknown pharmacological activity,...,[3H]-mepyramine (radioligand),122.7,0.0,,0.0,,P35367,-0.271773,TA194A,HRH1
4,20,8198587,hrh1_human,194,T,A,Histamine,ChEMBL Compound ID,CHEMBL90,Binding - unknown pharmacological activity,...,[3H]-mepyramine (radioligand),126.0,0.0,,0.0,,P35367,-0.271773,TA194A,HRH1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9735,327,10.1006/bbrc.2001.5027,aa3r_human,108,R,K,Cl-IB-MECA,PubChem CID,393593,Full agonist,...,,0.0,0.0,=,0.0,,P0DMS8,-1.831330,RA108K,ADORA3
9736,343,10.1006/bbrc.2001.5027,aa3r_human,108,R,K,I-AB-MECA,PubChem CID,44208896,Full agonist,...,,0.0,0.0,=,0.0,,P0DMS8,-1.831330,RA108K,ADORA3
9737,199,10.1006/bbrc.2001.5027,aa3r_human,107,D,R,Cl-IB-MECA,PubChem CID,393593,Full agonist,...,,0.0,0.0,=,0.0,,P0DMS8,4.654750,DA107R,ADORA3
9738,436,10.1006/bbrc.2001.5027,aa3r_human,107,D,R,Cl-IB-MECA,PubChem CID,393593,Full agonist,...,,0.0,0.0,=,0.0,,P0DMS8,4.654750,DA107R,ADORA3


In [27]:
fx_df.to_csv('GPCR_FoldX_df.csv',index=False)