In [15]:
import general_tools as gt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import uniprot_info as uni
from Bio.PDB import PDBIO, Model, Chain, Residue, Atom
import pandas as pd
from biopandas.pdb import PandasPdb
from Bio.PDB import PDBParser


In [16]:
# Gene symbol; it is interpolated into the file names built below.
gene = 'COL4A5'
# Relative path of the structure file for this gene (resolved against the
# current working directory when the file is opened).
pdb_file = f'{gene}.pdb'

In [17]:
def pdb_to_dataframe(pdb_file):
    """Flatten a PDB structure file into a table with one row per atom.

    Parameters
    ----------
    pdb_file : str
        Path passed to ``PDBParser.get_structure``.

    Returns
    -------
    pandas.DataFrame
        One row per atom, in model -> chain -> residue -> atom iteration
        order, with the columns ATOM, NAME, RESNAME, CHAINID, RESIDUESEQ,
        X, Y, Z, OCCUPANCY, TEMPFACTOR and ELEMENT.
    """
    structure = PDBParser(QUIET=True).get_structure("protein", pdb_file)

    def _atom_row(chain, residue, atom):
        # Unpack the coordinate vector once instead of indexing it three times.
        x_coord, y_coord, z_coord = atom.get_coord()
        return {
            'ATOM': atom.get_serial_number(),
            'NAME': atom.get_name(),
            'RESNAME': residue.get_resname(),
            'CHAINID': chain.id,
            # residue.id is a tuple; index 1 is taken as the sequence number.
            'RESIDUESEQ': residue.id[1],
            'X': x_coord,
            'Y': y_coord,
            'Z': z_coord,
            'OCCUPANCY': atom.get_occupancy(),
            'TEMPFACTOR': atom.get_bfactor(),
            'ELEMENT': atom.element,
        }

    rows = [
        _atom_row(chain, residue, atom)
        for model in structure
        for chain in model
        for residue in chain
        for atom in residue
    ]

    return pd.DataFrame(rows)


# Parse the structure once; later cells read and modify this frame.
pdb_df = pdb_to_dataframe(pdb_file)
# Show the atom table first, then its column index.
for summary in (pdb_df, pdb_df.columns):
    print(summary)

        ATOM NAME RESNAME CHAINID  RESIDUESEQ          X      Y          Z  \
0          1    N     MET       A           1 -91.740997  1.479  50.665001   
1          2   CA     MET       A           1 -90.630997  1.526  51.652000   
2          3    C     MET       A           1 -89.237999  1.265  51.056999   
3          4   CB     MET       A           1 -90.931000  0.660  52.884998   
4          5    O     MET       A           1 -88.305000  1.903  51.515999   
...      ...  ...     ...     ...         ...        ...    ...        ...   
11351  11352   CB     THR       A        1685 -31.257999 -7.433  -0.235000   
11352  11353    O     THR       A        1685 -32.944000 -9.553   0.961000   
11353  11354  CG2     THR       A        1685 -29.969999 -6.648  -0.475000   
11354  11355  OG1     THR       A        1685 -31.139000 -8.676  -0.882000   
11355  11356  OXT     THR       A        1685 -33.806999 -7.722   1.905000   

       OCCUPANCY  TEMPFACTOR ELEMENT  
0            1.0       3

In [18]:

######### Functions for addition of b_factor column by pathogenicity to pdb_df dataframe #########

def create_dictionary_with_all_residues(protein_length):
    """
    Build a placeholder mapping for every residue of a protein.

    The keys are the residue numbers 1..protein_length (inclusive, ascending)
    and every value starts out as None, meaning "no annotation yet".
    A protein_length of zero or less gives an empty dictionary.
    """
    # dict.fromkeys fills in None for each key when no value is supplied.
    return dict.fromkeys(range(1, protein_length + 1))

def get_b_factor_dict(residue_dict, pathogenicity_df, gene_length):
    """
    Fill residue_dict with one pathogenicity label per annotated residue.

    For every residue number from 1 to gene_length (inclusive), the rows of
    pathogenicity_df whose 'position' equals that residue are inspected:

    * no rows           -> the existing entry is left untouched;
    * one row           -> its 'pathogenicity' value is stored;
    * several rows      -> the shared value is stored when they all agree,
                           otherwise 2 is stored to flag conflicting labels.

    Parameters
    ----------
    residue_dict : dict
        Mapping of residue number -> label. It is updated in place.
    pathogenicity_df : pandas.DataFrame
        Must provide the columns 'position' and 'pathogenicity'.
    gene_length : int
        Number of residues; residues 1..gene_length are considered.

    Returns
    -------
    dict
        The same residue_dict object that was passed in.
    """
    # Group once rather than filtering the whole frame for every residue.
    values_by_position = {
        position: group.values
        for position, group in pathogenicity_df.groupby('position')['pathogenicity']
    }

    # Residues are numbered from 1, so the range has to end at gene_length
    # inclusive. range(gene_length) would skip the last residue and visit the
    # non-existent residue 0.
    for residue in range(1, gene_length + 1):
        pathogenicity_values = values_by_position.get(residue)
        if pathogenicity_values is None:
            # No variant recorded at this residue.
            continue

        first_value = pathogenicity_values[0]
        if len(pathogenicity_values) == 1 or all(
            value == first_value for value in pathogenicity_values
        ):
            residue_dict[residue] = first_value
        else:
            # 2 marks a residue whose variants carry conflicting labels.
            residue_dict[residue] = 2
    return residue_dict

def add_b_factor_by_residue(pdb_df, b_factor_dict):
    """
    Overwrite the TEMPFACTOR column of pdb_df with per-residue values.

    Every atom row receives the value stored in b_factor_dict under that
    row's RESIDUESEQ, so all atoms of a residue share one B-factor.

    Parameters
    ----------
    pdb_df : pandas.DataFrame
        Atom table with 'RESIDUESEQ' and 'TEMPFACTOR' columns. It is modified
        in place.
    b_factor_dict : dict
        Mapping of residue number -> B-factor value. A None entry is expected
        to appear as a missing value (NaN) in the numeric column.

    Returns
    -------
    pandas.DataFrame
        The same pdb_df object that was passed in.

    Raises
    ------
    ValueError
        If pdb_df has no 'TEMPFACTOR' column.
    KeyError
        If a residue number in pdb_df is missing from b_factor_dict.
    """
    # Make sure there's a bfactor column in the pdb_df dataframe
    if 'TEMPFACTOR' not in pdb_df.columns:
        raise ValueError('There is no bfactor column in the pdb_df dataframe')

    # One dictionary lookup per atom row. The earlier version rewrote every
    # row of a residue once for each of its atoms, using a full-frame mask each
    # time. Plain indexing (not .get) keeps the KeyError for unknown residues.
    new_values = [b_factor_dict[residue_number] for residue_number in pdb_df['RESIDUESEQ']]

    # Align on the frame's own index so non-default indexes are handled.
    pdb_df['TEMPFACTOR'] = pd.Series(new_values, index=pdb_df.index)
    return pdb_df


def dataframe_to_pdb(dataframe, output_file):
    """Write an atom table to a PDB file through biopandas.

    The frame is installed as the 'ATOM' record table of a fresh PandasPdb
    object, and only that record type is written to output_file.

    NOTE(review): biopandas probably expects its own ATOM column layout;
    confirm that the frame passed here uses it before relying on the output.
    """
    pdb_writer = PandasPdb()
    pdb_writer.df['ATOM'] = dataframe
    pdb_writer.to_pdb(output_file, records=['ATOM'])

In [19]:
# Define a function to update the B-factor (11th column) value
def update_bfactor(line, new_value):
    """Return a copy of a PDB ATOM/HETATM record with its B-factor replaced.

    The temperature factor occupies columns 61-66 of the record (0-based
    slice 60:66). Only those six characters are overwritten, so the fields
    that follow (such as the element symbol) stay in their columns.

    Parameters
    ----------
    line : str or bytes
        One record line, with or without its trailing line ending.
    new_value : float
        New B-factor, formatted as ``6.2f``. Values needing more than six
        characters (>= 1000 or <= -100) make the field wider and shift the
        rest of the line.

    Returns
    -------
    str or bytes
        The updated line, in the same type as the input.
    """
    is_binary = isinstance(line, (bytes, bytearray))
    text = line.decode('utf-8') if is_binary else line

    # Set the line ending aside so padding never ends up after the newline.
    record = text.rstrip('\r\n')
    line_ending = text[len(record):]

    # Pad short records so the B-factor field always exists at 60:66.
    record = record.ljust(66)
    updated = f'{record[:60]}{new_value:6.2f}{record[66:]}{line_ending}'

    return updated.encode('utf-8') if is_binary else updated

In [20]:
def create_pdb_with_bfactor(pdb_file, output_file_path="COL4A5_with_b_factor.pdb", bfactor_offset=10.0):
    """Copy a PDB file, shifting the B-factor of every ATOM record.

    Lines that do not start with "ATOM" (headers, TER, END, ...) are copied
    unchanged. Each line is classified on its own, so records that follow
    the atom table are never parsed as atoms.

    Parameters
    ----------
    pdb_file : str
        Path of the PDB file to read.
    output_file_path : str, optional
        Path of the file to write. Defaults to the name this notebook
        originally hard-coded.
    bfactor_offset : float, optional
        Amount added to each existing B-factor (demonstration value: 10.0).

    Raises
    ------
    ValueError
        If an ATOM record has no parseable number in its B-factor field.
    """
    # Both files are handled as bytes, so lines are copied through verbatim
    # (line endings included). update_bfactor receives bytes and hands bytes
    # back, so no encode/decode step is needed here.
    with open(pdb_file, "rb") as input_file, open(output_file_path, "wb") as output_file:
        for line in input_file:
            if line.startswith(b"ATOM"):
                # Columns 61-66 (slice 60:66) hold the temperature factor.
                bfactor = float(line[60:66])
                line = update_bfactor(line, bfactor + bfactor_offset)

            output_file.write(line)

In [21]:
# Sequence length reported by the uniprot_info helper for this gene
# (presumably the number of residues in the protein; confirm in that module).
gene_length = uni.get_sequence_length(gene)

# Per-variant table for the gene; later cells use its 'position' and
# 'pathogenicity' columns.
variants_csv_path = f'gene_specific_df/{gene}_with_position.csv'
pathogenicity_df = pd.read_csv(variants_csv_path)
pathogenicity_df


Unnamed: 0,gene,variant,pathogenicity,uniprot_id,stability_WT,stability_MUT,blosum,hydrophobicity_WT,hydrophobicity_MUT,volume_WT,...,oda_MUT,sasa_WT,sasa_MUT,RSA_WT,RSA_MUT,oda_delta,sasa_delta,pssm,entropy,position
0,COL4A5,L8Q,benign,P29400,2243.23,2243.04,-2,3.8,-3.5,166.7,...,-35.04,139.31,115.38,0.729372,0.539159,7.64,-23.93,3.084,2.766,8
1,COL4A5,L12W,benign,P29400,2242.53,2242.54,-2,3.8,-0.9,166.7,...,-55.18,127.00,147.30,0.664921,0.557955,-10.53,20.30,3.040,2.686,12
2,COL4A5,F13S,benign,P29400,2242.74,2243.54,-2,2.8,-0.8,189.9,...,-32.23,144.35,60.34,0.633114,0.421958,14.24,-84.01,3.733,2.852,13
3,COL4A5,A16S,benign,P29400,2242.94,2242.90,1,1.8,-0.8,88.6,...,-47.05,63.18,62.45,0.522149,0.436713,3.04,-0.73,3.802,3.238,16
4,COL4A5,L19P,benign,P29400,2241.77,2244.15,-3,3.8,-1.6,166.7,...,-39.97,148.26,91.29,0.776230,0.592792,9.07,-56.97,3.004,2.775,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
629,COL4A5,I450M,benign,,2243.69,2243.37,1,4.5,1.9,166.7,...,-12.30,165.25,129.07,0.847436,0.635813,-2.20,-36.18,3.482,2.691,450
630,COL4A5,I444S,benign,,2244.00,2244.10,-2,4.5,-0.8,166.7,...,-21.27,141.28,91.21,0.724513,0.637832,3.00,-50.07,3.683,0.951,444
631,COL4A5,R777H,benign,,2243.79,2243.98,0,-4.5,-3.2,173.4,...,-17.67,246.97,179.07,0.931962,0.829028,-11.54,-67.90,3.912,3.110,777
632,COL4A5,E1084A,benign,,2243.26,2244.01,-1,-3.5,1.8,138.4,...,-11.50,133.50,84.73,0.623832,0.700248,-1.17,-48.77,4.193,2.019,1084


In [22]:
# Leave only the residue number and pathogenicity columns. The explicit
# .copy() makes this an independent frame, so the column assignment below does
# not trigger pandas' SettingWithCopyWarning.
pathogenicity_df = pathogenicity_df[['position', 'pathogenicity']].copy()
# Change pathogenicity to 1 and 0, where 1 is pathogenic and 0 is benign.
# NOTE: labels missing from the mapping become NaN, so re-running this cell
# without reloading pathogenicity_df turns the already-numeric labels into NaN.
pathogenicity_df['pathogenicity'] = pathogenicity_df['pathogenicity'].map({'pathogenic': 1, 'benign': 0})

# Start with every residue unannotated (None), then fill in the residues that
# have variants.
residue_dict = create_dictionary_with_all_residues(gene_length)

b_factor_dict = get_b_factor_dict(residue_dict, pathogenicity_df, gene_length)
print(b_factor_dict)
# Replaces the TEMPFACTOR column of pdb_df (in place) with the per-residue label.
pdb_df = add_b_factor_by_residue(pdb_df, b_factor_dict)

pdb_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pathogenicity_df['pathogenicity'] = pathogenicity_df['pathogenicity'].map({'pathogenic': 1, 'benign': 0})


{1: None, 2: None, 3: None, 4: None, 5: None, 6: None, 7: None, 8: 0, 9: None, 10: None, 11: None, 12: 0, 13: 0, 14: None, 15: None, 16: 0, 17: None, 18: None, 19: 0, 20: None, 21: None, 22: None, 23: None, 24: None, 25: None, 26: None, 27: None, 28: 2, 29: None, 30: None, 31: None, 32: None, 33: None, 34: None, 35: 0, 36: None, 37: None, 38: None, 39: 0, 40: None, 41: 0, 42: None, 43: 0, 44: None, 45: None, 46: None, 47: 0, 48: 1, 49: None, 50: None, 51: 1, 52: None, 53: None, 54: 1, 55: None, 56: None, 57: None, 58: None, 59: None, 60: None, 61: None, 62: None, 63: None, 64: None, 65: None, 66: None, 67: None, 68: None, 69: None, 70: None, 71: None, 72: None, 73: None, 74: 0, 75: None, 76: None, 77: None, 78: None, 79: None, 80: None, 81: None, 82: None, 83: None, 84: None, 85: None, 86: None, 87: None, 88: None, 89: None, 90: None, 91: None, 92: None, 93: None, 94: 0, 95: None, 96: 1, 97: None, 98: None, 99: None, 100: None, 101: None, 102: None, 103: None, 104: None, 105: None, 106

Unnamed: 0,ATOM,NAME,RESNAME,CHAINID,RESIDUESEQ,X,Y,Z,OCCUPANCY,TEMPFACTOR,ELEMENT
0,1,N,MET,A,1,-91.740997,1.479,50.665001,1.0,,N
1,2,CA,MET,A,1,-90.630997,1.526,51.652000,1.0,,C
2,3,C,MET,A,1,-89.237999,1.265,51.056999,1.0,,C
3,4,CB,MET,A,1,-90.931000,0.660,52.884998,1.0,,C
4,5,O,MET,A,1,-88.305000,1.903,51.515999,1.0,,O
...,...,...,...,...,...,...,...,...,...,...,...
11351,11352,CB,THR,A,1685,-31.257999,-7.433,-0.235000,1.0,,C
11352,11353,O,THR,A,1685,-32.944000,-9.553,0.961000,1.0,,O
11353,11354,CG2,THR,A,1685,-29.969999,-6.648,-0.475000,1.0,,C
11354,11355,OG1,THR,A,1685,-31.139000,-8.676,-0.882000,1.0,,O


In [23]:
# Create a bfactor array with one entry per atom row of pdb_df: look up each
# row's RESIDUESEQ in b_factor_dict and use 3 for residues that have no
# annotation (None). The replacement happens during the same pass, so the list
# is never searched with list.index() or modified while being iterated.
bfactor_array = []
for residue_number in pdb_df['RESIDUESEQ']:
    bfactor = b_factor_dict[residue_number]
    bfactor_array.append(3 if bfactor is None else bfactor)

print(bfactor_array)

[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 

In [24]:
# Persist the per-atom B-factor values as a single-column CSV. The default
# integer index is written as well, and the Analysis/ directory must exist.
bfactor_csv_path = f'Analysis/{gene}_bfactor.csv'
bfactor_df = pd.DataFrame(data={'Bfactor': bfactor_array})
bfactor_df.to_csv(bfactor_csv_path)