This notebook is for extracting information from the files created by piping the DockQ results to text files.

In [1]:
import os
import pandas as pd

In [2]:
# Define a function for each line of the file

def get_ids(line):
    # Line looks like: "Model  : ./autoinhibitory/model/{uniprot}_{pdb}.pdb"

    id_dict = {}
    fp = line.split()[2]
    fn = fp.split('/')[3]
    ids = fn.split('_')
    uniprot = ids[0]
    pdb = ids[1].split('.')[0]

    id_dict['uniprot'] = uniprot
    id_dict['pdb'] = pdb

    return id_dict

def get_fnat(line):
    # Line looks like: "Fnat 0.804 123 correct of 153 native contacts"
    fnat_dict = {}

    fnat = float(line.split()[1])
    fnat_correct = int(line.split()[2])
    fnat_total = int(line.split()[5])

    fnat_dict['fnat'] = fnat
    fnat_dict['fnat_correct'] = fnat_correct
    fnat_dict['fnat_total'] = fnat_total

    return fnat_dict

def get_fnonnat(line):
    # Line looks like: "Fnonnat 0.134 19 non-native of 142 model contacts"
    fnonnat_dict = {}

    fnonnat = float(line.split()[1])
    fnonnat_nnative = int(line.split()[2])
    fnonnat_model = int(line.split()[5])

    fnonnat_dict['fnonnat'] = fnonnat
    fnonnat_dict['fnonnat_nnative'] = fnonnat_nnative
    fnonnat_dict['fnonnat_model'] = fnonnat_model

    return fnonnat_dict

def get_irms(line):
    # Line looks like: "iRMS 1.297"
    irms_dict = {}

    irms = float(line.split()[1])

    irms_dict['irms'] = irms

    return irms_dict

def get_lrms(line):
    # Line looks like: "LRMS 1.027"
    lrms_dict = {}

    lrms = float(line.split()[1])

    lrms_dict['lrms'] = lrms

    return lrms_dict

def get_dockq(line):
    # Line looks like: "DockQ 0.421"
    dockq_dict = {}

    dockq = float(line.split()[1])

    dockq_dict['dockq'] = dockq

    return dockq_dict

def get_info(file_path):
    with open(file_path, 'r') as f:
        lines = f.readlines()

    info = {}
    for line in lines:
        if line.startswith('Model'):
            info.update(get_ids(line))

        elif line.startswith('Fnat'):
            info.update(get_fnat(line))

        elif line.startswith('Fnonnat'):
            info.update(get_fnonnat(line))

        elif line.startswith('iRMS'):
            info.update(get_irms(line))

        elif line.startswith('LRMS'):
            info.update(get_lrms(line))
        
        elif line.startswith('DockQ'):
            info.update(get_dockq(line))

    return info

def capri_class(score):
    if score >= 0.80:
        return 'High'
    elif 0.80 > score >= 0.49:
        return 'Medium'
    elif 0.49 > score >= 0.23:
        return 'Acceptable'
    else:
        return 'Incorrect'

In [3]:
# Main function for autoinhibitory proteins

folder_path = '../DockQ/results/autoinhibitory/'

# List for extracted information
all_info = []

# Iterate over all files in the folder
for file in os.listdir(folder_path):
    if file.endswith('.txt'):
        file_path = os.path.join(folder_path, file)
        info = get_info(file_path)
        all_info.append(info)

# Create a DataFrame
df = pd.DataFrame(all_info)

# Assign capri scores
df['capri'] = df['dockq'].apply(capri_class)

# Save the DataFrame to a CSV file
df.to_csv('./project_pipeline/data/ai_dockq_results.csv', index=False)


# Show the dataframe
df.head()

Unnamed: 0,uniprot,pdb,fnat,fnat_correct,fnat_total,fnonnat,fnonnat_nnative,fnonnat_model,irms,lrms,dockq,capri
0,P28482,4qp6,0.778,63.0,81.0,0.087,6.0,69.0,1.603,3.298,0.705,Medium
1,P62826,5uwh,0.714,5.0,7.0,0.0,0.0,5.0,20.475,40.342,0.254,Acceptable
2,Q8IXJ6,4r8m,0.968,90.0,93.0,0.011,1.0,91.0,0.411,0.77,0.963,High
3,P28482,6gjb,0.561,64.0,114.0,0.2,16.0,80.0,2.498,5.796,0.503,Medium
4,P28482,7nr3,0.545,60.0,110.0,0.167,12.0,72.0,2.445,5.722,0.502,Medium


In [4]:
# Main function for multi-domain proteins

folder_path = '../DockQ/results/multi_domain/'

# List for extracted information
all_info = []

# Iterate over all files in the folder
for file in os.listdir(folder_path):
    if file.endswith('.txt'):
        file_path = os.path.join(folder_path, file)
        info = get_info(file_path)
        all_info.append(info)

# Create a DataFrame
df = pd.DataFrame(all_info)

# Assign capri scores
df['capri'] = df['dockq'].apply(capri_class)

# Save the DataFrame to a CSV file
df.to_csv('./project_pipeline/data/md_dockq_results.csv', index=False)

# Show the dataframe
df.head()

Unnamed: 0,uniprot,pdb,fnat,fnat_correct,fnat_total,fnonnat,fnonnat_nnative,fnonnat_model,irms,lrms,dockq,capri
0,A0A0A0V031,5ups,0.929,130.0,140.0,0.097,14.0,144.0,0.426,0.592,0.95,High
1,P59676,1pyy,0.981,101.0,103.0,0.082,9.0,110.0,0.448,2.013,0.949,High
2,D9N168,8scl,0.981,104.0,106.0,0.028,3.0,107.0,0.291,0.906,0.978,High
3,A0A0H2WY27,7kcx,0.934,185.0,198.0,0.084,17.0,202.0,0.988,1.442,0.868,High
4,A0R0V0,5kei,0.699,144.0,206.0,0.265,52.0,196.0,8.903,20.869,0.29,Acceptable
