In [1]:
import ast
from itertools import combinations
from coverage_functions import *

# SWISS-MODEL
## Filtering data according to reference proteome & model quality

In [3]:
def filter_swissmodel(file, reference_entries=None, min_length=30, min_qmeandisco=0.7):
    """Keep high-quality SWISS-MODEL models that belong to the reference proteome.

    Filters are applied in this order:
      1. keep rows whose ``provider`` equals ``"SWISSMODEL"`` (PDB rows are dropped);
      2. keep rows whose ``UniProtKB_ac`` is found in ``reference_entries``;
      3. keep models spanning at least ``min_length`` residues (``to - from + 1``);
      4. keep models whose ``qmeandisco_global`` is strictly above ``min_qmeandisco``
         (rows with a missing score fail the comparison and are dropped);
      5. among models sharing (``UniProtKB_ac``, ``from``, ``to``) keep the one with
         the highest ``qmeandisco_global``.

    Parameters
    ----------
    file : pandas.DataFrame
        Table with the columns ``provider``, ``UniProtKB_ac``, ``from``, ``to``
        and ``qmeandisco_global``. The frame passed in is not modified.
    reference_entries : list-like, optional
        Accessions of the reference proteome. When omitted, the notebook-level
        ``proteome.Entry`` is used, so ``proteome`` must already be loaded.
    min_length : int, default 30
        Minimum model length in residues.
    min_qmeandisco : float, default 0.7
        Exclusive lower bound on the QMEANDisCo global score.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, ordered by ascending ``qmeandisco_global``, with two
        added columns: ``in_uniprot`` (True for every returned row) and
        ``model_length``.
    """
    if reference_entries is None:
        # Backward-compatible fallback to the global loaded elsewhere in the notebook.
        reference_entries = proteome.Entry

    # provider filtering: PDB rows are filtered out
    file = file[file["provider"] == "SWISSMODEL"]

    # Filter wrt proteome
    file = file.assign(in_uniprot=file["UniProtKB_ac"].isin(reference_entries))
    file = file[file["in_uniprot"]]

    # Length filtering; assign() builds a new frame instead of writing a column
    # into a filtered frame, which avoids SettingWithCopyWarning.
    file = file.assign(model_length=file["to"] - file["from"] + 1)
    file = file[file["model_length"] >= min_length]

    # Get high quality models (QMEANDisCo global score threshold advised by providers)
    file = file[file["qmeandisco_global"] > min_qmeandisco]

    # Duplicate filtering, keep the highest score. A stable sort makes ties
    # deterministic: among equal scores the row that came last in the input wins.
    file = file.sort_values(by=["qmeandisco_global"], kind="stable").drop_duplicates(
        subset=["UniProtKB_ac", "from", "to"], keep="last"
    )

    return file

In [2]:
# Reference proteome: reviewed Homo sapiens entries from UniProt 2022_04.
# The spreadsheet has already been filtered according to certain thresholds.
proteome = pd.read_excel(
    "processed_data/uniprot/30aa_nounchar_noputative_ref_proteome_protein_existence_filtered_02.xlsx",
    header=0,
)
proteome.shape[0]

18401

In [3]:
# Load the SWISS-MODEL INDEX table (tab-separated). The first six lines of the
# file are skipped, so the next line supplies the column names.
swissmodel = pd.read_csv(
    "raw_data/swissmodel/INDEX",
    sep="\t",
    skiprows=[0, 1, 2, 3, 4, 5],
    header=0,
)  # 163578

In [6]:
# Run the provider / proteome / length / quality / duplicate filters
swissmodel_30aa_hq = filter_swissmodel(file=swissmodel)  # 9541
swissmodel_30aa_hq.shape[0]

9541

In [17]:
# Save the filtered models as a tab-separated file, without the index column
swissmodel_30aa_hq.to_csv(
    "processed_data/swissmodel/swissmodel_30aa_hq.tsv",
    sep="\t",
    index=False,
)

## Coverage Calculation

In [3]:
# The coverage calculation works on the filtered file stored on disk
swissmodel_30aa_hq = pd.read_csv(
    "processed_data/swissmodel/swissmodel_30aa_hq.tsv",
    sep="\t",
)

In [4]:
# Use "Entry" as the accession column name
swissmodel_30aa_hq = swissmodel_30aa_hq.rename(columns={"UniProtKB_ac": "Entry"})

# One [from, to] pair per model row
swissmodel_30aa_hq["combined"] = swissmodel_30aa_hq.apply(
    lambda row: [row["from"], row["to"]],
    axis=1,
)

# Per protein: the list of all its [from, to] pairs ...
data_new1 = swissmodel_30aa_hq.groupby("Entry")["combined"].apply(list).reset_index()
# ... and the first sequence length recorded for it
data_new2 = swissmodel_30aa_hq.groupby("Entry", as_index=False)["uniprot_seq_length"].first()

# Join both per-protein tables on the accession
swissmodel_30aa_hq_cov = data_new1.merge(data_new2, on="Entry")
swissmodel_30aa_hq_cov  # 7886 proteins

Unnamed: 0,Entry,combined,uniprot_seq_length
0,A0A075B6H7,"[[21, 116]]",116
1,A0A075B6H8,"[[19, 117]]",117
2,A0A075B6H9,"[[21, 119]]",119
3,A0A075B6I3,"[[20, 122]]",123
4,A0A075B6I6,"[[21, 117]]",118
...,...,...,...
7881,Q9Y6X9,"[[1, 551]]",1032
7882,Q9Y6Y0,"[[5, 129], [347, 642]]",642
7883,Q9Y6Y9,"[[19, 158]]",160
7884,Q9Y6Z7,"[[132, 275]]",277


In [5]:
# calculate_coverage comes from coverage_functions.py; it is called once per
# protein row with that row's [from, to] pairs and its sequence length.
swissmodel_30aa_hq_cov["Coverage"] = swissmodel_30aa_hq_cov.apply(
    lambda row: calculate_coverage(
        begin_end=row["combined"],
        prot_length=row["uniprot_seq_length"],
    ),
    axis=1,
)
swissmodel_30aa_hq_cov.to_excel(
    "processed_data/swissmodel/swissmodel_30aa_hq_cov.xlsx",
    index=False,
)

In [6]:
# double checking
# The columns are given begin first, end second, as the `res_beg_end` parameter
# name indicates (i.e. 'from' then 'to').
# NOTE(review): get_residue_coverage lives in coverage_functions.py and is not
# visible here - confirm the expected order against its implementation.
swiss_res_cov = get_residue_coverage(swissmodel_30aa_hq, length='uniprot_seq_length', res_beg_end=['from', 'to'])
len(swiss_res_cov)

7886