In [10]:
# import libraries 
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

from Bio.SeqUtils.ProtParam import ProteinAnalysis
from matplotlib import patches
from matplotlib_venn import venn2, venn3
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import fdrcorrection

In [11]:
# Directory holding all input files, resolved against the current working directory.
data_path = "{}/Datasets/".format(os.getcwd())

# Import UniProt data set

In [12]:
# Read the semicolon-separated table; the file's own header row is replaced by the given names.
df = pd.read_csv(
    data_path + "Human_brain_Uniprot_seq.csv",
    sep=";",
    header=0,
    names=["Uniprot", "Sequence"],
)
# Entries without a sequence (obsolete) are removed first, so the string match below never sees NaN.
df = df.dropna(subset=["Sequence"])
# Keep only sequences that contain none of the letters B, U or X.
df = df.loc[~df["Sequence"].str.contains("B|U|X")]
df

Unnamed: 0,Uniprot,Sequence
0,Q8TDC3,MSSGAKEGGGGSPAYHLPHPHPHPPQHAQYVGPYRLEKTLGKGQTG...
1,P48065,MDGKVAVQECGPPAVSWVPEEGEKLDQEDEDQVKDRGQWTNKMEFV...
2,Q9Y250,MGSVSSLISGHSFHSKHCRASQYKLRKSSHLKKLNRYSDGLLRFGF...
3,P0DMW5,MAASAALSAAAAAAALSGLAVRLSRSAAARGSYGAFCKGLTRTLLT...
4,P21579,MVSESHHEALAAPPVTTVATVLPSNATEPASPGEGKEDAFSKLKEK...
...,...,...
2541,Q8IZU8,MALMFTGHLLFLALLMFAFSTFEESVSNYSEWAVFTDDIDQFKTQK...
2542,Q4JDL3,MSSPRDFRAEPVNDYEGNDSEAEDLNFRETLPSSSQENTPRSKVFE...
2543,Q8N4V2,MEEDLFQLRQLPVVKFRRTGESARSEDDTASGEHEVQIEGVHVGLE...
2544,P48426,MATPGNLGSSVLASKTKTKKKHFVAQKVKLFRASDPLLSVLMWGVN...


In [17]:
# Load the precomputed feature table and keep only the identifier plus the
# helix / turn / sheet columns that are merged into the main frame later on.
features = pd.read_csv(
    data_path + "Features/features_human_proteome_no_filtering.csv"
).loc[:, ["id", "helix", "turn", "sheet"]]
features

Unnamed: 0,id,helix,turn,sheet
0,Q8N7X0,0.225555,0.604079,0.170366
1,Q5T1N1,0.183014,0.777512,0.039474
2,Q92667,0.079734,0.805094,0.115172
3,Q5VUY0,0.491400,0.380835,0.127764
4,P62736,0.445623,0.347480,0.206897
...,...,...,...,...
20376,A0PK05,0.316364,0.683636,0.000000
20377,Q9HCN2,0.000000,0.750000,0.250000
20378,A0A0A0MS03,0.026316,0.535088,0.438596
20379,A0A0A6YYK4,0.026087,0.573913,0.400000


# Features

## Sequence length

In [14]:
# Number of characters in each sequence string.
df["Length"] = df["Sequence"].map(len)
df

Unnamed: 0,Uniprot,Sequence,Length
0,Q8TDC3,MSSGAKEGGGGSPAYHLPHPHPHPPQHAQYVGPYRLEKTLGKGQTG...,778
1,P48065,MDGKVAVQECGPPAVSWVPEEGEKLDQEDEDQVKDRGQWTNKMEFV...,614
2,Q9Y250,MGSVSSLISGHSFHSKHCRASQYKLRKSSHLKKLNRYSDGLLRFGF...,596
3,P0DMW5,MAASAALSAAAAAAALSGLAVRLSRSAAARGSYGAFCKGLTRTLLT...,78
4,P21579,MVSESHHEALAAPPVTTVATVLPSNATEPASPGEGKEDAFSKLKEK...,422
...,...,...,...
2541,Q8IZU8,MALMFTGHLLFLALLMFAFSTFEESVSNYSEWAVFTDDIDQFKTQK...,1212
2542,Q4JDL3,MSSPRDFRAEPVNDYEGNDSEAEDLNFRETLPSSSQENTPRSKVFE...,420
2543,Q8N4V2,MEEDLFQLRQLPVVKFRRTGESARSEDDTASGEHEVQIEGVHVGLE...,548
2544,P48426,MATPGNLGSSVLASKTKTKKKHFVAQKVKLFRASDPLLSVLMWGVN...,406


## Basic amino acid properties

In [15]:
def protein_analysis(df):
    """Append sequence-derived physicochemical features to one protein record.

    Written for row-wise use, e.g. ``frame.apply(protein_analysis, axis=1)``.

    Parameters
    ----------
    df : pandas.Series
        A single record (one row, despite the parameter name) whose
        "Sequence" entry holds the amino-acid sequence string.

    Returns
    -------
    pandas.Series
        A copy of the record with these entries added (or overwritten):
        "Molecular weight", one entry per letter in ``amino_acids`` (the value
        reported by ``get_amino_acids_percent()`` for that letter),
        "Isoelectric point", "Aromaticity" and "Instability index".

    Raises
    ------
    KeyError
        If "Sequence" is missing from the record, or if a letter in
        ``amino_acids`` is absent from the composition dictionary.
    """
    # Work on a copy: pandas does not support functions that mutate the object
    # handed to them by DataFrame.apply, so the incoming row is left untouched.
    row = df.copy()

    PA = ProteinAnalysis(row["Sequence"])

    # molecular weight
    row["Molecular weight"] = PA.molecular_weight()

    # amino acid proportions, one column per residue letter
    amino_acids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']
    aa_dict = PA.get_amino_acids_percent()

    for aa in amino_acids:
        row[aa] = aa_dict[aa]

    # isoelectric point
    row["Isoelectric point"] = PA.isoelectric_point()

    # aromaticity
    row["Aromaticity"] = PA.aromaticity()

    # instability index
    row["Instability index"] = PA.instability_index()

    return row

# Run the per-protein analysis on every row; each returned Series becomes one row of the result.
df = df.apply(protein_analysis, axis="columns")
df

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,Q,R,S,T,V,W,Y,Isoelectric point,Aromaticity,Instability index
0,Q8TDC3,MSSGAKEGGGGSPAYHLPHPHPHPPQHAQYVGPYRLEKTLGKGQTG...,778,85085.7493,0.051414,0.012853,0.043702,0.065553,0.029563,0.092545,...,0.034704,0.078406,0.109254,0.035990,0.051414,0.005141,0.020566,9.387580,0.055270,62.915039
1,P48065,MDGKVAVQECGPPAVSWVPEEGEKLDQEDEDQVKDRGQWTNKMEFV...,614,69367.5176,0.052117,0.037459,0.030945,0.043974,0.083062,0.081433,...,0.032573,0.032573,0.066775,0.050489,0.074919,0.029316,0.047231,5.960671,0.159609,37.459805
2,Q9Y250,MGSVSSLISGHSFHSKHCRASQYKLRKSSHLKKLNRYSDGLLRFGF...,596,66612.1565,0.060403,0.011745,0.041946,0.104027,0.023490,0.072148,...,0.083893,0.062081,0.110738,0.030201,0.033557,0.003356,0.018456,6.636487,0.045302,54.359581
3,P0DMW5,MAASAALSAAAAAAALSGLAVRLSRSAAARGSYGAFCKGLTRTLLT...,78,8388.8176,0.230769,0.012821,0.012821,0.012821,0.064103,0.051282,...,0.012821,0.102564,0.089744,0.038462,0.064103,0.012821,0.038462,11.253811,0.115385,33.315513
4,P21579,MVSESHHEALAAPPVTTVATVLPSNATEPASPGEGKEDAFSKLKEK...,422,47572.5391,0.063981,0.014218,0.061611,0.082938,0.045024,0.059242,...,0.033175,0.018957,0.037915,0.054502,0.082938,0.009479,0.026066,8.256547,0.080569,29.304265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2541,Q8IZU8,MALMFTGHLLFLALLMFAFSTFEESVSNYSEWAVFTDDIDQFKTQK...,1212,139235.8716,0.067657,0.009076,0.045380,0.048680,0.063531,0.055281,...,0.042079,0.043729,0.073432,0.048680,0.059406,0.027228,0.044554,8.481477,0.135314,37.266708
2542,Q4JDL3,MSSPRDFRAEPVNDYEGNDSEAEDLNFRETLPSSSQENTPRSKVFE...,420,48422.3273,0.047619,0.019048,0.057143,0.083333,0.052381,0.045238,...,0.042857,0.059524,0.085714,0.042857,0.059524,0.011905,0.040476,5.543530,0.104762,38.816929
2543,Q8N4V2,MEEDLFQLRQLPVVKFRRTGESARSEDDTASGEHEVQIEGVHVGLE...,548,60768.4670,0.085766,0.023723,0.031022,0.062044,0.056569,0.082117,...,0.029197,0.047445,0.072993,0.056569,0.082117,0.031022,0.027372,5.603040,0.114964,42.004015
2544,P48426,MATPGNLGSSVLASKTKTKKKHFVAQKVKLFRASDPLLSVLMWGVN...,406,46224.0440,0.056650,0.009852,0.068966,0.073892,0.046798,0.051724,...,0.027094,0.039409,0.066502,0.044335,0.068966,0.002463,0.039409,6.504620,0.088670,36.958621


## Structural features (NetSurfP-2.0)

In [20]:
# Inner join on the accession: rows of df without a matching features["id"] are dropped.
# The join key from the right-hand table duplicates "Uniprot", so it is removed afterwards.
df_new = pd.merge(
    df,
    features,
    how="inner",
    left_on="Uniprot",
    right_on="id",
).drop(columns=["id"])
df_new

Unnamed: 0,Uniprot,Sequence,Length,Molecular weight,A,C,D,E,F,G,...,T,V,W,Y,Isoelectric point,Aromaticity,Instability index,helix,turn,sheet
0,Q8TDC3,MSSGAKEGGGGSPAYHLPHPHPHPPQHAQYVGPYRLEKTLGKGQTG...,778,85085.7493,0.051414,0.012853,0.043702,0.065553,0.029563,0.092545,...,0.035990,0.051414,0.005141,0.020566,9.387580,0.055270,62.915039,0.241645,0.641388,0.116967
1,P48065,MDGKVAVQECGPPAVSWVPEEGEKLDQEDEDQVKDRGQWTNKMEFV...,614,69367.5176,0.052117,0.037459,0.030945,0.043974,0.083062,0.081433,...,0.050489,0.074919,0.029316,0.047231,5.960671,0.159609,37.459805,0.700326,0.289902,0.009772
2,Q9Y250,MGSVSSLISGHSFHSKHCRASQYKLRKSSHLKKLNRYSDGLLRFGF...,596,66612.1565,0.060403,0.011745,0.041946,0.104027,0.023490,0.072148,...,0.030201,0.033557,0.003356,0.018456,6.636487,0.045302,54.359581,0.498322,0.473154,0.028523
3,P0DMW5,MAASAALSAAAAAAALSGLAVRLSRSAAARGSYGAFCKGLTRTLLT...,78,8388.8176,0.230769,0.012821,0.012821,0.012821,0.064103,0.051282,...,0.038462,0.064103,0.012821,0.038462,11.253811,0.115385,33.315513,0.692308,0.307692,0.000000
4,P21579,MVSESHHEALAAPPVTTVATVLPSNATEPASPGEGKEDAFSKLKEK...,422,47572.5391,0.063981,0.014218,0.061611,0.082938,0.045024,0.059242,...,0.054502,0.082938,0.009479,0.026066,8.256547,0.080569,29.304265,0.080569,0.625592,0.293839
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2537,Q8IZU8,MALMFTGHLLFLALLMFAFSTFEESVSNYSEWAVFTDDIDQFKTQK...,1212,139235.8716,0.067657,0.009076,0.045380,0.048680,0.063531,0.055281,...,0.048680,0.059406,0.027228,0.044554,8.481477,0.135314,37.266708,0.338284,0.520627,0.141089
2538,Q4JDL3,MSSPRDFRAEPVNDYEGNDSEAEDLNFRETLPSSSQENTPRSKVFE...,420,48422.3273,0.047619,0.019048,0.057143,0.083333,0.052381,0.045238,...,0.042857,0.059524,0.011905,0.040476,5.543530,0.104762,38.816929,0.247619,0.588095,0.164286
2539,Q8N4V2,MEEDLFQLRQLPVVKFRRTGESARSEDDTASGEHEVQIEGVHVGLE...,548,60768.4670,0.085766,0.023723,0.031022,0.062044,0.056569,0.082117,...,0.056569,0.082117,0.031022,0.027372,5.603040,0.114964,42.004015,0.682482,0.317518,0.000000
2540,P48426,MATPGNLGSSVLASKTKTKKKHFVAQKVKLFRASDPLLSVLMWGVN...,406,46224.0440,0.056650,0.009852,0.068966,0.073892,0.046798,0.051724,...,0.044335,0.068966,0.002463,0.039409,6.504620,0.088670,36.958621,0.248768,0.571429,0.179803


## Subcellular location prediction

## Domains

## PTM prediction

## Signal peptide

In [None]:
# Parse the tab-separated prediction table: the first two lines of the file are
# skipped and the column names are supplied here instead of being read from it.
signalp = pd.read_csv(
    data_path + "Features/prediction_results.txt",
    sep="\t",
    skiprows=2,
    header=None,
    names=["Uniprot", "Prediction", "Likelihood-Other", "Likelihood-SP", "CS Position"],
    index_col=False,
)
signalp

In [None]:
# How many entries fall into each distinct value of the "Prediction" column.
signalp.loc[:, "Prediction"].value_counts()

In [None]:
def get_uniprot(string):
    """Return the middle field of a three-part, pipe-delimited identifier.

    Raises ValueError (from the unpacking) unless ``string`` splits on "|"
    into exactly three fields.
    """
    fields = string.split("|")
    _leading, accession, _trailing = fields
    return accession

# Replace each pipe-delimited identifier with its middle field (the Uniprot ID).
# NOTE: this cell is not re-runnable as is. After one pass the values no longer
# contain "|", so get_uniprot raises ValueError on a second execution.
signalp["Uniprot"] = signalp["Uniprot"].map(get_uniprot)
signalp