# Secondary structure

In [62]:
import pandas as pd
from scipy.stats import entropy
import scipy

## Read in data

### RSA data 

In [39]:
RSA = pd.read_csv("https://raw.githubusercontent.com/hutch-gwc/Rosalind/master/rsa.csv")  # read the data
RSA.drop(['index'], axis=1, inplace=True)  # drop column we don't need
RSA = RSA[RSA.protein == "Flu"]  # we only want the flu protein
RSA.drop(['protein'], axis=1, inplace=True)  # we don't need the protein column any more
RSA.head()

Unnamed: 0,site,RSA
530,18,0.170984
531,19,0.168605
532,20,0.0
533,21,0.077844
534,22,0.005076


### Preferences

In [40]:
prefs = pd.read_csv("https://raw.githubusercontent.com/hutch-gwc/Rosalind/master/HA_WSN_avg.csv")  # read in the data 
prefs.head()

Unnamed: 0,A,C,D,E,F,G,H,I,K,L,...,N,P,Q,R,S,T,V,W,Y,site
0,0.050522,0.292074,0.041984,0.069723,0.112054,0.070729,0.207317,0.172276,0.175882,0.170153,...,0.228765,0.095909,0.145664,0.068147,0.1692,0.216658,0.183667,0.070102,0.282156,1
1,0.299156,0.092621,0.193645,0.147577,0.09338,0.048215,0.11077,0.303162,0.100233,0.172399,...,0.202556,0.130529,0.103289,0.232608,0.080003,0.153042,0.20986,0.049683,0.175782,2
2,0.282239,0.05426,0.075008,0.115239,0.052471,0.07093,0.090155,0.078627,0.245924,0.158548,...,0.268678,0.086427,0.165511,0.145599,0.226276,0.227845,0.16936,0.081525,0.108752,3
3,0.285097,0.571661,0.023748,0.078999,0.264428,0.101803,0.186965,0.092665,0.074159,0.344304,...,0.030057,0.042051,0.09353,0.064137,0.017293,0.103522,0.148757,0.218695,0.103401,4
4,0.185256,0.157839,0.127189,0.069883,0.49054,0.035142,0.066323,0.136314,0.044657,0.525195,...,0.048845,0.071764,0.05858,0.098211,0.033002,0.051313,0.269358,0.093787,0.249184,5


## Merge RSA and Prefs data

In [72]:
# we are going to use a `pandas` function called "merge" to put all of our data into one dataframe
# `on` asks what do we want to match in the two dataframes? 
# in our case, the "site" in RSA is the same as the "site" in prefs 
# `how` tells `pandas` what to do if there is a site in one dataframe but not in another. 
# `inner` says "drop these sites"
df = pd.merge(RSA, prefs, on='site', how='inner')
df.head()

Unnamed: 0,site,RSA,A,C,D,E,F,G,H,I,...,M,N,P,Q,R,S,T,V,W,Y
0,18,0.170984,0.031051,0.032774,0.066396,0.130986,0.077862,0.010545,0.23071,0.057492,...,0.201627,0.110354,0.016166,0.196481,0.325941,0.33675,0.297381,0.198282,0.063771,0.161101
1,19,0.168605,0.035575,0.074407,0.09706,0.050744,0.068674,0.030246,0.057919,1.135448,...,0.155037,0.046413,0.131968,0.050284,0.059777,0.032668,0.205351,0.276241,0.083863,0.066269
2,20,0.0,0.057711,1.578429,0.123115,0.05694,0.157598,0.053046,0.048184,0.054634,...,0.086453,0.061416,0.055079,0.060295,0.098218,0.047429,0.044885,0.062976,0.139656,0.110007
3,21,0.077844,0.089144,0.100696,0.064763,0.094098,0.055839,0.038984,0.092634,1.46951,...,0.126284,0.042902,0.039655,0.082291,0.03998,0.084321,0.099074,0.142382,0.103434,0.050375
4,22,0.005076,0.058867,0.135161,0.13859,0.07987,0.056306,1.476877,0.061189,0.051963,...,0.100659,0.068674,0.029653,0.079329,0.068681,0.084593,0.037853,0.149194,0.105167,0.064609


## Entropy?

In [73]:
amino_acids = [x for x in list(df.columns.values) if x not in ["site", "RSA"]]
df["entropy"] = df[amino_acids].apply(lambda r: tuple(r), axis=1).apply(scipy.array)
df["entropy"] = df["entropy"].apply(lambda x: entropy(x))
df.head()

Unnamed: 0,site,RSA,A,C,D,E,F,G,H,I,...,N,P,Q,R,S,T,V,W,Y,entropy
0,18,0.170984,0.031051,0.032774,0.066396,0.130986,0.077862,0.010545,0.23071,0.057492,...,0.110354,0.016166,0.196481,0.325941,0.33675,0.297381,0.198282,0.063771,0.161101,2.739985
1,19,0.168605,0.035575,0.074407,0.09706,0.050744,0.068674,0.030246,0.057919,1.135448,...,0.046413,0.131968,0.050284,0.059777,0.032668,0.205351,0.276241,0.083863,0.066269,2.361446
2,20,0.0,0.057711,1.578429,0.123115,0.05694,0.157598,0.053046,0.048184,0.054634,...,0.061416,0.055079,0.060295,0.098218,0.047429,0.044885,0.062976,0.139656,0.110007,2.042572
3,21,0.077844,0.089144,0.100696,0.064763,0.094098,0.055839,0.038984,0.092634,1.46951,...,0.042902,0.039655,0.082291,0.03998,0.084321,0.099074,0.142382,0.103434,0.050375,2.152196
4,22,0.005076,0.058867,0.135161,0.13859,0.07987,0.056306,1.476877,0.061189,0.051963,...,0.068674,0.029653,0.079329,0.068681,0.084593,0.037853,0.149194,0.105167,0.064609,2.147961


In [74]:
df = pd.melt(df, id_vars=["site", "RSA", "entropy"], var_name="amino_acid", value_name="pref")
df.head()

Unnamed: 0,site,RSA,entropy,amino_acid,pref
0,18,0.170984,2.739985,A,0.031051
1,19,0.168605,2.361446,A,0.035575
2,20,0.0,2.042572,A,0.057711
3,21,0.077844,2.152196,A,0.089144
4,22,0.005076,2.147961,A,0.058867


## Classify the amino acids

In [77]:
classification = {'R': "polar", 'N': "polar", 'D': "polar", 'C': "polar", 'E': "polar", 'Q': "polar", 'H': "polar",
                 'K': "polar", 'S': "polar", 'T': "polar", 'W': "polar", 'A': "non-polar", 'G': "non-polar",
                 'I': "non-polar", 'L': "non-polar", 'M': "non-polar", 'F': "non-polar", 'P': "non-polar", 'V': "non-polar"}        
classification

{'A': 'non-polar',
 'C': 'polar',
 'D': 'polar',
 'E': 'polar',
 'F': 'non-polar',
 'G': 'non-polar',
 'H': 'polar',
 'I': 'non-polar',
 'K': 'polar',
 'L': 'non-polar',
 'M': 'non-polar',
 'N': 'polar',
 'P': 'non-polar',
 'Q': 'polar',
 'R': 'polar',
 'S': 'polar',
 'T': 'polar',
 'V': 'non-polar',
 'W': 'polar'}