# 06 Data Re-Processing

Importing libraries.

In [23]:
import pandas as pd
import numpy as np

import yaml

Reading files.

In [2]:
# Load the project's file paths from the shared YAML parameter file.
try:
    with open('../params.yml','r') as file:
        config = yaml.safe_load(file)
except (OSError, yaml.YAMLError) as e:
    # Every later cell indexes `config`, so report the actual cause and stop
    # here instead of continuing with `config` undefined.
    print(f'Error reading the config file: {e}')
    raise

In [3]:
# Display the mapping loaded from params.yml to confirm which paths are in use.
config

{'raw_data_1': '../01_data/PDB_31-07-2011.csv',
 'raw_data_2': '../01_data/PDB_31-12-2012.csv',
 'data': '../01_data/data.csv',
 'new_data': '../01_data/new_data.csv',
 'xy': '../01_data/xy.csv',
 'xy_06': '../01_data/xy_06.csv',
 'xy_enc': '../01_data/xy_enc.csv'}

In [4]:
# Read both raw PDB exports; the first CSV column is dropped from each frame.
raw_frames = [
    pd.read_csv(config[key]).iloc[:, 1:]
    for key in ('raw_data_1', 'raw_data_2')
]
data_1, data_2 = raw_frames

In [5]:
# Report the (rows, columns) shape of each raw frame before they are combined.
print(data_1.shape,data_2.shape)

(17608, 4) (5877, 4)


In [6]:
# Stack the two raw exports row-wise into a single frame.
data = pd.concat((data_1, data_2), axis='index')

In [10]:
# Shape before and after removing exact duplicate records (first occurrence kept,
# index renumbered from 0).
print(data.shape)
data = data.drop_duplicates(keep='first', ignore_index=True)
print(data.shape)

(23485, 4)
(21092, 4)


In [7]:
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,pdb_id,seq,sst3,sst8
0,12asA,AYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLS...,CHHHHHHHHHHHHHHHHHHHHHHHCEEECCCCCEEECCCCCCCCCC...,CHHHHHHHHHHHHHHHHHHHHHHHCEEECCCCSEEETTSSCSCCTT...
1,16vpA,SRMPSPPMPVPPAALFNRLLDDLGFSAGPALCTMLDTWNEDLFSAL...,CCCCCCCCCCCHHHHHHHHHHHHCCCCHHHHHHHHHHCCCCCCCCC...,CCSCCCCCCCCHHHHHHHHHHHHTCTTHHHHHHHHHHCCCCCSTTS...
2,1914A,MVLLESEQFLTELTRLFQKCRSSGSVFITLKKYDEGLEPAENKCLL...,CCEECHHHHHHHHHHHHHHCCCCCCEEEEEEEECCCCCCCCCEEEE...,CCEECHHHHHHHHHHHHHHTSSSCCEEEEEEEECCCCCCCCCEEEE...
3,1a0iA,VNIKTNPFKAVSFVESAIKKALDNAGYLIAEIKYDGVRGNICVDNT...,CCCCCCCEEEEECCHHHHHHHHHHHCCEEEEECCCCEEEEEEEECC...,CTTCCCCEEEEECCHHHHHHHHHHHSSEEEEECCCSEEEEEEEETT...
4,1a0pA,QDLARIEQFLDALWLEKNLAENTLNAYRRDLSMMVEWLHHRGLTLA...,CHHHHHHHHHHHHHHCCCCCHHHHHHHHHHHHHHHHHHHHCCCCCC...,CHHHHHHHHHHHHHHTTCSCHHHHHHHHHHHHHHHHHHHHTSCCTT...


#### Remarks:
pdb_id : protein reference\
seq: amino acid sequence\
sst3: secondary structure labeled with 3 categories:\
&emsp; H - Helix\
&emsp; E - β-strand\
&emsp; C - Irregular elements\
 \
sst8: secondary structure labeled with 8 categories:\
&emsp; H - α-helix\
&emsp; G - 3-helix\
&emsp; I - π-helix\
 \
&emsp; E - β-strand\
&emsp; B - β-bridge\
 \
&emsp; C - Loops and irregular elements (corresponding to the blank characters output by DSSP)\
&emsp; T - Turn\
&emsp; S - Bend\

## Data Processing

Build a dataset with one row per amino acid: the residue itself, the two residues before it and the two after it, plus its secondary-structure label (3-class and 8-class).

In [25]:
%%writefile -a functions.py

import pandas as pd

def split_columns(data):
    '''Break each protein into sliding 5-residue windows, one row per central residue.

    For every position n that has two neighbours on each side, one row is
    emitted holding the residues at n-2 .. n+2 together with the 3-state and
    8-state secondary-structure labels found at position n.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'seq', 'sst3' and 'sst8'. The three strings
        of a row are indexed with the same positions.

    Returns
    -------
    pandas.DataFrame
        Columns ['AA-2','AA-1','AA','AA+1','AA+2','y3','y8'] with a fresh
        0..n-1 index. Sequences shorter than 5 residues contribute no rows,
        and an input without rows yields an empty frame with these columns.

    Raises
    ------
    IndexError
        If a label string is shorter than the positions read from its sequence.
    '''
    columns = ['AA-2','AA-1','AA','AA+1','AA+2','y3','y8']

    # Collect plain lists and build the frame once. Growing a DataFrame with
    # pd.concat for every residue is quadratic; the earlier version of this
    # function also appended a second copy of all rows at the end and left
    # every row with index label 0.
    rows = []
    # Iterate by position so the index labels of `data` do not matter.
    for seq, sst3, sst8 in zip(data['seq'], data['sst3'], data['sst8']):
        for n in range(2, len(seq) - 2):
            rows.append([seq[n-2], seq[n-1], seq[n], seq[n+1], seq[n+2],
                         sst3[n], sst8[n]])

    return pd.DataFrame(rows, columns=columns)

Appending to functions.py


In [26]:
# Execute functions.py in this kernel so that split_columns can be called below.
%run functions.py

In [None]:
# Expand the protein table into per-residue window rows (window columns plus y3/y8 labels).
new_data = split_columns(data)

In [None]:
# Feature matrix: the leading window columns of new_data (the trailing label
# columns are left out).
n_window_cols = 5
x = new_data.iloc[:, :n_window_cols]
x.shape

One-hot encoding the amino acid (AA) window columns.

In [7]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every window column, dropping the first category of each
# feature, then wrap the dense result in a DataFrame named after the
# encoder's output features.
encoder = OneHotEncoder(drop='first')
x_sparse = encoder.fit_transform(x)
x_enc = pd.DataFrame(
    x_sparse.toarray(),
    columns=encoder.get_feature_names_out(),
)

In [8]:
# Display the encoded feature frame (pandas truncates the rendering of large frames).
x_enc

Unnamed: 0,AA-2i_C,AA-2i_D,AA-2i_E,AA-2i_F,AA-2i_G,AA-2i_H,AA-2i_I,AA-2i_K,AA-2i_L,AA-2i_M,...,AA+2_N,AA+2_P,AA+2_Q,AA+2_R,AA+2_S,AA+2_T,AA+2_V,AA+2_W,AA+2_X,AA+2_Y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3118978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3118979,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3118980,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3118981,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Attach the two label columns (the last two columns of new_data) to the
# encoded features.
# pd.concat(axis=1) aligns rows on index labels, not on position. x_enc was
# built with a fresh 0..n-1 index, so the labels are given the same index to
# guarantee a row-by-row join regardless of how new_data is indexed.
labels = new_data.iloc[:, -2:].reset_index(drop=True)
assert len(labels) == len(x_enc), 'features and labels must have the same number of rows'
xy = pd.concat([x_enc, labels], axis=1)
xy.head()

Unnamed: 0,AA-2i_C,AA-2i_D,AA-2i_E,AA-2i_F,AA-2i_G,AA-2i_H,AA-2i_I,AA-2i_K,AA-2i_L,AA-2i_M,...,AA+2_Q,AA+2_R,AA+2_S,AA+2_T,AA+2_V,AA+2_W,AA+2_X,AA+2_Y,y3,y8
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,H,H
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,H,H
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,H,H
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,H,H
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,H,H


In [10]:
# Remove repeated (features, labels) rows and persist the encoded dataset.
print(xy.shape)
# Rebind instead of mutating in place so the cell behaves the same on re-run.
xy = xy.drop_duplicates(keep='first', ignore_index=True)
print(xy.shape)
# Take the output path from params.yml (key 'xy_enc') like the input paths,
# rather than keeping a hard-coded copy of it in the notebook.
xy.to_csv(config['xy_enc'], index=False)

(3118983, 102)
(3118983, 102)
