# Data Cleaning and Processing

### Data Cleaning

Importing libraries.

In [1]:
import pandas as pd
import numpy as np

import yaml

Reading files.

In [None]:
# Load the settings used by the cells below (e.g. the raw data locations)
# from the YAML parameter file.
try:
    with open('../params.yml', 'r') as file:
        config = yaml.safe_load(file)
except (OSError, yaml.YAMLError) as e:
    # Report the cause and re-raise: every later cell needs `config`, so
    # carrying on would only postpone the failure to a confusing NameError.
    print(f'Error reading the config file: {e}')
    raise

In [None]:
# Show the parsed configuration.
config

In [None]:
# Read both raw CSV files named in the config, discarding the first column of each.
data_1, data_2 = (
    pd.read_csv(config[key]).iloc[:, 1:]
    for key in ('raw_data_1', 'raw_data_2')
)

In [None]:
# (rows, columns) of the first raw table.
data_1.shape

In [None]:
# (rows, columns) of the second raw table.
data_2.shape

In [None]:
# Stack the two raw tables on top of each other (rows of data_1 first).
data = pd.concat((data_1, data_2), axis='index')

In [None]:
# Keep the first occurrence of every repeated row and renumber the index from 0.
data = data.drop_duplicates(keep='first', ignore_index=True)
data

In [None]:
# Optional checkpoint (disabled): uncomment to write the merged, de-duplicated
# table to CSV without the index column.
#data.to_csv('../01_data/data.csv',index=False)

#### Remarks:
pdb_id : protein reference\
seq: amino acid sequence\
sst3: secondary structure labeled with 3 categories:\
&emsp; H - Helix\
&emsp; E - β-strand\
&emsp; C - Irregular elements\
 \
sst8: secondary structure labeled with 8 categories:\
&emsp; H - α-helix\
&emsp; G - 3-helix\
&emsp; I - π-helix\
 \
&emsp; E - β-strand\
&emsp; B - β-bridge\
 \
&emsp; C - Loops and irregular elements (corresponding to the blank characters output by DSSP)\
&emsp; T - Turn\
&emsp; S - Bend\

### Data Processing

Build a dataset with one row per amino acid: the residue itself, the 2 residues before it and the 2 after it, and its secondary-structure label (3-class and 8-class).

In [None]:
def split_columns(data):
    '''Break each protein into sliding 5-residue windows labelled at the centre.

    For every position ``n`` that has two neighbours on each side, one row is
    produced holding the residues at ``n-2 .. n+2`` (taken from ``seq``) followed
    by the secondary-structure labels at ``n`` (taken from ``sst3`` and ``sst8``).
    Sequences shorter than five residues contribute no rows, and each window is
    emitted exactly once.

    Parameters
    ----------
    data : pd.DataFrame
        Must provide the columns ``'seq'``, ``'sst3'`` and ``'sst8'``. The label
        strings are indexed at the same positions as ``seq``. The row labels of
        ``data`` are irrelevant.

    Returns
    -------
    pd.DataFrame
        Columns ``['AA-2','AA-1','AA','AA+1','AA+2','y3','y8']`` with a fresh
        0..n-1 index, one row per window, in input order. Empty (but with the
        seven columns) when there is nothing to emit.
    '''
    columns = ['AA-2','AA-1','AA','AA+1','AA+2','y3','y8']

    # Collect plain lists and build the frame once: concatenating a one-row
    # DataFrame per residue re-copies the whole accumulated frame each time.
    rows = []
    # Iterate over the values themselves so any index on `data` works.
    for seq, sst3, sst8 in zip(data['seq'], data['sst3'], data['sst8']):
        for n in range(2, len(seq) - 2):
            rows.append([*seq[n-2:n+3], sst3[n], sst8[n]])

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Disabled: builds `new_data` by running split_columns on the first 1000 rows of `data`.
# NOTE(review): `new_data` is used below but is only assigned in commented-out
# cells; on a fresh kernel one of them must be re-enabled first.
#new_data = split_columns(data[0:1000])

In [None]:
# Disabled: loads `new_data` from the CSV named by config['new_data'].
# NOTE(review): `new_data` is used below but is only assigned in commented-out
# cells; on a fresh kernel one of them must be re-enabled first.
#new_data = pd.read_csv(config['new_data'])

In [None]:
# Preview the per-residue table.
# NOTE(review): raises NameError on a fresh kernel unless `new_data` has been
# assigned by re-enabling one of the commented-out cells above.
new_data

In [None]:
# Print the shape before and after dropping repeated rows (first occurrence kept,
# index renumbered from 0).
print(new_data.shape)
new_data = new_data.drop_duplicates(keep='first', ignore_index=True)
print(new_data.shape)

In [None]:
# Write the de-duplicated table to CSV without the index column.
new_data.to_csv('../01_data/new_data.csv',index=False)

The dataset has too many rows to process, so it is downsampled by dropping evenly spaced rows.

In [None]:
# Number of rows to keep after downsizing.
n_keep = 50_000
n_rows = len(new_data)
# Row labels to drop: evenly spaced over 1 .. n_rows-1 (label 0 is never listed),
# as many as needed so that `n_keep` rows remain; nothing is dropped when the
# table already has `n_keep` rows or fewer.
# Assumes `new_data` carries a 0..n_rows-1 index.
skip = np.linspace(1, n_rows - 1, max(n_rows - n_keep, 0), dtype=int)

In [None]:
# Inspect the row labels selected for removal.
skip

In [None]:
# Remove the rows whose index labels are listed in `skip`, then report what is left.
down_data = new_data.drop(index=skip)
down_data.shape

In [None]:
# Features: the first five columns of the downsized table
# (presumably the residue window AA-2 .. AA+2 — confirm against `new_data`).
x = down_data.iloc[:,0:5]
x

In [None]:
# Targets: every column from the sixth onwards
# (presumably the y3 / y8 labels — confirm against `new_data`).
y = down_data.iloc[:,5:]
y

Encoding AA.

In [None]:
from sklearn.preprocessing import OneHotEncoder

def encoding_x(data):
    '''Encode each row of symbols as a vector of per-symbol counts.

    The distinct values found anywhere in ``data`` are sorted and each one is
    given a position. A row's output holds, at every position, how many of the
    row's cells contain that value, which is the sum of the one-hot vectors of
    its cells.

    Parameters
    ----------
    data : pd.DataFrame
        Table whose cells are all category symbols (e.g. one amino-acid letter
        per cell). Missing values are not supported.

    Returns
    -------
    pd.DataFrame
        Float frame with one row per input row (index 0..n-1) and one column
        per distinct symbol (columns 0..k-1, in sorted symbol order). Empty
        when ``data`` has no rows.

    Raises
    ------
    ValueError
        If ``data`` contains missing values.
    '''
    n_rows, n_cols = data.shape

    # One integer code per cell, read row by row; sort=True makes code i refer
    # to the i-th symbol in sorted order.
    codes, aa_types = pd.factorize(data.to_numpy().ravel(), sort=True)
    if (codes < 0).any():
        # factorize marks missing values with -1, which would otherwise be
        # counted silently in the last column.
        raise ValueError('encoding_x does not support missing values')

    # Adding 1 at (row, code) for every cell equals summing the one-hot vectors
    # of the row's cells. np.add.at accumulates repeated (row, code) pairs.
    counts = np.zeros((n_rows, len(aa_types)))
    row_of_cell = np.repeat(np.arange(n_rows), n_cols)
    np.add.at(counts, (row_of_cell, codes), 1.0)

    return pd.DataFrame(counts)

In [None]:
# Turn the feature columns into one numeric vector per row
# (the sum of the one-hot codes of the row's cells).
x_enc = encoding_x(x)

In [None]:
# Attach the label columns to the encoded features. `x_enc` is numbered 0..n-1
# while `y` keeps the sparse row labels left by the downsizing, so `y` is
# renumbered first; otherwise concat would align on labels and pad with NaN.
xy = pd.concat([x_enc, y.reset_index(drop=True)], axis=1)

In [None]:
# NOTE(review): `clean_data` is not defined anywhere in this notebook; this cell
# is assumed to be a repeat of the encoding of the feature columns `x` — confirm.
x_enc = encoding_x(x)

In [None]:
# Attach the label columns to the encoded features. `x_enc` is numbered 0..n-1
# while `y` keeps the sparse row labels left by the downsizing, so `y` is
# renumbered first; otherwise concat would align on labels and pad with NaN.
xy = pd.concat([x_enc, y.reset_index(drop=True)], axis=1)

In [None]:
# Print the shape before and after dropping repeated rows (first occurrence kept,
# index renumbered from 0), then save the result as CSV without the index column.
print(xy.shape)
xy = xy.drop_duplicates(keep='first', ignore_index=True)
print(xy.shape)
xy.to_csv('../01_data/xy.csv', index=False)