In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
# Read the source table from CSV into the working frame `df`
df = pd.read_csv(filepath_or_buffer='./raw/df_no_mod_res.csv')


### Filter out sequences that are too long / short

In [3]:
print('number of rows before: ', len(df))

# Length of every sequence, computed once and reused for the report and the filter
sequence_lengths = df['Sequence'].str.len()

# Report how many sequences exist for each length, ordered by length
print('lengths and count of sequences: ', sequence_lengths.value_counts().sort_index())

# Keep only sequences whose length lies in the inclusive range 29..39
df = df[sequence_lengths.between(29, 39)]

print('number of rows after: ', len(df))

number of rows before:  1318
lengths and count of sequences:  Sequence
16       1
29     150
30      20
31       4
32     279
33       6
34     451
35      41
36       7
37      16
38       8
39     289
41       1
42       2
43       4
44       5
46       1
48       4
49       3
51       1
53       3
58       1
61       1
62       3
64       2
68       1
72       4
74       4
82       1
84       2
107      3
Name: count, dtype: int64
number of rows after:  1271


In [4]:
# Derive pIC50 = -log10(IC50 in mol/L), rounded to two decimals. The 1e-9
# factor converts the IC50 column from nM (the unit named in the renamed
# column label below) to mol/L.
#
# `df` is a boolean-mask selection of the loaded table, so the new column is
# added with `assign` (which returns a new frame) instead of `df[...] = ...`;
# this avoids pandas' SettingWithCopyWarning. `rename` is chained rather than
# applied with inplace=True for the same reason.
df = (
    df
    .assign(pIC50=lambda frame: np.round(-np.log10(frame['IC50'].astype('float64') * 1e-9), 2))
    # Rename IC50 to "Nav1.7 IC50 (nM)"
    .rename(columns={'IC50': 'Nav1.7 IC50 (nM)'})
)

# Write the cleaned, ungrouped table to disk as a pickle
df.to_pickle('./processed/clean_df.pkl')

In [5]:
# Display the frame after the length filter and pIC50 conversion
# (a bare last expression is rendered by the notebook)
df

Unnamed: 0,Sequence ID,Sequence,Assay,Reference,Nav1.7 IC50 (nM),Nav1.2 IC50 (nM),Nav1.4 IC50 (nM),REGION_NOTES,Variants_of,pIC50
0,SEQ ID NO: 1 (GpTx-1),DCLGFMRKCIPDNDKCCRPNLVCSRTHKWCKYVF,Unknown,CN109517041,90.0,,3.7,,GpTx-1,7.05
1,SEQ ID NO: 3,DCLGAFRKCIPDNDKCCRPNLVCSRLHRWCKYVF,Unknown,CN109517041,1.6,,1900.0,,GpTx-1,8.80
2,SEQ ID NO: 4,DCLGFMRKCEPDNDKCCRPNLVCSRTHKWCKYVF,Unknown,CN109517041,2.1,,1300.0,,GpTx-1,8.68
3,SEQ ID NO: 5,DCLGFMRKCIEDNDKCCRPNLVCSRTHKWCKYVF,Unknown,CN109517041,2.5,,1400.0,,GpTx-1,8.60
4,SEQ ID NO: 6,DCLGFMRKCIPDNDKCCKPNLVCSRTHKWCKYVF,Unknown,CN109517041,1.6,,5100.0,,GpTx-1,8.80
...,...,...,...,...,...,...,...,...,...,...
1313,Seq.id_21;[K24D]Pnc1a,DCRYMFGDCEKDEDCCKHLGCKRDMKYCAWDFTFT,FLIPR Tetra,WO2017219081,510.0,38000.0,12000.0,,Pnc1a,6.29
1314,Seq.id_22;[W30A]Pnc1a,DCRYMFGDCEKDEDCCKHLGCKRKMKYCAADFTFT,FLIPR Tetra,WO2017219081,7000.0,24000.0,12000.0,,Pnc1a,5.15
1315,Seq.id_18;[Y4A]Pnc1a,DCRAMFGDCEKDEDCCKHLGCKRKMKYCAWDFTFT,FLIPR Tetra,WO2017219081,934.0,8000.0,12000.0,,Pnc1a,6.03
1316,Seq.id_19;[E10K]Pnc1a,DCRYMFGDCKKDEDCCKHLGCKRKMKYCAWDFTFT,FLIPR Tetra,WO2017219081,41.0,172.0,2000.0,,Pnc1a,7.39


In [6]:
# Collapse rows that share the same Sequence, Assay and REGION_NOTES into a
# single row. dropna=False keeps groups whose key contains NaN.
group_keys = ['Sequence', 'Assay', 'REGION_NOTES']

# How each remaining column is reduced within a group
aggregations = {
    'Variants_of': 'max',
    'Reference': 'max',
    'Sequence ID': list,
    'pIC50': 'median',
    'Nav1.2 IC50 (nM)': 'median',
    'Nav1.4 IC50 (nM)': 'median',
    'Nav1.7 IC50 (nM)': 'median',
}

grouped_rows = df.groupby(group_keys, dropna=False)
df_grouped = grouped_rows.agg(aggregations).reset_index()

df_grouped

Unnamed: 0,Sequence,Assay,REGION_NOTES,Variants_of,Reference,Sequence ID,pIC50,Nav1.2 IC50 (nM),Nav1.4 IC50 (nM),Nav1.7 IC50 (nM)
0,ACKGVFDACTPGKNECCPNRVCSDKHKWCKWKL,PatchXpress,,GpTx-1,WO2012125973,[529],6.14,,24890.0,717.0
1,ACLGFMRKCIPDNDKCCRPNLVCSRTHKWCKYVF,IonWorks Quattro,['C-term Amide'],GpTx-1,WO2012125973,[20],7.00,,1680.0,100.0
2,ACQCQKWMQTCDAERKCCEGFSCTLWCKKKLW,FLIPR Tetra,,Protoxin II,WO2016140859,[115],7.54,,,28.6
3,ACQKWMWTCDSKRACCEGLRCKLWCRKII,IonWorks Quattro,['C-term Amide'],JzTx-V,WO2014165277,[3],8.40,,70.0,4.0
4,ADCLGFMRKCIPDNDKCCRPNLVCSRTHKWCKYVF,IonWorks Quattro,['C-term Amide'],GpTx-1,WO2012125973,[100],6.43,,7070.0,370.0
...,...,...,...,...,...,...,...,...,...,...
1225,YCQKWMWTCDSKRRCCEGLRCKLWCRKII,IonWorks Quattro,['C-term Amide'],JzTx-V,WO2014165277,[100],8.10,,80.0,8.0
1226,YCQKWMWTCDSRRACCEGLRCKLWCRKII,IonWorks Quattro,['C-term Amide'],JzTx-V,WO2014165277,[99],8.10,,150.0,8.0
1227,YCQKYMWTCDSKRACCEGLECKLWCRKYI,IonWorks Quattro,['C-term Amide'],JzTx-V,WO2014165277,"[194, 194]",5.82,,5000.0,1705.0
1228,YCQRWMWTCDSKRACCEGLRCKLWCRKII,IonWorks Quattro,['C-term Amide'],JzTx-V,WO2014165277,[92],7.85,,160.0,14.0


In [7]:
# Write the grouped table to disk as a pickle
df_grouped.to_pickle(path='./processed/clean_df_grouped.pkl')

In [8]:
# Count how many rows carry each value of the 'Variants_of' column
variant_counts = df['Variants_of'].value_counts()
print(variant_counts)

Variants_of
GpTx-1           497
Protoxin II      320
Huwentoxin-IV    295
JzTx-V           144
Pnc1a             15
Name: count, dtype: int64


### Generate the exploded + aligned sequences

In [9]:
def transform_data(df, anchor='CC'):
    """Explode each sequence into one column per character, aligned on a motif.

    For every value in ``df['Sequence']`` the first occurrence of ``anchor`` is
    located and each character is stored under a column named after its offset
    from that position (as a string): the first character of the anchor is
    column ``'0'``, characters before it get negative offsets and characters
    after it positive ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Sequence'`` column of strings.
    anchor : str, default 'CC'
        Motif whose first occurrence defines column ``'0'``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with a fresh positional index
        (0..n-1) rather than the index of ``df``. Columns are the string
        offsets sorted numerically; offsets a sequence does not cover are NaN.

    Warns
    -----
    UserWarning
        If at least one sequence does not contain ``anchor``. ``str.find``
        returns -1 in that case, so such a sequence occupies columns
        ``'1'`` .. ``str(len(sequence))`` and is not aligned with the other
        rows. The placement is kept for backward compatibility; the warning
        makes it visible.
    """
    import warnings  # local import so the cell does not depend on the import cell

    transformed_rows = []
    unanchored = []

    # Only the 'Sequence' column is needed, so iterate over it directly
    for sequence in df['Sequence']:
        anchor_index = sequence.find(anchor)
        if anchor_index == -1:
            unanchored.append(sequence)

        # Map "offset from the anchor" -> character for this sequence
        transformed_rows.append(
            {str(position - anchor_index): residue for position, residue in enumerate(sequence)}
        )

    if unanchored:
        warnings.warn(
            f"{len(unanchored)} sequence(s) do not contain the anchor {anchor!r} "
            f"and are not aligned with the other rows (first: {unanchored[0]!r})",
            stacklevel=2,
        )

    # Build the frame in one go; missing offsets become NaN
    transformed_df = pd.DataFrame(transformed_rows)

    # Column names are strings, so sort them by their integer value
    sorted_columns = sorted(transformed_df.columns, key=int)
    return transformed_df[sorted_columns]


In [10]:
test = transform_data(df)

print("Number of Sequences before filter: ", len(test))

# NOTE(review): `test` has a positional 0..n-1 index, while `df` kept its
# original row labels after the length filter, so rows of `test` and `df`
# cannot be matched by index label - confirm downstream code accounts for this.

# Offsets (relative to the alignment anchor) that an accepted sequence must
# not reach. NOTE(review): 22 and -18 are hard-coded for this dataset - confirm
# they are still appropriate if the input changes.
# Columns no sequence reaches are skipped, so a dataset without such long
# sequences does not raise KeyError.
boundary_columns = [column for column in ('22', '-18') if column in test.columns]

# Keep rows that are empty (NaN) at every boundary column
test = test[test[boundary_columns].isnull().all(axis=1)]

# Drop columns that no remaining row uses
test = test.dropna(axis=1, how='all')

print("Number of Sequences after filter: ", len(test))

Number of Sequences before filter:  1271
Number of Sequences after filter:  1260
