# Genetic attribution preprocessing

This notebook gives an overview of the [Genetic Engineering Attribution competition on DrivenData](https://www.drivendata.org/competitions/63/genetic-engineering-attribution/) and preprocesses its training data.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from collections import OrderedDict

## Loading data

In [2]:
%%time
# NOTE: absolute, machine-specific project root; adjust when running elsewhere.
parent_dir = "/home/rio/genetic_engineering_attribution"
data_dir = os.path.join(parent_dir,"data")
# os.listdir() returns entries in arbitrary order, so the positions inside
# data_list are not guaranteed to be the same on another machine.
data_list = os.listdir(data_dir)
data_list

CPU times: user 543 µs, sys: 70 µs, total: 613 µs
Wall time: 338 µs


['train_labels.csv',
 'train_values.csv',
 'test_values.csv',
 'submission_format_3TFRxH6.csv']

#### Training features

In [3]:
%%time
# Select the feature file by name: os.listdir() returns entries in arbitrary
# order, so a positional lookup in the directory listing may pick another file.
train_features_path = os.path.join(data_dir, "train_values.csv")
train_features = pd.read_csv(train_features_path)
print("shape of train_features: ", train_features.shape)
train_features.head()

shape of train_features:  (63017, 41)
CPU times: user 2.17 s, sys: 168 ms, total: 2.33 s
Wall time: 2.33 s


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Training targets

In [4]:
%%time
# Select the label file by name: os.listdir() returns entries in arbitrary
# order, so a positional lookup in the directory listing may pick another file.
train_targets_path = os.path.join(data_dir, "train_labels.csv")
train_targets = pd.read_csv(train_targets_path)
print("shape of train_targets: ", train_targets.shape)
train_targets.head()

shape of train_targets:  (63017, 1315)
CPU times: user 6.64 s, sys: 387 ms, total: 7.03 s
Wall time: 7.03 s


Unnamed: 0,sequence_id,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
0,9ZIMC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing

#### One-hot encoding DNA sequences

In [5]:
def dna2dict(dna_seq):
    """
    One-hot encode a DNA string position by position.

    Returns an OrderedDict that maps every 0-based position of the
    upper-cased sequence to a 5-tuple of 0/1 flags, ordered alphabetically
    as (A, C, G, N, T). A character outside that alphabet yields an
    all-zero tuple.
    """
    alphabet = ("A", "C", "G", "N", "T")
    encoded = OrderedDict()
    for position, base in enumerate(dna_seq.upper()):
        encoded[position] = tuple(int(base == letter) for letter in alphabet)
    return encoded
        

In [6]:
# Encode the training sequence stored at position 1 as a quick visual check.
dna_dict = dna2dict(train_features["sequence"].values[1])
dna_dict

OrderedDict([(0, (0, 0, 1, 0, 0)),
             (1, (0, 1, 0, 0, 0)),
             (2, (0, 0, 0, 0, 1)),
             (3, (0, 0, 1, 0, 0)),
             (4, (0, 0, 1, 0, 0)),
             (5, (1, 0, 0, 0, 0)),
             (6, (0, 0, 0, 0, 1)),
             (7, (0, 0, 1, 0, 0)),
             (8, (0, 0, 1, 0, 0)),
             (9, (0, 0, 0, 0, 1)),
             (10, (0, 0, 0, 0, 1)),
             (11, (0, 0, 0, 0, 1)),
             (12, (0, 0, 1, 0, 0)),
             (13, (0, 0, 1, 0, 0)),
             (14, (0, 0, 1, 0, 0)),
             (15, (1, 0, 0, 0, 0)),
             (16, (0, 1, 0, 0, 0)),
             (17, (1, 0, 0, 0, 0)),
             (18, (0, 0, 0, 0, 1)),
             (19, (0, 0, 1, 0, 0)),
             (20, (0, 0, 0, 0, 1)),
             (21, (0, 0, 1, 0, 0)),
             (22, (0, 1, 0, 0, 0)),
             (23, (1, 0, 0, 0, 0)),
             (24, (0, 0, 1, 0, 0)),
             (25, (0, 1, 0, 0, 0)),
             (26, (0, 1, 0, 0, 0)),
             (27, (0, 1, 0, 0, 0)),
  

In [7]:
# Number of encoded positions: dna2dict stores one entry per character of the sequence.
len(dna_dict)

456

In [8]:
# The one-hot tuples alone, in sequence order (positions dropped).
[*dna_dict.values()]

[(0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 0, 0, 1),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0,

#### One-hot encoded DNA sequences as dataframes

In [9]:
def dna2df(dna_seq,seq_id=None, lab_id = None):
    """
    One-hot encode a DNA sequence into a DataFrame with one row per base.

    Parameters
    ----------
    dna_seq : str
        Sequence handed to ``dna2dict`` for encoding.
    seq_id : optional
        When not None, a constant ``sequence_id`` column holding this value
        is placed in front of the base columns.
    lab_id : optional
        When not None, a constant ``lab_id`` column holding this value is
        placed first.

    Returns
    -------
    pandas.DataFrame
        Columns ``[lab_id,] [sequence_id,] A, C, G, N, T``.
    """
    dna_dict = dna2dict(dna_seq)
    dna_df = pd.DataFrame(list(dna_dict.values()), columns=["A","C","G","N","T"])
    # Test against None instead of truthiness so that falsy but valid ids
    # (for example the integer 0) still get their column.
    if seq_id is not None:
        dna_df.insert(0, "sequence_id", [seq_id]*len(dna_df))
    if lab_id is not None:
        # Inserted after sequence_id so that lab_id ends up as the first column.
        dna_df.insert(0, "lab_id", [lab_id]*len(dna_df))
    return dna_df


In [10]:
# Build the per-base frame for the first training sample.
dna_seq = train_features.sequence.values[0]
seq_id = train_features.sequence_id.values[0]
# First column of train_targets whose value in row 0 equals 1.
lab_id = train_targets.columns[train_targets.iloc[0,:]==1][0]
dna_df = dna2df(dna_seq=dna_seq,seq_id=seq_id,lab_id=lab_id)
dna_df

Unnamed: 0,lab_id,sequence_id,A,C,G,N,T
0,RYUA3GVO,9ZIMC,0,1,0,0,0
1,RYUA3GVO,9ZIMC,1,0,0,0,0
2,RYUA3GVO,9ZIMC,0,0,0,0,1
3,RYUA3GVO,9ZIMC,0,0,1,0,0
4,RYUA3GVO,9ZIMC,0,1,0,0,0
...,...,...,...,...,...,...,...
7146,RYUA3GVO,9ZIMC,1,0,0,0,0
7147,RYUA3GVO,9ZIMC,0,1,0,0,0
7148,RYUA3GVO,9ZIMC,0,1,0,0,0
7149,RYUA3GVO,9ZIMC,0,0,1,0,0


In [11]:
# Lab column found for the first training sample.
lab_id

'RYUA3GVO'

#### Generating separate csvs for each lab

In [12]:
%%time
# Output directory for the per-lab CSV files written by split_labs().
split_labs_dir = os.path.join(parent_dir,"split_labs")
# makedirs(exist_ok=True) also creates missing parent directories and does
# nothing when the directory already exists, so this cell is safe to re-run.
os.makedirs(split_labs_dir, exist_ok=True)


CPU times: user 238 µs, sys: 16 µs, total: 254 µs
Wall time: 268 µs


In [13]:
%%time
split_labs_dir = os.path.join(parent_dir,"split_labs")
def split_labs(train_features,train_targets,savedir=split_labs_dir,usecols=None):
    """
    Write one CSV per lab holding the per-base one-hot encoding of every
    training sequence attributed to that lab.

    Parameters
    ----------
    train_features : pandas.DataFrame
        Must provide ``sequence_id`` and ``sequence`` columns. Rows are looked
        up with index labels taken from ``train_targets``, so both frames are
        expected to share the same index.
    train_targets : pandas.DataFrame
        The first column is skipped; every remaining column is treated as a
        lab, with the value 1 marking the rows that belong to it.
    savedir : str or None
        Directory that receives ``<lab_id>.csv``; created when missing.
        When None, nothing is written.
    usecols : list of str or None
        Extra ``train_features`` columns repeated on every base row. None
        selects every column located after ``sequence``; an empty list adds
        no extra columns.

    Returns
    -------
    None
    """
    if usecols is None:
        usecols = list(train_features.columns[train_features.columns.get_loc("sequence")+1:])
    if savedir is not None:
        # Fail early / create the target directory before the long loop starts.
        os.makedirs(savedir, exist_ok=True)
    unique_labs = list(train_targets.columns[1:])
    pbar = tqdm(unique_labs)
    for lab_id in pbar:
        pbar.set_description(f"Processing lab {lab_id}")
        ixs = train_targets.index[train_targets.loc[:,lab_id]==1]
        if len(ixs) == 0:
            # pd.concat raises ValueError on an empty list; nothing to write.
            continue
        df_lab_concat = []
        for ix in ixs:
            row = train_features.loc[ix,:]
            # The lab of the outer loop is used directly: every row in ixs is
            # already known to carry a 1 in this lab's column, so there is no
            # need to rescan the whole target row (and risk picking another
            # column, or overwriting the name used for the output file).
            dna_df = dna2df(row.sequence, row.sequence_id, lab_id)
            if usecols:
                # Repeat the sample-level features once per base row.
                repeat = row.loc[usecols].values.reshape(1,-1)
                repeat = np.repeat(repeat, len(dna_df), axis=0)
                extra_cols_df = pd.DataFrame(repeat, columns=usecols)
                dna_df = pd.concat([dna_df,extra_cols_df],axis=1)
            df_lab_concat.append(dna_df)
        df_lab = pd.concat(df_lab_concat,axis=0,ignore_index=True)
        if savedir is not None:
            savepath = os.path.join(savedir,lab_id+".csv")
            df_lab.to_csv(savepath,index=True)
    
    
    

#### Saving a .csv for each lab

In [None]:
%%time
# Long-running step: encodes every training sequence and writes one CSV per lab.
split_labs(train_features, train_targets, savedir=split_labs_dir, usecols=None)

  0%|          | 0/63017 [00:00<?, ?it/s]
  0%|          | 0/63017 [00:00<?, ?it/s][A

Processing lab 00Q4V31T:   0%|          | 0/1314 [00:00<?, ?it/s][A
Processing lab 00Q4V31T:   0%|          | 1/1314 [00:01<25:03,  1.14s/it][A
Processing lab 012VT4JK:   0%|          | 1/1314 [00:01<25:03,  1.14s/it][A
Processing lab 012VT4JK:   0%|          | 2/1314 [00:05<45:38,  2.09s/it][A
Processing lab 028IO5W2:   0%|          | 2/1314 [00:05<45:38,  2.09s/it][A
Processing lab 028IO5W2:   0%|          | 3/1314 [00:06<40:04,  1.83s/it][A
Processing lab 03GRNN7N:   0%|          | 3/1314 [00:06<40:04,  1.83s/it][A
Processing lab 03GRNN7N:   0%|          | 4/1314 [00:06<28:42,  1.31s/it][A
Processing lab 03Y3W51H:   0%|          | 4/1314 [00:06<28:42,  1.31s/it][A
Processing lab 03Y3W51H:   0%|          | 5/1314 [00:36<3:30:38,  9.65s/it][A
Processing lab 09MQV1TY:   0%|          | 5/1314 [00:36<3:30:38,  9.65s/it][A
Processing lab 09MQV1TY:   0%|          | 6/1314 [00:38<2:47:13,  7.67

Processing lab 1LBGAU5Z:   4%|▍         | 52/1314 [02:49<43:06,  2.05s/it][A
Processing lab 1LBGAU5Z:   4%|▍         | 53/1314 [02:50<35:09,  1.67s/it][A
Processing lab 1NXRMDN6:   4%|▍         | 53/1314 [02:50<35:09,  1.67s/it][A
Processing lab 1NXRMDN6:   4%|▍         | 54/1314 [02:51<32:58,  1.57s/it][A
Processing lab 1OQJ21E9:   4%|▍         | 54/1314 [02:51<32:58,  1.57s/it][A
Processing lab 1OQJ21E9:   4%|▍         | 55/1314 [02:53<31:09,  1.48s/it][A
Processing lab 1OWZDF82:   4%|▍         | 55/1314 [02:53<31:09,  1.48s/it][A
Processing lab 1OWZDF82:   4%|▍         | 56/1314 [02:53<24:18,  1.16s/it][A
Processing lab 1PA232PA:   4%|▍         | 56/1314 [02:53<24:18,  1.16s/it][A
Processing lab 1PA232PA:   4%|▍         | 57/1314 [02:54<22:59,  1.10s/it][A
Processing lab 1PIGWQFY:   4%|▍         | 57/1314 [02:54<22:59,  1.10s/it][A
Processing lab 1PIGWQFY:   4%|▍         | 58/1314 [02:58<40:11,  1.92s/it][A
Processing lab 1Q1IUY3G:   4%|▍         | 58/1314 [02:58<40:11, 

Processing lab 2VTLZHDS:   8%|▊         | 104/1314 [06:12<34:51,  1.73s/it][A
Processing lab 2VX4F6RC:   8%|▊         | 104/1314 [06:12<34:51,  1.73s/it][A
Processing lab 2VX4F6RC:   8%|▊         | 105/1314 [06:12<30:00,  1.49s/it][A
Processing lab 2XC1478M:   8%|▊         | 105/1314 [06:12<30:00,  1.49s/it][A
Processing lab 2XC1478M:   8%|▊         | 106/1314 [06:16<44:00,  2.19s/it][A
Processing lab 2XX0N87I:   8%|▊         | 106/1314 [06:16<44:00,  2.19s/it][A
Processing lab 2XX0N87I:   8%|▊         | 107/1314 [06:16<31:58,  1.59s/it][A
Processing lab 2Y9L13L4:   8%|▊         | 107/1314 [06:16<31:58,  1.59s/it][A
Processing lab 2Y9L13L4:   8%|▊         | 108/1314 [06:17<26:29,  1.32s/it][A
Processing lab 2YCH1PUI:   8%|▊         | 108/1314 [06:17<26:29,  1.32s/it][A
Processing lab 2YCH1PUI:   8%|▊         | 109/1314 [06:17<19:40,  1.02it/s][A
Processing lab 2YLQA8OZ:   8%|▊         | 109/1314 [06:17<19:40,  1.02it/s][A
Processing lab 2YLQA8OZ:   8%|▊         | 110/1314 [

Processing lab 40MD0YZ3:  12%|█▏        | 155/1314 [10:05<1:14:21,  3.85s/it][A
Processing lab 40ZI3TDN:  12%|█▏        | 155/1314 [10:05<1:14:21,  3.85s/it][A
Processing lab 40ZI3TDN:  12%|█▏        | 156/1314 [10:15<1:46:40,  5.53s/it][A
Processing lab 443NZOSB:  12%|█▏        | 156/1314 [10:15<1:46:40,  5.53s/it][A
Processing lab 443NZOSB:  12%|█▏        | 157/1314 [10:16<1:24:36,  4.39s/it][A
Processing lab 448QVC4C:  12%|█▏        | 157/1314 [10:16<1:24:36,  4.39s/it][A
Processing lab 448QVC4C:  12%|█▏        | 158/1314 [10:18<1:10:45,  3.67s/it][A
Processing lab 44N2CYI9:  12%|█▏        | 158/1314 [10:18<1:10:45,  3.67s/it][A
Processing lab 44N2CYI9:  12%|█▏        | 159/1314 [10:19<52:38,  2.73s/it]  [A
Processing lab 459BZKP3:  12%|█▏        | 159/1314 [10:19<52:38,  2.73s/it][A
Processing lab 459BZKP3:  12%|█▏        | 160/1314 [10:26<1:15:07,  3.91s/it][A
Processing lab 4648UZGD:  12%|█▏        | 160/1314 [10:26<1:15:07,  3.91s/it][A
Processing lab 4648UZGD:  12%|

Processing lab 5AUVXXDU:  16%|█▌        | 206/1314 [12:06<1:03:54,  3.46s/it][A
Processing lab 5BNUT8AW:  16%|█▌        | 206/1314 [12:06<1:03:54,  3.46s/it][A
Processing lab 5BNUT8AW:  16%|█▌        | 207/1314 [12:07<51:53,  2.81s/it]  [A
Processing lab 5BTY65G6:  16%|█▌        | 207/1314 [12:07<51:53,  2.81s/it][A
Processing lab 5BTY65G6:  16%|█▌        | 208/1314 [12:08<40:36,  2.20s/it][A
Processing lab 5CBNCRST:  16%|█▌        | 208/1314 [12:08<40:36,  2.20s/it][A
Processing lab 5CBNCRST:  16%|█▌        | 209/1314 [12:08<29:56,  1.63s/it][A
Processing lab 5FUDT1QA:  16%|█▌        | 209/1314 [12:08<29:56,  1.63s/it][A
Processing lab 5FUDT1QA:  16%|█▌        | 210/1314 [12:10<31:02,  1.69s/it][A
Processing lab 5H71LUBY:  16%|█▌        | 210/1314 [12:10<31:02,  1.69s/it][A
Processing lab 5H71LUBY:  16%|█▌        | 211/1314 [12:11<28:45,  1.56s/it][A
Processing lab 5K2PTY6L:  16%|█▌        | 211/1314 [12:11<28:45,  1.56s/it][A
Processing lab 5K2PTY6L:  16%|█▌        | 212/

Processing lab 6UI9XACW:  20%|█▉        | 257/1314 [14:14<1:26:08,  4.89s/it][A
Processing lab 6UI9XACW:  20%|█▉        | 258/1314 [14:15<1:06:34,  3.78s/it][A
Processing lab 6UXF7L28:  20%|█▉        | 258/1314 [14:15<1:06:34,  3.78s/it][A
Processing lab 6UXF7L28:  20%|█▉        | 259/1314 [14:16<47:23,  2.70s/it]  [A
Processing lab 6WD2LIHN:  20%|█▉        | 259/1314 [14:16<47:23,  2.70s/it][A
Processing lab 6WD2LIHN:  20%|█▉        | 260/1314 [14:16<35:05,  2.00s/it][A
Processing lab 6WT1F4RJ:  20%|█▉        | 260/1314 [14:16<35:05,  2.00s/it][A
Processing lab 6WT1F4RJ:  20%|█▉        | 261/1314 [14:18<32:31,  1.85s/it][A
Processing lab 6XVBD39G:  20%|█▉        | 261/1314 [14:18<32:31,  1.85s/it][A
Processing lab 6XVBD39G:  20%|█▉        | 262/1314 [14:19<31:04,  1.77s/it][A
Processing lab 6YSX60MZ:  20%|█▉        | 262/1314 [14:19<31:04,  1.77s/it][A
Processing lab 6YSX60MZ:  20%|██        | 263/1314 [14:19<22:48,  1.30s/it][A
Processing lab 7039MMH2:  20%|██        | 26

Processing lab 8D4D6M5V:  23%|██▎       | 308/1314 [16:54<50:24,  3.01s/it][A
Processing lab 8D4D6M5V:  24%|██▎       | 309/1314 [16:57<46:33,  2.78s/it][A
Processing lab 8ECLELF1:  24%|██▎       | 309/1314 [16:57<46:33,  2.78s/it][A
Processing lab 8ECLELF1:  24%|██▎       | 310/1314 [16:58<38:09,  2.28s/it][A
Processing lab 8EKC599S:  24%|██▎       | 310/1314 [16:58<38:09,  2.28s/it][A
Processing lab 8EKC599S:  24%|██▎       | 311/1314 [16:58<29:32,  1.77s/it][A
Processing lab 8F0XPAZX:  24%|██▎       | 311/1314 [16:58<29:32,  1.77s/it][A
Processing lab 8F0XPAZX:  24%|██▎       | 312/1314 [16:59<21:28,  1.29s/it][A
Processing lab 8FT6HD4D:  24%|██▎       | 312/1314 [16:59<21:28,  1.29s/it][A
Processing lab 8FT6HD4D:  24%|██▍       | 313/1314 [16:59<16:26,  1.01it/s][A
Processing lab 8FZMCIFG:  24%|██▍       | 313/1314 [16:59<16:26,  1.01it/s][A
Processing lab 8FZMCIFG:  24%|██▍       | 314/1314 [17:01<22:51,  1.37s/it][A
Processing lab 8G29TDOS:  24%|██▍       | 314/1314 [

Processing lab 9LSH625Y:  27%|██▋       | 359/1314 [19:25<29:55,  1.88s/it][A
Processing lab 9LSH625Y:  27%|██▋       | 360/1314 [19:26<24:10,  1.52s/it][A
Processing lab 9MC0DPDJ:  27%|██▋       | 360/1314 [19:26<24:10,  1.52s/it][A
Processing lab 9MC0DPDJ:  27%|██▋       | 361/1314 [19:26<18:11,  1.15s/it][A
Processing lab 9MC1YKKZ:  27%|██▋       | 361/1314 [19:26<18:11,  1.15s/it][A
Processing lab 9MC1YKKZ:  28%|██▊       | 362/1314 [19:27<18:40,  1.18s/it][A
Processing lab 9MEFUZQN:  28%|██▊       | 362/1314 [19:27<18:40,  1.18s/it][A
Processing lab 9MEFUZQN:  28%|██▊       | 363/1314 [19:28<18:09,  1.15s/it][A
Processing lab 9MG50RM7:  28%|██▊       | 363/1314 [19:28<18:09,  1.15s/it][A
Processing lab 9MG50RM7:  28%|██▊       | 364/1314 [19:29<14:50,  1.07it/s][A
Processing lab 9MZBKXJF:  28%|██▊       | 364/1314 [19:29<14:50,  1.07it/s][A
Processing lab 9MZBKXJF:  28%|██▊       | 365/1314 [19:30<17:34,  1.11s/it][A
Processing lab 9PWYZMNS:  28%|██▊       | 365/1314 [

Processing lab AOFJN8HX:  31%|███▏      | 411/1314 [22:45<39:52,  2.65s/it][A
Processing lab AOFPYGHC:  31%|███▏      | 411/1314 [22:45<39:52,  2.65s/it][A
Processing lab AOFPYGHC:  31%|███▏      | 412/1314 [22:45<28:34,  1.90s/it][A
Processing lab AOKRU4AF:  31%|███▏      | 412/1314 [22:45<28:34,  1.90s/it][A
Processing lab AOKRU4AF:  31%|███▏      | 413/1314 [22:48<33:20,  2.22s/it][A
Processing lab AOQQU910:  31%|███▏      | 413/1314 [22:48<33:20,  2.22s/it][A
Processing lab AOQQU910:  32%|███▏      | 414/1314 [22:48<25:52,  1.73s/it][A
Processing lab AR433PVR:  32%|███▏      | 414/1314 [22:48<25:52,  1.73s/it][A
Processing lab AR433PVR:  32%|███▏      | 415/1314 [22:57<57:44,  3.85s/it][A
Processing lab AS30HPUK:  32%|███▏      | 415/1314 [22:57<57:44,  3.85s/it][A
Processing lab AS30HPUK:  32%|███▏      | 416/1314 [23:00<53:30,  3.57s/it][A
Processing lab AUCMR8HU:  32%|███▏      | 416/1314 [23:00<53:30,  3.57s/it][A
Processing lab AUCMR8HU:  32%|███▏      | 417/1314 [

Processing lab C4W63WJ2:  35%|███▌      | 462/1314 [24:44<35:32,  2.50s/it][A
Processing lab C4W63WJ2:  35%|███▌      | 463/1314 [24:47<34:43,  2.45s/it][A
Processing lab CA0MBQ9S:  35%|███▌      | 463/1314 [24:47<34:43,  2.45s/it][A
Processing lab CA0MBQ9S:  35%|███▌      | 464/1314 [24:52<49:23,  3.49s/it][A
Processing lab CAO2H0WE:  35%|███▌      | 464/1314 [24:52<49:23,  3.49s/it][A
Processing lab CAO2H0WE:  35%|███▌      | 465/1314 [24:53<36:41,  2.59s/it][A
Processing lab CAQEITX6:  35%|███▌      | 465/1314 [24:53<36:41,  2.59s/it][A
Processing lab CAQEITX6:  35%|███▌      | 466/1314 [24:53<27:30,  1.95s/it][A
Processing lab CB714TAM:  35%|███▌      | 466/1314 [24:53<27:30,  1.95s/it][A
Processing lab CB714TAM:  36%|███▌      | 467/1314 [24:54<19:53,  1.41s/it][A
Processing lab CBCQST29:  36%|███▌      | 467/1314 [24:54<19:53,  1.41s/it][A
Processing lab CBCQST29:  36%|███▌      | 468/1314 [24:55<18:21,  1.30s/it][A
Processing lab CBFKYZ9S:  36%|███▌      | 468/1314 [

Processing lab DLSU0QRX:  39%|███▉      | 514/1314 [25:39<13:08,  1.01it/s][A
Processing lab DN01XVIU:  39%|███▉      | 514/1314 [25:39<13:08,  1.01it/s][A
Processing lab DQGG01WF:  39%|███▉      | 514/1314 [25:40<13:08,  1.01it/s][A
Processing lab DQGG01WF:  39%|███▉      | 516/1314 [25:40<11:12,  1.19it/s][A
Processing lab DRFCUPZO:  39%|███▉      | 516/1314 [25:40<11:12,  1.19it/s][A
Processing lab DRFCUPZO:  39%|███▉      | 517/1314 [25:41<10:32,  1.26it/s][A
Processing lab DSE2G8LF:  39%|███▉      | 517/1314 [25:41<10:32,  1.26it/s][A
Processing lab DSE2G8LF:  39%|███▉      | 518/1314 [25:42<09:36,  1.38it/s][A
Processing lab DY0KIZZ9:  39%|███▉      | 518/1314 [25:42<09:36,  1.38it/s][A
Processing lab DY0KIZZ9:  39%|███▉      | 519/1314 [25:43<11:01,  1.20it/s][A
Processing lab DZ2XFGQS:  39%|███▉      | 519/1314 [25:43<11:01,  1.20it/s][A
Processing lab DZ2XFGQS:  40%|███▉      | 520/1314 [25:44<11:14,  1.18it/s][A
Processing lab E3CE5WE9:  40%|███▉      | 520/1314 [

Processing lab F50DBVIK:  43%|████▎     | 566/1314 [27:07<21:01,  1.69s/it][A
Processing lab F8I0DT7Z:  43%|████▎     | 566/1314 [27:07<21:01,  1.69s/it][A
Processing lab F8I0DT7Z:  43%|████▎     | 567/1314 [27:10<26:11,  2.10s/it][A
Processing lab F8LNIZ27:  43%|████▎     | 567/1314 [27:10<26:11,  2.10s/it][A
Processing lab F8LNIZ27:  43%|████▎     | 568/1314 [27:11<22:32,  1.81s/it][A
Processing lab FCI1HZ3G:  43%|████▎     | 568/1314 [27:11<22:32,  1.81s/it][A
Processing lab FCI1HZ3G:  43%|████▎     | 569/1314 [27:13<22:56,  1.85s/it][A
Processing lab FEBWERSN:  43%|████▎     | 569/1314 [27:13<22:56,  1.85s/it][A
Processing lab FEBWERSN:  43%|████▎     | 570/1314 [27:15<23:53,  1.93s/it][A
Processing lab FH8TEJI1:  43%|████▎     | 570/1314 [27:15<23:53,  1.93s/it][A
Processing lab FH8TEJI1:  43%|████▎     | 571/1314 [27:16<19:02,  1.54s/it][A
Processing lab FHR8UUYO:  43%|████▎     | 571/1314 [27:16<19:02,  1.54s/it][A
Processing lab FHR8UUYO:  44%|████▎     | 572/1314 [

Processing lab GUWYJRRS:  47%|████▋     | 617/1314 [31:27<3:31:13, 18.18s/it][A
Processing lab GUWYJRRS:  47%|████▋     | 618/1314 [31:29<2:35:17, 13.39s/it][A
Processing lab GWJ0A1IK:  47%|████▋     | 618/1314 [31:29<2:35:17, 13.39s/it][A
Processing lab GWJ0A1IK:  47%|████▋     | 619/1314 [31:30<1:50:49,  9.57s/it][A
Processing lab GWP6E8FA:  47%|████▋     | 619/1314 [31:30<1:50:49,  9.57s/it][A
Processing lab GWP6E8FA:  47%|████▋     | 620/1314 [31:30<1:19:51,  6.90s/it][A
Processing lab GYCOAVYS:  47%|████▋     | 620/1314 [31:30<1:19:51,  6.90s/it][A
Processing lab GYCOAVYS:  47%|████▋     | 621/1314 [31:31<59:08,  5.12s/it]  [A
Processing lab GYCY8LCF:  47%|████▋     | 621/1314 [31:31<59:08,  5.12s/it][A
Processing lab GYCY8LCF:  47%|████▋     | 622/1314 [31:32<43:01,  3.73s/it][A
Processing lab GZMPRX5J:  47%|████▋     | 622/1314 [31:32<43:01,  3.73s/it][A
Processing lab GZMPRX5J:  47%|████▋     | 623/1314 [31:36<46:35,  4.05s/it][A
Processing lab H0WSDLJE:  47%|████▋ 

In [None]:
%%time
# Peek at the (index, row) tuples produced by iterrows(). Restricted to the
# first rows via head(): printing every row of train_features would flood
# the notebook output.
for i in train_features.head().iterrows():
    print(i)

In [None]:
# Display the feature frame.
train_features

In [None]:
# Raw DNA strings only.
train_features["sequence"]