# Genetic attribution preprocessing

In this notebook we overview the [Genetic Engineering Attrinution competition in Driven Data](https://www.drivendata.org/competitions/63/genetic-engineering-attribution/).   

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict


CPU times: user 240 ms, sys: 55 ms, total: 295 ms
Wall time: 295 ms


In [2]:
!which conda

/home/rio/anaconda3/envs/genetic_attribution/bin/conda


In [3]:
!which pip

/home/rio/anaconda3/envs/genetic_attribution/bin/pip


## Generating tree structure and fetching data

In [4]:
def generate_tree(generate_dirs,erase=False):
    pbar = tqdm(generate_dirs)
    for d in pbar:
        if not os.path.isdir(d):
            pbar.set_description(f"Generating directory {d}")
            os.mkdir(d)
        elif erase:
            pbar.set_description(f"Erasing directory {d}")
            pbar.set_description(f"Generating directory {d}")
            os.mkdir(d)
        else:
            pbar.set_description(f"Directory {d} already exists. Skipping.")



In [5]:
%%time
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

### directory to put downloaded files
original_data_dir = os.path.join(parent_dir,"original_data") 

### directory to put string patterns
string_patterns_dir = os.path.join(parent_dir, "string_patterns")

### directories to put train, val and test data sets
train_dir = os.path.join(parent_dir,"train")
val_dir = os.path.join(parent_dir,"val")
test_dir = os.path.join(parent_dir,"test")

generate_dirs = [parent_dir, original_data_dir, string_patterns_dir, train_dir, val_dir, test_dir]
            
### generating tree
erase=False
generate_tree(generate_dirs,erase)

###checking
os.chdir(parent_dir)
current_dir = os.getcwd()
print("\n")
print(f"Current directory {current_dir}")
print(os.listdir(current_dir))

Directory /home/rio/data_sets/genetic_engineering_attribution/test already exists. Skipping.: 100%|██████████| 6/6 [00:00<00:00, 49.99it/s] 



Current directory /home/rio/data_sets/genetic_engineering_attribution
['val', 'string_patterns', 'test', 'train', 'original_data']
CPU times: user 25.7 ms, sys: 5.22 ms, total: 31 ms
Wall time: 132 ms





#### Fetching data

In [11]:
def fetch_data(url_dict,overwrite=False):
    pbar = tqdm(url_dict.items(),total=len(url_dict))
    for data,locs in pbar:
        #print("value: ", value)
        url, path = locs
        if os.path.isfile(path):
            if not overwrite:
                pbar.set_description(f"File {path} already exists. Skipping.")
                continue
            else:
                pbar.set_description(f"File {path} already exists. Overwriting.")
        else:
            pbar.set_description(f"Writing {path}.")
        response = requests.get(url)   
        with open(path, 'w') as f:
            writer = csv.writer(f)
            for line in response.iter_lines():
                writer.writerow(line.decode('utf-8').split(','))

In [12]:
%%time
####### training data
# training features
train_values_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T191351Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=f3b5911cbc8795264f6b8ffde9aee6f3b681f29bdbdf14c773120bf0192f86e7"
train_values_path = os.path.join(original_data_dir,"train_values.csv")
# training targets
train_labels_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_labels.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T191351Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=52fd0fd62b8627f6e530bc208058d322da578aa2b7b8f4f286c232ce00c6e34c" 
train_labels_path = os.path.join(original_data_dir,"train_labels.csv")
####### test data
# test features
test_values_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/test_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T191351Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=d2b1adab04148d904aa40674c6c2605a81e9e20e640e7989e89d8aafd0ac776a"
test_values_path = os.path.join(original_data_dir,"test_values.csv")
####### url_dict
url_dict = {"train_values": (train_values_url, train_values_path), "train_labels": (train_labels_url, train_labels_path), "test_values": (test_values_url, test_values_path)} 
####### fetching_data
overwrite = False
fetch_data(url_dict,overwrite)
######## Printing contents of directories
print("Contents of directories: \n")
print("original_data_dir: ", os.listdir(original_data_dir))
print("\n")
print("train_dir: ", os.listdir(train_dir))
print("\n")
print("val_dir: ", os.listdir(val_dir))
print("\n")
print("test_dir: ", os.listdir(test_dir))
print("\n")
#print("seqs_test_dir: ", os.listdir(seqs_test_dir))

Writing /home/rio/data_sets/genetic_engineering_attribution/original_data/test_values.csv.: 100%|██████████| 3/3 [14:05<00:00, 281.89s/it] 

Contents of directories: 

original_data_dir:  ['train_values.csv', 'test_values.csv', 'train_labels.csv']


train_dir:  []


val_dir:  []


test_dir:  []


CPU times: user 29.8 s, sys: 11.9 s, total: 41.7 s
Wall time: 14min 5s





#### Loading train_values

In [14]:
%%time
train_values = pd.read_csv(train_values_path)
print("shape of train_values: ", train_values.shape)
train_values.head(5)

shape of train_values:  (63017, 41)
CPU times: user 1.64 s, sys: 65.9 ms, total: 1.7 s
Wall time: 1.7 s


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Loading train_labels

In [15]:
%%time
train_labels = pd.read_csv(train_labels_path)
print("shape of train_labels: ", train_labels.shape)
train_labels.head(5)

shape of train_labels:  (63017, 1315)
CPU times: user 5.1 s, sys: 115 ms, total: 5.22 s
Wall time: 5.21 s


Unnamed: 0,sequence_id,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
0,9ZIMC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Loading test_values

In [16]:
%%time
test_values = pd.read_csv(test_values_path)
print("shape of test_labels: ", test_values.shape)
test_values.head(5)

shape of test_labels:  (18816, 41)
CPU times: user 485 ms, sys: 0 ns, total: 485 ms
Wall time: 484 ms


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,E0VFT,AGATCTATACATTGAATCAATATTGGCAATTAGCCATATTAGTCAT...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,TTRK5,GCGCGCGTTGACATTGATTATTGACTAGTTATTAATAGTAATCAAT...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2Z7FZ,GCTTAAGCGGTCGACGGATCGGGAGATCTCCCGATCCCCTATGGTG...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,VJI6E,ATGATGATGATGTCCCTGAACAGCAAGCAGGCGTTTAGCATGCCGC...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,721FI,GGTACCGAGCTCTTACGCGTGCTAGCCATACTATCAGCCACTTGTG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Generating training, val and test sets

#### Adding columns with lab_id and sequence lengths to train_values

In [17]:
def add_columns(df_values, df_labels=None):
    lab_ids = []
    seq_lengths = []
    pbar = tqdm(df_values.index)
    df_values = df_values.copy()
    if df_labels is None:
        lab_id = None
    for ix in pbar:
        if df_labels is not None:
            lab_id = df_labels.columns[df_labels.loc[ix,:]==1][0]
        seq_id = df_values.loc[ix,"sequence_id"]
        seq_len = len(df_values.loc[ix,"sequence"])
        pbar.set_description(f"Sequence: {seq_id} Lab: {lab_id}")
        lab_ids.append(lab_id)
        seq_lengths.append(seq_len)
    cols = list(df_values.columns)[df_values.columns.get_loc("bacterial_resistance_ampicillin"):]
    df_values["lab_id"] = lab_ids
    df_values["seq_length"] = seq_lengths
    cols = ["sequence_id", "lab_id", "sequence", "seq_length"] + cols
    df_values = df_values.loc[:,cols]
    return df_values
        

#### Generating training and test data sets

In [18]:
%%time
df_train_val = add_columns(train_values,train_labels)
df_test = add_columns(test_values,None)


Sequence: U5MR3 Lab: 3EZXYI3U: 100%|██████████| 63017/63017 [01:25<00:00, 736.29it/s]
Sequence: IAZLY Lab: None:   7%|▋         | 1302/18816 [00:00<00:07, 2500.79it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Sequence: HRMSG Lab: None:  22%|██▏       | 4071/18816 [00:01<00:05, 2911.42it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Sequence: LASAX Lab: None:  41%|████      | 7726/18816 [00:02<00:03, 3113.44it/s]IOPub message rate ex

CPU times: user 1min 36s, sys: 5.77 s, total: 1min 42s
Wall time: 1min 32s


In [19]:
print("Shape of df_train_val: ", df_train_val.shape)
df_train_val.head()

Shape of df_train_val:  (63017, 43)


Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,RYUA3GVO,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,7151,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,RYUA3GVO,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,456,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,RYUA3GVO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1450,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,RYUA3GVO,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,914,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,RYUA3GVO,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1350,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
print("Shape of df_test: ", df_test.shape)
df_test.head()

Shape of df_test:  (18816, 43)


Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,E0VFT,,AGATCTATACATTGAATCAATATTGGCAATTAGCCATATTAGTCAT...,9379,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,TTRK5,,GCGCGCGTTGACATTGATTATTGACTAGTTATTAATAGTAATCAAT...,6673,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2Z7FZ,,GCTTAAGCGGTCGACGGATCGGGAGATCTCCCGATCCCCTATGGTG...,9044,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,VJI6E,,ATGATGATGATGTCCCTGAACAGCAAGCAGGCGTTTAGCATGCCGC...,1230,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,721FI,,GGTACCGAGCTCTTACGCGTGCTAGCCATACTATCAGCCACTTGTG...,6378,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Generating validation set

Note that the number of samples per lab_id is highly unbalanced. 

In [21]:
df_train_val['lab_id'].value_counts()

I7FXTVDP    8286
RKJHZGDQ    2732
GTVTUGVY    2672
A18S09P2    1064
Q2K8NHZY     973
            ... 
58BSUZQB       3
G2P73NZ0       3
WB78G3XF       2
ON9AXMKF       1
0L3Y6ZB2       1
Name: lab_id, Length: 1314, dtype: int64

#### Making val data

In [22]:
def make_val(df_train_val,train_size=0.8):
    shuffle_ix = np.random.permutation(df_train_val.index)
    split_point = int(len(shuffle_ix)*train_size)
    train_ix = shuffle_ix[:split_point]
    val_ix = shuffle_ix[split_point:]
    df_train = df_train_val.loc[train_ix,:]
    df_val = df_train_val.loc[val_ix,:]
    return df_train, df_val
    

In [24]:
%%time
np.random.seed(38748)
train_size = 0.8
df_train, df_val = make_val(df_train_val,train_size)
print(f"Shape of training set: {df_train.shape} % training set: {len(df_train)/len(df_train_val)} ")
print(f"Shape of validation set: {df_val.shape} % training set: {len(df_val)/len(df_train_val)} ")

Shape of training set: (50413, 43) % training set: 0.7999904787596998 
Shape of validation set: (12604, 43) % training set: 0.20000952124030025 
CPU times: user 25.3 ms, sys: 0 ns, total: 25.3 ms
Wall time: 24.7 ms


In [25]:
df_train.head()

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
4350,SMVSS,I7FXTVDP,GACGGATCGGGAGATCTCCCGATCCCCTATGGTGCACTCTCAGTAC...,7723,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10470,6MC5M,I7FXTVDP,GCTAGAGCCGTGAACGACAGGGCGAACGCCAGCCCGCCGACGGCGA...,10550,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
30997,BHOUO,P3Q11IAK,GGCTTTGTTAGCAGCCGGATCCTTATCAGTCTGCGGCAGGATTGGC...,1085,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29915,EI1J7,I7FXTVDP,GACGGATCGGGAGATCTCCCGATCCCCTATGGTGCACTCTCAGTAC...,9216,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19843,NVS4Z,AMV4U0A0,TATACGACTCACTATAGGGCGAATTGGGCCCTCTAGATGCATGCTC...,975,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Saving data sets

In [26]:
%%time
train_path = os.path.join(train_dir,"train.csv")
val_path = os.path.join(val_dir,"val.csv")
test_path = os.path.join(test_dir,"test.csv")
print(f"Saving train_df in {train_path}")
df_train.to_csv(train_path)
print(f"Saving train_df in {val_path}")
df_val.to_csv(val_path)
print(f"Saving train_df in {test_path}")
df_test.to_csv(test_path)

Saving train_df in /home/rio/data_sets/genetic_engineering_attribution/train/train.csv
Saving train_df in /home/rio/data_sets/genetic_engineering_attribution/val/val.csv
Saving train_df in /home/rio/data_sets/genetic_engineering_attribution/test/test.csv
CPU times: user 5.9 s, sys: 109 ms, total: 6.01 s
Wall time: 6.02 s


## The code below is DRERECATED

## Generating small samples of datasets

In [9]:
np.random.seed(98346)
n_samples = 1000
sample_train_ixs = np.random.permutation(len(train_values))[:n_samples]
sample_test_ixs = np.random.permutation(len(test_values))[:n_samples]
sample_train_values_path = os.path.join(sample_train_dir,"sample_train_values.csv")
sample_train_labels_path = os.path.join(sample_train_dir,"sample_train_labels.csv")
sample_test_values_path = os.path.join(sample_test_dir,"sample_test_values.csv")
if SMALL_SAMPLE:
    if os.path.isfile(sample_train_values_path):
        train_values = pd.read_csv(sample_train_values_path, index_col=0)
        train_labels = pd.read_csv(sample_train_labels_path, index_col=0)
        test_values = pd.read_csv(sample_test_values_path, index_col=0)
    else:
        train_values = train_values.iloc[sample_train_ixs,:]
        train_labels = train_labels.iloc[sample_train_ixs,:]
        test_values = test_values.iloc[sample_test_ixs,:]
        ### saving to csv
        train_values.to_csv(sample_train_values_path)
        train_labels.to_csv(sample_train_labels_path)
        test_values.to_csv(sample_test_values_path)

In [10]:
print("shape of train_values: ", train_values.shape)
#print(train_values.head(10))
display(train_values.head(5))

shape of train_values:  (63017, 41)


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
print("shape of train_labels: ", train_labels.shape)
#print(train_values.head(10))
display(train_labels.head(5))

shape of train_labels:  (63017, 1315)


Unnamed: 0,sequence_id,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
0,9ZIMC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
print("shape of test_values: ", test_values.shape)
#print(train_values.head(10))
display(test_values.head(5))

shape of test_values:  (18816, 41)


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,E0VFT,AGATCTATACATTGAATCAATATTGGCAATTAGCCATATTAGTCAT...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,TTRK5,GCGCGCGTTGACATTGATTATTGACTAGTTATTAATAGTAATCAAT...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2Z7FZ,GCTTAAGCGGTCGACGGATCGGGAGATCTCCCGATCCCCTATGGTG...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,VJI6E,ATGATGATGATGTCCCTGAACAGCAAGCAGGCGTTTAGCATGCCGC...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,721FI,GGTACCGAGCTCTTACGCGTGCTAGCCATACTATCAGCCACTTGTG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Preprocessing

#### One-hot encoding DNA sequences

In [13]:
def dna2dict(dna_seq):
    """
    DNA bases are ordered in tuples alphabetically as (A,C,G,N,T)
    """
    dna_dict = OrderedDict()
    dna_seq = dna_seq.upper()
    for ix,b in enumerate(dna_seq):
        dna_dict[ix] = (int(b=="A"),int(b=="C"),int(b=="G"),int(b=="N"),int(b=="T"))
    return dna_dict
        

In [14]:
dna_dict = dna2dict(train_values.sequence.values[1])
dna_dict

OrderedDict([(0, (0, 0, 1, 0, 0)),
             (1, (0, 1, 0, 0, 0)),
             (2, (0, 0, 0, 0, 1)),
             (3, (0, 0, 1, 0, 0)),
             (4, (0, 0, 1, 0, 0)),
             (5, (1, 0, 0, 0, 0)),
             (6, (0, 0, 0, 0, 1)),
             (7, (0, 0, 1, 0, 0)),
             (8, (0, 0, 1, 0, 0)),
             (9, (0, 0, 0, 0, 1)),
             (10, (0, 0, 0, 0, 1)),
             (11, (0, 0, 0, 0, 1)),
             (12, (0, 0, 1, 0, 0)),
             (13, (0, 0, 1, 0, 0)),
             (14, (0, 0, 1, 0, 0)),
             (15, (1, 0, 0, 0, 0)),
             (16, (0, 1, 0, 0, 0)),
             (17, (1, 0, 0, 0, 0)),
             (18, (0, 0, 0, 0, 1)),
             (19, (0, 0, 1, 0, 0)),
             (20, (0, 0, 0, 0, 1)),
             (21, (0, 0, 1, 0, 0)),
             (22, (0, 1, 0, 0, 0)),
             (23, (1, 0, 0, 0, 0)),
             (24, (0, 0, 1, 0, 0)),
             (25, (0, 1, 0, 0, 0)),
             (26, (0, 1, 0, 0, 0)),
             (27, (0, 1, 0, 0, 0)),
  

In [15]:
len(dna_dict)

456

In [16]:
list(dna_dict.values())

[(0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 0, 0, 1),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0,

#### One-hot encoded DNA sequences as dataframes

In [17]:
def dna2df(dna_seq,seq_id=None, lab_id = None):
    dna_dict = dna2dict(dna_seq)
    dna_df = pd.DataFrame(dna_dict.values(),columns=("A","C","G","N","T"))
    if seq_id:
        columns = ["sequence_id"] + list(dna_df.columns)
        dna_df["sequence_id"] = [seq_id]*len(dna_df)
        dna_df = dna_df.loc[:,columns]
    if lab_id:
        columns = ["lab_id"] + list(dna_df.columns)
        dna_df["lab_id"] = [lab_id]*len(dna_df)
        dna_df = dna_df.loc[:,columns]
    return dna_df


In [18]:
dna_seq = train_values.sequence.values[0]
seq_id = train_values.sequence_id.values[0]
lab_id = train_labels.columns[train_labels.iloc[0,:]==1][0]
dna_df = dna2df(dna_seq=dna_seq,seq_id=seq_id,lab_id=lab_id)
dna_df

Unnamed: 0,lab_id,sequence_id,A,C,G,N,T
0,RYUA3GVO,9ZIMC,0,1,0,0,0
1,RYUA3GVO,9ZIMC,1,0,0,0,0
2,RYUA3GVO,9ZIMC,0,0,0,0,1
3,RYUA3GVO,9ZIMC,0,0,1,0,0
4,RYUA3GVO,9ZIMC,0,1,0,0,0
...,...,...,...,...,...,...,...
7146,RYUA3GVO,9ZIMC,1,0,0,0,0
7147,RYUA3GVO,9ZIMC,0,1,0,0,0
7148,RYUA3GVO,9ZIMC,0,1,0,0,0
7149,RYUA3GVO,9ZIMC,0,0,1,0,0


In [19]:
lab_id

'RYUA3GVO'

## Generating separate csvs for each DNA sequence

The .csvs are grouped by lab in different dirs.

In [22]:
def get_lab_id(ix,labels):
    unique_labs = labels.columns[1:] 
    lab_id = unique_labs[labels.iloc[ix,1:]==1]
    return lab_id

In [23]:
%%time
#labs_train_dir = os.path.join(processed_train_dir,"labs_train_dir")
def split_labs(df_values,df_labels=None,savedir=None,usecols=None):
    #pbar = tqdm(train_values.iterrows(),total=len(train_values))
    if usecols is None:
        usecols = list(df_values.columns[df_values.columns.get_loc("sequence")+1:])
    ixs = df_values.index
    lab_id = None
    pbar = tqdm(ixs)
    #print("len(ixs): ", len(ixs))
    for ix in pbar:
        row = df_values.loc[ix,:]
        dna_seq = row.sequence
        seq_id = row.sequence_id
        #print("row: ", row)
        if df_labels is not None:
            lab_id = df_labels.columns[df_labels.loc[ix,:]==1][0]
            #print("lab_id: ", lab_id)
        pbar.set_description(f"Processing sequence_id: {seq_id} lab_id: {lab_id}")
        dna_df = dna2df(dna_seq,seq_id,lab_id)
        #pbar.set_description(f"Processing sample {seq_id}")
        #print("dna_df: ", dna_df)
        if usecols:
            repeat = row.loc[usecols].values.reshape(1,-1)
            repeat = np.repeat( repeat,len(dna_df), axis=0)
            #print("shape: ", np.repeat(sample.loc[usecols].values,len(dna_df),axis=1).shape)
            #print(np.repeat( sample.loc[usecols].values,len(dna_df)  ))
            extra_cols_df = pd.DataFrame( repeat ,columns=usecols)   
            dna_df = pd.concat([dna_df,extra_cols_df],axis=1)
            #df_lab_concat.append(dna_df)
            #df_lab = pd.concat(df_lab_concat,axis=0,ignore_index=True)
        if savedir is not None:
            if df_labels is not None:
                labdir =os.path.join(savedir,lab_id)
                if not os.path.isdir(labdir):
                    os.mkdir(labdir)
                savepath = os.path.join(labdir,seq_id+".csv")
            else:
                savepath = os.path.join(savedir,seq_id+".csv")
            #print("savepath: ", savepath)
            dna_df.to_csv(savepath,index=True)
        
        
        
        
  
    

CPU times: user 10 µs, sys: 2 µs, total: 12 µs
Wall time: 16.9 µs


#### Saving a .csv for each lab (TRAINING)

In [24]:
%%time
split_labs(train_values,train_labels,labs_train_dir,usecols=None)
#split_labs(train_values,train_labels,labs_train_dir,usecols=None)

Processing sequence_id: U5MR3 lab_id: 3EZXYI3U: 100%|██████████| 63017/63017 [1:14:16<00:00, 14.14it/s]


CPU times: user 1h 12min 54s, sys: 1min 17s, total: 1h 14min 12s
Wall time: 1h 14min 16s


#### Saving a .csv for each seq (TEST)

In [25]:
%%time
split_labs(test_values,None,seqs_test_dir,usecols=None)

Processing sequence_id: BD9BA lab_id: None: 100%|██████████| 18816/18816 [22:50<00:00, 13.73it/s]


CPU times: user 22min 32s, sys: 24 s, total: 22min 56s
Wall time: 22min 50s
