# Genetic attribution preprocessing

In this notebook we overview the [Genetic Engineering Attrinution competition in Driven Data](https://www.drivendata.org/competitions/63/genetic-engineering-attribution/).   

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict


CPU times: user 277 ms, sys: 24.1 ms, total: 301 ms
Wall time: 299 ms


In [2]:
!which conda

/home/rio/anaconda3/envs/genetic_attribution/bin/conda


In [3]:
!which pip

/home/rio/anaconda3/envs/genetic_attribution/bin/pip


## Generating tree structure

In [4]:
def generate_tree(generate_dirs,erase=False):
    pbar = tqdm(generate_dirs)
    for d in pbar:
        if not os.path.isdir(d):
            pbar.set_description(f"Generating directory {d}")
            os.mkdir(d)
        elif erase:
            pbar.set_description(f"Erasing directory {d}")
            pbar.set_description(f"Generating directory {d}")
            os.mkdir(d)
        else:
            pbar.set_description(f"Directory {d} already exists. Skipping.")



In [20]:
%%time
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

### directory to put downloaded files
original_data_dir = os.path.join(parent_dir,"original_data") 

### directory to put string patterns
string_patterns_dir = os.path.join(parent_dir, "string_patterns")

### directories to put train, val and test data sets
full_train_dir = os.path.join(parent_dir,"full_train")
train_dir = os.path.join(parent_dir,"train")
val_dir = os.path.join(parent_dir,"val")
test_dir = os.path.join(parent_dir,"test")

### directory with fitted pca objects
pca_dir = os.path.join(parent_dir,"pca")

### directory with pca engineered data sets
pca_engineered_data_dir = os.path.join(parent_dir,"pca_engineered_datasets")

generate_dirs = [parent_dir, original_data_dir, string_patterns_dir, pca_dir, full_train_dir, 
                 train_dir, val_dir, test_dir, pca_engineered_data_dir]
            
### generating tree
erase=False
generate_tree(generate_dirs,erase)

###checking
os.chdir(parent_dir)
current_dir = os.getcwd()
print("\n")
print(f"Current directory {current_dir}")
print(os.listdir(current_dir))

Directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets already exists. Skipping.: 100%|██████████| 9/9 [00:00<00:00, 2420.26it/s]



Current directory /home/rio/data_sets/genetic_engineering_attribution
['val', 'string_patterns', 'pca_engineered_datasets', 'pca', 'test', 'train', 'full_train', 'original_data']
CPU times: user 2.86 ms, sys: 4.12 ms, total: 6.98 ms
Wall time: 6.24 ms





## Fetching data

In [6]:
def fetch_data(url_dict,overwrite=False):
    pbar = tqdm(url_dict.items(),total=len(url_dict))
    for data,locs in pbar:
        #print("value: ", value)
        url, path = locs
        if os.path.isfile(path):
            if not overwrite:
                pbar.set_description(f"File {path} already exists. Skipping.")
                continue
            else:
                pbar.set_description(f"File {path} already exists. Overwriting.")
        else:
            pbar.set_description(f"Writing {path}.")
        response = requests.get(url)   
        with open(path, 'w') as f:
            writer = csv.writer(f)
            for line in response.iter_lines():
                writer.writerow(line.decode('utf-8').split(','))

In [7]:
%%time
####### training data
# training features
train_values_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T191351Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=f3b5911cbc8795264f6b8ffde9aee6f3b681f29bdbdf14c773120bf0192f86e7"
train_values_path = os.path.join(original_data_dir,"train_values.csv")
# training targets
train_labels_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_labels.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T191351Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=52fd0fd62b8627f6e530bc208058d322da578aa2b7b8f4f286c232ce00c6e34c" 
train_labels_path = os.path.join(original_data_dir,"train_labels.csv")
####### test data
# test features
test_values_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/test_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200921%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200921T191351Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=d2b1adab04148d904aa40674c6c2605a81e9e20e640e7989e89d8aafd0ac776a"
test_values_path = os.path.join(original_data_dir,"test_values.csv")
####### url_dict
url_dict = {"train_values": (train_values_url, train_values_path), "train_labels": (train_labels_url, train_labels_path), "test_values": (test_values_url, test_values_path)} 
####### fetching_data
overwrite = False
fetch_data(url_dict,overwrite)
######## Printing contents of directories
print("Contents of directories: \n")
print("original_data_dir: ", os.listdir(original_data_dir))
print("\n")
print("train_dir: ", os.listdir(train_dir))
print("\n")
print("val_dir: ", os.listdir(val_dir))
print("\n")
print("test_dir: ", os.listdir(test_dir))
print("\n")
#print("seqs_test_dir: ", os.listdir(seqs_test_dir))

File /home/rio/data_sets/genetic_engineering_attribution/original_data/test_values.csv already exists. Skipping.: 100%|██████████| 3/3 [00:00<00:00, 1662.87it/s]

Contents of directories: 

original_data_dir:  ['submission_format_3TFRxH6.csv', 'train_values.csv', 'test_values.csv', 'train_labels.csv']


train_dir:  ['train.csv']


val_dir:  ['val.csv']


test_dir:  ['test.csv']


CPU times: user 6.08 ms, sys: 191 µs, total: 6.27 ms
Wall time: 4.59 ms





#### Loading train_values

In [8]:
%%time
train_values = pd.read_csv(train_values_path)
print("shape of train_values: ", train_values.shape)
train_values.head(5)

shape of train_values:  (63017, 41)
CPU times: user 1.87 s, sys: 173 ms, total: 2.04 s
Wall time: 2.9 s


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Loading train_labels

In [9]:
%%time
train_labels = pd.read_csv(train_labels_path)
print("shape of train_labels: ", train_labels.shape)
train_labels.head(5)

shape of train_labels:  (63017, 1315)
CPU times: user 5.27 s, sys: 377 ms, total: 5.64 s
Wall time: 6.23 s


Unnamed: 0,sequence_id,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,...,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
0,9ZIMC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Loading test_values

In [10]:
%%time
test_values = pd.read_csv(test_values_path)
print("shape of test_labels: ", test_values.shape)
test_values.head(5)

shape of test_labels:  (18816, 41)
CPU times: user 539 ms, sys: 35.2 ms, total: 574 ms
Wall time: 907 ms


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,E0VFT,AGATCTATACATTGAATCAATATTGGCAATTAGCCATATTAGTCAT...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,TTRK5,GCGCGCGTTGACATTGATTATTGACTAGTTATTAATAGTAATCAAT...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2Z7FZ,GCTTAAGCGGTCGACGGATCGGGAGATCTCCCGATCCCCTATGGTG...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,VJI6E,ATGATGATGATGTCCCTGAACAGCAAGCAGGCGTTTAGCATGCCGC...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,721FI,GGTACCGAGCTCTTACGCGTGCTAGCCATACTATCAGCCACTTGTG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Generating training, val and test sets

#### Adding columns with lab_id and sequence lengths to train_values

In [11]:
def add_columns(df_values, df_labels=None):
    lab_ids = []
    seq_lengths = []
    pbar = tqdm(df_values.index)
    df_values = df_values.copy()
    if df_labels is None:
        lab_id = None
    for ix in pbar:
        if df_labels is not None:
            lab_id = df_labels.columns[df_labels.loc[ix,:]==1][0]
        seq_id = df_values.loc[ix,"sequence_id"]
        seq_len = len(df_values.loc[ix,"sequence"])
        pbar.set_description(f"Sequence: {seq_id} Lab: {lab_id}")
        lab_ids.append(lab_id)
        seq_lengths.append(seq_len)
    cols = list(df_values.columns)[df_values.columns.get_loc("bacterial_resistance_ampicillin"):]
    df_values["lab_id"] = lab_ids
    df_values["seq_length"] = seq_lengths
    cols = ["sequence_id", "lab_id", "sequence", "seq_length"] + cols
    df_values = df_values.loc[:,cols]
    return df_values
        

#### Generating training and test data sets

In [12]:
%%time
df_train_val = add_columns(train_values,train_labels)
df_test = add_columns(test_values,None)


Sequence: U5MR3 Lab: 3EZXYI3U: 100%|██████████| 63017/63017 [01:24<00:00, 749.91it/s]
Sequence: IOOW4 Lab: None:   6%|▋         | 1198/18816 [00:00<00:07, 2373.18it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Sequence: P6CLX Lab: None:  21%|██        | 3904/18816 [00:01<00:04, 3031.18it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

Sequence: K5AZR Lab: None:  40%|███▉      | 7450/18816 [00:02<00:03, 2905.39it/s]IOPub message rate ex

CPU times: user 1min 35s, sys: 5.89 s, total: 1min 40s
Wall time: 1min 30s


In [13]:
print("Shape of df_train_val: ", df_train_val.shape)
df_train_val.head()

Shape of df_train_val:  (63017, 43)


Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,RYUA3GVO,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,7151,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,RYUA3GVO,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,456,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,RYUA3GVO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1450,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,RYUA3GVO,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,914,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,RYUA3GVO,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1350,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
print("Shape of df_test: ", df_test.shape)
df_test.head()

Shape of df_test:  (18816, 43)


Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,E0VFT,,AGATCTATACATTGAATCAATATTGGCAATTAGCCATATTAGTCAT...,9379,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,TTRK5,,GCGCGCGTTGACATTGATTATTGACTAGTTATTAATAGTAATCAAT...,6673,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2Z7FZ,,GCTTAAGCGGTCGACGGATCGGGAGATCTCCCGATCCCCTATGGTG...,9044,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,VJI6E,,ATGATGATGATGTCCCTGAACAGCAAGCAGGCGTTTAGCATGCCGC...,1230,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,721FI,,GGTACCGAGCTCTTACGCGTGCTAGCCATACTATCAGCCACTTGTG...,6378,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Generating validation set

Note that the number of samples per lab_id is highly unbalanced. 

In [15]:
df_train_val['lab_id'].value_counts()

I7FXTVDP    8286
RKJHZGDQ    2732
GTVTUGVY    2672
A18S09P2    1064
Q2K8NHZY     973
            ... 
G2P73NZ0       3
58BSUZQB       3
WB78G3XF       2
ON9AXMKF       1
0L3Y6ZB2       1
Name: lab_id, Length: 1314, dtype: int64

#### Making val data

In [16]:
def make_val(df_train_val,train_size=0.8):
    shuffle_ix = np.random.permutation(df_train_val.index)
    split_point = int(len(shuffle_ix)*train_size)
    train_ix = shuffle_ix[:split_point]
    val_ix = shuffle_ix[split_point:]
    df_train = df_train_val.loc[train_ix,:]
    df_val = df_train_val.loc[val_ix,:]
    return df_train, df_val
    

In [17]:
%%time
np.random.seed(38748)
train_size = 0.8
df_train, df_val = make_val(df_train_val,train_size)
print(f"Shape of training set: {df_train.shape} % training set: {len(df_train)/len(df_train_val)} ")
print(f"Shape of validation set: {df_val.shape} % training set: {len(df_val)/len(df_train_val)} ")

Shape of training set: (50413, 43) % training set: 0.7999904787596998 
Shape of validation set: (12604, 43) % training set: 0.20000952124030025 
CPU times: user 17.2 ms, sys: 22 µs, total: 17.2 ms
Wall time: 16.6 ms


In [18]:
df_train.head()

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
4350,SMVSS,I7FXTVDP,GACGGATCGGGAGATCTCCCGATCCCCTATGGTGCACTCTCAGTAC...,7723,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10470,6MC5M,I7FXTVDP,GCTAGAGCCGTGAACGACAGGGCGAACGCCAGCCCGCCGACGGCGA...,10550,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
30997,BHOUO,P3Q11IAK,GGCTTTGTTAGCAGCCGGATCCTTATCAGTCTGCGGCAGGATTGGC...,1085,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29915,EI1J7,I7FXTVDP,GACGGATCGGGAGATCTCCCGATCCCCTATGGTGCACTCTCAGTAC...,9216,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19843,NVS4Z,AMV4U0A0,TATACGACTCACTATAGGGCGAATTGGGCCCTCTAGATGCATGCTC...,975,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


#### Saving data sets

In [22]:
%%time
full_train_path = os.path.join(full_train_dir,"full_train.csv")
train_path = os.path.join(train_dir,"train.csv")
val_path = os.path.join(val_dir,"val.csv")
test_path = os.path.join(test_dir,"test.csv")
print(f"Saving train_val_df in {full_train_path}")
df_train_val.to_csv(full_train_path)
print(f"Saving train_df in {train_path}")
df_train.to_csv(train_path)
print(f"Saving train_df in {val_path}")
df_val.to_csv(val_path)
print(f"Saving train_df in {test_path}")
df_test.to_csv(test_path)

Saving train_val_df in /home/rio/data_sets/genetic_engineering_attribution/full_train/full_train.csv
Saving train_df in /home/rio/data_sets/genetic_engineering_attribution/train/train.csv
Saving train_df in /home/rio/data_sets/genetic_engineering_attribution/val/val.csv
Saving train_df in /home/rio/data_sets/genetic_engineering_attribution/test/test.csv
CPU times: user 10.4 s, sys: 469 ms, total: 10.9 s
Wall time: 13.3 s
