# Genetic attribution preprocessing

In this notebook we overview the [Genetic Engineering Attrinution competition in Driven Data](https://www.drivendata.org/competitions/63/genetic-engineering-attribution/).   

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Generating tree structure and fetching data

In [2]:
!rm -r /content/genetic_engineering_attribution

#### Generating tree structure

In [3]:
parent_dir = "/content/genetic_engineering_attribution"
### train/test directories
train_dir = os.path.join(parent_dir,"train") 
test_dir = os.path.join(parent_dir,"test")
### train/test directories with processed data
#processed_dir = os.path.join(parent_dir,"processed")
processed_train_dir = os.path.join(train_dir,"processed")
labs_train_dir = os.path.join(processed_train_dir,"labs")
processed_test_dir = os.path.join(test_dir,"processed")
labs_test_dir = os.path.join(processed_test_dir,"labs")
#split_labs_dir = os.path.join(parent_dir,"split_labs")
#if not os.path.isdir(split_labs_dir):
#    os.mkdir(split_labs_dir)

generate_dirs = [parent_dir, train_dir, test_dir, processed_train_dir, labs_train_dir, processed_test_dir, labs_test_dir]
pbar = tqdm(generate_dirs)
for d in pbar:
    if not os.path.isdir(d):
        pbar.set_description(f"Generating directory {d}")
        os.mkdir(d)
    else:
        pbar.set_description(f"Directory {d} already exists. Skipping.")
os.chdir(parent_dir)
current_dir = os.getcwd()
print("\n")
print(f"Current directory {current_dir}")
print(os.listdir(current_dir))

Generating directory /content/genetic_engineering_attribution/test/processed/labs: 100%|██████████| 7/7 [00:00<00:00, 446.19it/s]



Current directory /content/genetic_engineering_attribution
['train', 'test']





#### Fetching data

In [4]:
%%time
####### training data
# training features
train_values_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200902%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200902T200909Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=6d1100ca13af398723e10b0171fc20c404beebb62cb46c5db44d05ddbea4bb77"
train_values_path = os.path.join(train_dir,"train_values.csv")
# training targets
train_labels_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/train_labels.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200902%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200902T200909Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=654b65ffb7caa66786abf6480edd34bc29705ac3691018a64d49d484c25ad776" 
train_labels_path = os.path.join(train_dir,"train_labels.csv")
####### test data
# test features
test_values_url = "https://drivendata-prod.s3.amazonaws.com/data/63/public/test_values.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARVBOBDCY3EFSLNZR%2F20200902%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20200902T200909Z&X-Amz-Expires=86400&X-Amz-SignedHeaders=host&X-Amz-Signature=bf8a45c19cc66c566bdc85d462559e564c40673de713f4c178a9786871b930c8"
test_values_path = os.path.join(test_dir,"test_values.csv")
url_dict = {"train_values": (train_values_url, train_values_path), "train_labels": (train_labels_url, train_labels_path), "test_values": (test_values_url, test_values_path)} 
pbar = tqdm(url_dict.items(),total=len(url_dict))
#print("url-dict; ", url_dict)
for key,value in pbar:
    pbar.set_description(f"Processing {key}")
    #print("value: ", value)
    url, path = value
    response = requests.get(url)   
    with open(path, 'w') as f:
        writer = csv.writer(f)
        for line in response.iter_lines():
            writer.writerow(line.decode('utf-8').split(','))

Processing test_values: 100%|██████████| 3/3 [00:49<00:00, 16.56s/it]

CPU times: user 24.1 s, sys: 4.49 s, total: 28.6 s
Wall time: 49.7 s





In [5]:
print("Contents of directories: \n")
print("train_dir: ", os.listdir(train_dir))
print("\n")
print("processed_train_dir: ", os.listdir(processed_train_dir))
print("\n")
print("test_dir: ", os.listdir(test_dir))
print("\n")
print("processed_test_dir: ", os.listdir(processed_test_dir))

Contents of directories: 

train_dir:  ['train_labels.csv', 'train_values.csv', 'processed']


processed_train_dir:  ['labs']


test_dir:  ['test_values.csv', 'processed']


processed_test_dir:  ['labs']


#### Loading data

In [6]:
%%time
train_values = pd.read_csv(train_values_path)
print("shape of train_values: ", train_values.shape)
#print(train_values.head(10))
display(train_values.head(5))

shape of train_values:  (63017, 41)


Unnamed: 0,sequence_id,sequence,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,copy_number_low_copy,copy_number_unknown,growth_strain_ccdb_survival,growth_strain_dh10b,growth_strain_dh5alpha,growth_strain_neb_stable,growth_strain_other,growth_strain_stbl3,growth_strain_top10,growth_strain_xl1_blue,growth_temp_30,growth_temp_37,growth_temp_other,selectable_markers_blasticidin,selectable_markers_his3,selectable_markers_hygromycin,selectable_markers_leu2,selectable_markers_neomycin,selectable_markers_other,selectable_markers_puromycin,selectable_markers_trp1,selectable_markers_ura3,selectable_markers_zeocin,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


CPU times: user 2.96 s, sys: 60.4 ms, total: 3.02 s
Wall time: 3.04 s


#### Training targets

In [7]:
%%time
train_labels = pd.read_csv(train_labels_path)
print("shape of train_labels: ", train_labels.shape)
#print(train_values.head(10))
display(train_labels.head(5))

shape of train_labels:  (63017, 1315)


Unnamed: 0,sequence_id,00Q4V31T,012VT4JK,028IO5W2,03GRNN7N,03Y3W51H,09MQV1TY,0A4AHRCT,0A9M05NC,0B9GCUVV,0CL7QVG8,0CML4B5I,0DTHTJLJ,0FFBBVE1,0HWCWFNU,0L3Y6ZB2,0M44GDO8,0MDYJM3H,0N3V9P9M,0NP55E93,0PJ91ZT6,0R296F9R,0T2AZBD6,0URA80CN,0VRP2DI6,0W6O08VX,0WHP4PPK,0XPTGGLP,0XS4FHP3,0Y24J5G2,10TEBWK2,11TTDKTM,131RRHBV,13LZE1F7,14PBN8C2,15D0Z97U,15S88O4Q,18C9J8EH,19CAUKJB,1AP294AT,...,Z1C99MVU,Z1Y066QU,Z6LWLWFZ,Z7YFK3I0,Z7ZKDLZG,Z80NVAXF,Z8BWVZZX,ZAYLY2YU,ZB6DPIG5,ZB862XHR,ZBQD50GN,ZC07UYVV,ZCU48L3S,ZEAZQ1QQ,ZEB7PDQK,ZEBTRK7D,ZEJOQQJF,ZELU1VMX,ZFBSIW7Q,ZGY1YZ7P,ZH6LR5MO,ZIGUIE0J,ZIJRW95G,ZK6YBV02,ZLSXM0KN,ZMCRIYYJ,ZMEZU4BS,ZMUIMBDX,ZOI7FJEN,ZQ5A6IY9,ZQNGGY33,ZSHS4VJZ,ZT1IP3T6,ZU6860XU,ZU6TVFFU,ZU75P59K,ZUI6TDWV,ZWFD8OHC,ZX06ZDZN,ZZJVE4HO
0,9ZIMC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


CPU times: user 9.04 s, sys: 1.57 s, total: 10.6 s
Wall time: 10.6 s


## Preprocessing

#### One-hot encoding DNA sequences

In [8]:
def dna2dict(dna_seq):
    """
    DNA bases are ordered in tuples alphabetically as (A,C,G,N,T)
    """
    dna_dict = OrderedDict()
    dna_seq = dna_seq.upper()
    for ix,b in enumerate(dna_seq):
        dna_dict[ix] = (int(b=="A"),int(b=="C"),int(b=="G"),int(b=="N"),int(b=="T"))
    return dna_dict
        

In [9]:
dna_dict = dna2dict(train_values.sequence.values[1])
dna_dict

OrderedDict([(0, (0, 0, 1, 0, 0)),
             (1, (0, 1, 0, 0, 0)),
             (2, (0, 0, 0, 0, 1)),
             (3, (0, 0, 1, 0, 0)),
             (4, (0, 0, 1, 0, 0)),
             (5, (1, 0, 0, 0, 0)),
             (6, (0, 0, 0, 0, 1)),
             (7, (0, 0, 1, 0, 0)),
             (8, (0, 0, 1, 0, 0)),
             (9, (0, 0, 0, 0, 1)),
             (10, (0, 0, 0, 0, 1)),
             (11, (0, 0, 0, 0, 1)),
             (12, (0, 0, 1, 0, 0)),
             (13, (0, 0, 1, 0, 0)),
             (14, (0, 0, 1, 0, 0)),
             (15, (1, 0, 0, 0, 0)),
             (16, (0, 1, 0, 0, 0)),
             (17, (1, 0, 0, 0, 0)),
             (18, (0, 0, 0, 0, 1)),
             (19, (0, 0, 1, 0, 0)),
             (20, (0, 0, 0, 0, 1)),
             (21, (0, 0, 1, 0, 0)),
             (22, (0, 1, 0, 0, 0)),
             (23, (1, 0, 0, 0, 0)),
             (24, (0, 0, 1, 0, 0)),
             (25, (0, 1, 0, 0, 0)),
             (26, (0, 1, 0, 0, 0)),
             (27, (0, 1, 0, 0, 0)),
  

In [10]:
len(dna_dict)

456

In [11]:
list(dna_dict.values())

[(0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 0, 0, 1),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 1, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 1, 0, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 0, 0, 1),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (1, 0, 0, 0, 0),
 (0, 0, 1, 0, 0),
 (0, 0, 0, 0, 1),
 (0, 0, 1, 0, 0),
 (0, 0, 0,

#### One-hot encoded DNA sequences as dataframes

In [12]:
def dna2df(dna_seq,seq_id=None, lab_id = None):
    dna_dict = dna2dict(dna_seq)
    dna_df = pd.DataFrame(dna_dict.values(),columns=("A","C","G","N","T"))
    if seq_id:
        columns = ["sequence_id"] + list(dna_df.columns)
        dna_df["sequence_id"] = [seq_id]*len(dna_df)
        dna_df = dna_df.loc[:,columns]
    if lab_id:
        columns = ["lab_id"] + list(dna_df.columns)
        dna_df["lab_id"] = [lab_id]*len(dna_df)
        dna_df = dna_df.loc[:,columns]
    return dna_df


In [13]:
dna_seq = train_values.sequence.values[0]
seq_id = train_values.sequence_id.values[0]
lab_id = train_labels.columns[train_labels.iloc[0,:]==1][0]
dna_df = dna2df(dna_seq=dna_seq,seq_id=seq_id,lab_id=lab_id)
dna_df

Unnamed: 0,lab_id,sequence_id,A,C,G,N,T
0,RYUA3GVO,9ZIMC,0,1,0,0,0
1,RYUA3GVO,9ZIMC,1,0,0,0,0
2,RYUA3GVO,9ZIMC,0,0,0,0,1
3,RYUA3GVO,9ZIMC,0,0,1,0,0
4,RYUA3GVO,9ZIMC,0,1,0,0,0
...,...,...,...,...,...,...,...
7146,RYUA3GVO,9ZIMC,1,0,0,0,0
7147,RYUA3GVO,9ZIMC,0,1,0,0,0
7148,RYUA3GVO,9ZIMC,0,1,0,0,0
7149,RYUA3GVO,9ZIMC,0,0,1,0,0


In [14]:
lab_id

'RYUA3GVO'

#### Generating separate csvs for each lab

In [15]:
%%time
#labs_train_dir = os.path.join(processed_train_dir,"labs_train_dir")
def split_labs(train_values,train_labels,savedir=labs_train_dir,usecols=None):
    #pbar = tqdm(train_values.iterrows(),total=len(train_values))
    if usecols is None:
        usecols = list(train_values.columns[train_values.columns.get_loc("sequence")+1:])
    unique_labs = list(train_labels.columns[1:])
    pbar = tqdm(unique_labs)
    #print("unique_labs; ", unique_labs)
    for lab_id in pbar:
        #print("lab_id: ", lab_id)
        lab_dir = os.path.join(savedir,lab_id)
        pbar.set_description(f"Processing lab {lab_id}")
        ixs = train_labels.index[train_labels.loc[:,lab_id]==1]
        if not os.path.isdir(lab_dir):
          os.mkdir(lab_dir)
        #df_lab_concat = []
        for ix in ixs:
            row = train_values.loc[ix,:]
            dna_seq = row.sequence
            seq_id = row.sequence_id
            #lab_id = train_labels.columns[train_labels.iloc[ix,:]==1][0]
            dna_df = dna2df(dna_seq,seq_id,lab_id)
        #pbar.set_description(f"Processing sample {seq_id}")
        #print("dna_df: ", dna_df)
            if usecols:
                repeat = row.loc[usecols].values.reshape(1,-1)
                repeat = np.repeat( repeat,len(dna_df), axis=0)
                #print("shape: ", np.repeat(sample.loc[usecols].values,len(dna_df),axis=1).shape)
                #print(np.repeat( sample.loc[usecols].values,len(dna_df)  ))
                extra_cols_df = pd.DataFrame( repeat ,columns=usecols)   
                dna_df = pd.concat([dna_df,extra_cols_df],axis=1)
            #df_lab_concat.append(dna_df)
        #df_lab = pd.concat(df_lab_concat,axis=0,ignore_index=True)
            if savedir is not None:
                savepath = os.path.join(lab_dir,seq_id+".csv")
            #print("savepath: ", savepath)
                dna_df.to_csv(savepath,index=True)
    
    
    

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


#### Saving a .csv for each lab

In [16]:
%%time
split_labs(train_values,train_labels,labs_train_dir,usecols=None)

Processing lab ZZJVE4HO: 100%|██████████| 1314/1314 [1:39:41<00:00,  4.55s/it]

CPU times: user 1h 37min 35s, sys: 1min 50s, total: 1h 39min 25s
Wall time: 1h 39min 41s





In [17]:
!git checkout -b 2020-09-03-00


fatal: not a git repository (or any of the parent directories): .git


In [None]:
%%time
for i in train_features.iterrows():
    print(i)

In [None]:
train_features

In [None]:
train_features.loc[:,"sequence"]