# Genetic attribution overview

In this notebook we overview the [Genetic Engineering Attribution competition in Driven Data](https://www.drivendata.org/competitions/63/genetic-engineering-attribution/).   

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict, defaultdict, Counter
import seaborn as sns
import json

CPU times: user 424 ms, sys: 43.6 ms, total: 467 ms
Wall time: 466 ms


## Loading data

In [2]:
%%time
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

### train/val/test directories
train_dir = os.path.join(parent_dir,"train") 
val_dir = os.path.join(parent_dir,"val")
test_dir = os.path.join(parent_dir,"test")

### Paths to csvs
train_path = os.path.join(train_dir,"train.csv")
val_path = os.path.join(val_dir,"val.csv")
test_path = os.path.join(test_dir,"test.csv")

### Dataframes
df_train = pd.read_csv(train_path,index_col=0)
df_val = pd.read_csv(val_path,index_col=0)
df_test = pd.read_csv(test_path,index_col=0)

### Printing shapes:
print(f"Shape of df_train: {df_train.shape}")
print(f"Shape of df_val: {df_val.shape}")
print(f"Shape of df_test: {df_test.shape}")

Shape of df_train: (50413, 43)
Shape of df_val: (12604, 43)
Shape of df_test: (18816, 43)
CPU times: user 2.09 s, sys: 140 ms, total: 2.23 s
Wall time: 2.23 s


In [3]:
df_train.head()

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
4350,SMVSS,I7FXTVDP,GACGGATCGGGAGATCTCCCGATCCCCTATGGTGCACTCTCAGTAC...,7723,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10470,6MC5M,I7FXTVDP,GCTAGAGCCGTGAACGACAGGGCGAACGCCAGCCCGCCGACGGCGA...,10550,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
30997,BHOUO,P3Q11IAK,GGCTTTGTTAGCAGCCGGATCCTTATCAGTCTGCGGCAGGATTGGC...,1085,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29915,EI1J7,I7FXTVDP,GACGGATCGGGAGATCTCCCGATCCCCTATGGTGCACTCTCAGTAC...,9216,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19843,NVS4Z,AMV4U0A0,TATACGACTCACTATAGGGCGAATTGGGCCCTCTAGATGCATGCTC...,975,1.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


## Extracting features from training data

We now write a function that counts the occurences of a string pattern through the whole data set

In [4]:
def count_patterns(sequence, pattern_len=8,stride=1):
    pattern_dict = defaultdict(lambda: 0)
    start_ixs = range(0,len(sequence)-pattern_len,stride)
    for ix in start_ixs:
        pattern_dict[sequence[ix:ix+pattern_len]] += 1 
    return pattern_dict

def count_df_patterns(df, pattern_len, stride=1):
    pattern_dict = Counter(defaultdict(lambda: 0))
    sequences = df.sequence.values
    pbar = tqdm(sequences)
    for seq in pbar:
        seq_count = Counter(count_patterns(seq, pattern_len,stride))
        pattern_dict += seq_count
    return dict(pattern_dict)

## pattern_len = 4

In [5]:
%%time
pattern_len = 4
stride = 4
pattern_counts_4 = count_df_patterns(df_train, pattern_len, stride)

100%|██████████| 50413/50413 [00:13<00:00, 3825.65it/s]

CPU times: user 13.2 s, sys: 23.8 ms, total: 13.2 s
Wall time: 13.2 s





In [6]:
len(pattern_counts_4)

625

#### Saving to disk

In [7]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_04.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_4, fp)

CPU times: user 9.55 ms, sys: 23 µs, total: 9.57 ms
Wall time: 8 ms


## pattern_len = 8

In [8]:
%%time
pattern_len = 8
stride = 8
pattern_counts_8 = count_df_patterns(df_train, pattern_len, stride)

100%|██████████| 50413/50413 [02:06<00:00, 399.81it/s]

CPU times: user 2min 6s, sys: 452 ms, total: 2min 6s
Wall time: 2min 6s





#### Saving to disk

In [9]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_08.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_8, fp)

CPU times: user 54 ms, sys: 3.99 ms, total: 58 ms
Wall time: 57.3 ms


In [10]:
len(pattern_counts_8)

81096

In [24]:
5**8

390625

## pattern_len = 16

In [11]:
%%time
pattern_len = 16
stride = 16
pattern_counts_16 = count_df_patterns(df_train, pattern_len, stride)

100%|██████████| 50413/50413 [46:48<00:00, 17.95it/s]


CPU times: user 46min 49s, sys: 4.45 s, total: 46min 54s
Wall time: 46min 49s


In [12]:
len(pattern_counts_16)

2976658

In [14]:
pattern_counts_16

{'GACGGATCGGGAGATC': 1528,
 'TCCCGATCCCCTATGG': 1477,
 'TGCACTCTCAGTACAA': 1183,
 'TCTGCTCTGATGCCGC': 1708,
 'ATAGTTAAGCCAGTAT': 1554,
 'CTGCTCCCTGCTTGTG': 1458,
 'TGTTGGAGGTCGCTGA': 1457,
 'GTAGTGCGCGAGCAAA': 1455,
 'ATTTAAGCTACAACAA': 1457,
 'GGCAAGGCTTGACCGA': 1457,
 'CAATTGCATGAAGAAT': 1439,
 'CTGCTTAGGGTTAGGC': 1439,
 'GTTTTGCGCTGCTTCG': 1432,
 'CGATGTACGGGCCAGA': 1440,
 'TATACGCGTTGACATT': 1409,
 'GATTATTGACTAGTTA': 1449,
 'TTAATAGTAATCAATT': 1578,
 'ACGGGGTCATTAGTTC': 1579,
 'ATAGCCCATATATGGA': 1577,
 'GTTCCGCGTTACATAA': 1581,
 'CTTACGGTAAATGGCC': 1601,
 'CGCCTGGCTGACCGCC': 1601,
 'CAACGACCCCCGCCCA': 1598,
 'TTGACGTCAATAATGA': 1588,
 'CGTATGTTCCCATAGT': 1587,
 'AACGCCAATAGGGACT': 1678,
 'TTCCATTGACGTCAAT': 1678,
 'GGGTGGAGTATTTACG': 1169,
 'GTAAACTGCCCACTTG': 1677,
 'GCAGTACATCAAGTGT': 1678,
 'ATCATATGCCAAGTAC': 1651,
 'GCCCCCTATTGACGTC': 1686,
 'AATGACGGTAAATGGC': 1685,
 'CCGCCTGGCATTATGC': 1578,
 'CCAGTACATGACCTTA': 1675,
 'TGGGACTTTCCTACTT': 1657,
 'GGCAGTACATCTACGT': 1671,
 

In [13]:
5**16

152587890625

#### Saving to disk

In [15]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_16.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_16, fp)

CPU times: user 1.93 s, sys: 36 ms, total: 1.96 s
Wall time: 1.96 s


## pattern_len = 32

In [16]:
%%time
pattern_len = 32
stride = 32
pattern_counts_32 = count_df_patterns(df_train, pattern_len, stride)

100%|██████████| 50413/50413 [29:58<00:00, 28.04it/s]

CPU times: user 29min 59s, sys: 3.26 s, total: 30min 2s
Wall time: 29min 58s





In [17]:
len(pattern_counts_32)

1801833

In [20]:
pattern_counts_32

{'GACGGATCGGGAGATCTCCCGATCCCCTATGG': 1442,
 'TGCACTCTCAGTACAATCTGCTCTGATGCCGC': 1010,
 'ATAGTTAAGCCAGTATCTGCTCCCTGCTTGTG': 1423,
 'TGTTGGAGGTCGCTGAGTAGTGCGCGAGCAAA': 1424,
 'ATTTAAGCTACAACAAGGCAAGGCTTGACCGA': 1426,
 'CAATTGCATGAAGAATCTGCTTAGGGTTAGGC': 1410,
 'GTTTTGCGCTGCTTCGCGATGTACGGGCCAGA': 1399,
 'TATACGCGTTGACATTGATTATTGACTAGTTA': 1369,
 'TTAATAGTAATCAATTACGGGGTCATTAGTTC': 1489,
 'ATAGCCCATATATGGAGTTCCGCGTTACATAA': 1480,
 'CTTACGGTAAATGGCCCGCCTGGCTGACCGCC': 1505,
 'CAACGACCCCCGCCCATTGACGTCAATAATGA': 1496,
 'CGTATGTTCCCATAGTAACGCCAATAGGGACT': 1496,
 'TTCCATTGACGTCAATGGGTGGAGTATTTACG': 1080,
 'GTAAACTGCCCACTTGGCAGTACATCAAGTGT': 1583,
 'ATCATATGCCAAGTACGCCCCCTATTGACGTC': 1567,
 'AATGACGGTAAATGGCCCGCCTGGCATTATGC': 1486,
 'CCAGTACATGACCTTATGGGACTTTCCTACTT': 1572,
 'GGCAGTACATCTACGTATTAGTCATCGCTATT': 1576,
 'ACCATGGTGATGCGGTTTTGGCAGTACATCAA': 1456,
 'TGGGCGTGGATAGCGGTTTGACTCACGGGGAT': 1465,
 'TTCCAAGTCTCCACCCCATTGACGTCAATGGG': 1490,
 'AGTTTGTTTTGGCACCAAAATCAACGGGACTT': 1444,
 'TCCAAAATG

#### Saving to disk

In [23]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_32.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_32, fp)

CPU times: user 1.23 s, sys: 56.1 ms, total: 1.29 s
Wall time: 1.29 s


## pattern_len = 64

In [25]:
%%time
pattern_len = 64
stride = 64
pattern_counts_64 = count_df_patterns(df_train, pattern_len, stride)

100%|██████████| 50413/50413 [21:47<00:00, 38.56it/s]


CPU times: user 21min 47s, sys: 2.9 s, total: 21min 50s
Wall time: 21min 47s


In [26]:
len(pattern_counts_64)

1091497

#### Saving to disk

In [27]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_64.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_64, fp)

CPU times: user 870 ms, sys: 52.1 ms, total: 922 ms
Wall time: 921 ms


In [14]:
5**8

390625

In [18]:
len(set(pattern_counts_8.keys()))

625

In [60]:
Counter(defaultdict(lambda: 0))

Counter()

In [64]:
a = defaultdict(lambda:0)
b = defaultdict(lambda:0)
c = defaultdict(lambda:0)

In [65]:
a["a"] = 2
b["a"] = 1
b["b"] = 3
c["c"] = 4
c["b"] = 10
c["d"] += 1

In [66]:
Counter(a) + Counter(b) + Counter(c)

Counter({'a': 3, 'b': 13, 'c': 4, 'd': 1})