# Genetic attribution overview

In this notebook we overview the [Genetic Engineering Attribution competition in Driven Data](https://www.drivendata.org/competitions/63/genetic-engineering-attribution/).   

In [3]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict, defaultdict, Counter
import seaborn as sns
import json

CPU times: user 14 µs, sys: 2 µs, total: 16 µs
Wall time: 18.6 µs


## Loading data

In [4]:
%%time
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

### train/val/test directories
full_train_dir = os.path.join(parent_dir,"full_train") 
train_dir = os.path.join(parent_dir,"train") 
val_dir = os.path.join(parent_dir,"val")
test_dir = os.path.join(parent_dir,"test")

### Paths to csvs
full_train_path = os.path.join(full_train_dir,"full_train.csv")
train_path = os.path.join(train_dir,"train.csv")
val_path = os.path.join(val_dir,"val.csv")
test_path = os.path.join(test_dir,"test.csv")

### Dataframes
df_full_train = pd.read_csv(full_train_path,index_col=0)
df_train = pd.read_csv(train_path,index_col=0)
df_val = pd.read_csv(val_path,index_col=0)
df_test = pd.read_csv(test_path,index_col=0)

### Printing shapes:
print(f"Shape of df_full_train: {df_full_train.shape}")
print(f"Shape of df_train: {df_train.shape}")
print(f"Shape of df_val: {df_val.shape}")
print(f"Shape of df_test: {df_test.shape}")

Shape of df_full_train: (63017, 43)
Shape of df_train: (50413, 43)
Shape of df_val: (12604, 43)
Shape of df_test: (18816, 43)
CPU times: user 3.61 s, sys: 304 ms, total: 3.92 s
Wall time: 3.91 s


In [5]:
df_full_train.head()

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
0,9ZIMC,RYUA3GVO,CATGCATTAGTTATTAATAGTAATCAATTACGGGGTCATTAGTTCA...,7151,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5SAQC,RYUA3GVO,GCTGGATGGTTTGGGACATGTGCAGCCCCGTCTCTGTATGGAGTGA...,456,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,E7QRO,RYUA3GVO,NNCCGGGCTGTAGCTACACAGGGCGGAGATGAGAGCCCTACGAAAG...,1450,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,CT5FP,RYUA3GVO,GCGGAGATGAAGAGCCCTACGAAAGCTGAGCCTGCGACTCCCGCAG...,914,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7PTD8,RYUA3GVO,CGCGCATTACTTCACATGGTCCTCAAGGGTAACATGAAAGTGATCC...,1350,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Extracting features from full training data

We now write a function that counts the occurences of a string pattern through the whole data set

In [6]:
def count_patterns(sequence, pattern_len=8,stride=1):
    pattern_dict = defaultdict(lambda: 0)
    start_ixs = range(0,len(sequence)-pattern_len,stride)
    for ix in start_ixs:
        pattern_dict[sequence[ix:ix+pattern_len]] += 1 
    return pattern_dict

def count_df_patterns(df, pattern_len, stride=1):
    pattern_dict = Counter(defaultdict(lambda: 0))
    sequences = df.sequence.values
    pbar = tqdm(sequences)
    for seq in pbar:
        seq_count = Counter(count_patterns(seq, pattern_len,stride))
        pattern_dict += seq_count
    return dict(pattern_dict)

## pattern_len = 4

In [7]:
%%time
pattern_len = 4
stride = 4
pattern_counts_4 = count_df_patterns(df_full_train, pattern_len, stride)

100%|██████████| 63017/63017 [00:16<00:00, 3812.90it/s]

CPU times: user 16.5 s, sys: 28.9 ms, total: 16.6 s
Wall time: 16.5 s





In [6]:
len(pattern_counts_4)

625

#### Saving to disk

In [8]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_04.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_4, fp)

CPU times: user 0 ns, sys: 1.33 ms, total: 1.33 ms
Wall time: 11.2 ms


## pattern_len = 8

In [9]:
%%time
pattern_len = 8
stride = 8
pattern_counts_8 = count_df_patterns(df_full_train, pattern_len, stride)

100%|██████████| 63017/63017 [02:46<00:00, 377.73it/s]

CPU times: user 2min 46s, sys: 372 ms, total: 2min 47s
Wall time: 2min 46s





#### Saving to disk

In [10]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_08.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_8, fp)

CPU times: user 60.5 ms, sys: 2 µs, total: 60.5 ms
Wall time: 75.9 ms


In [11]:
len(pattern_counts_8)

84497

In [12]:
5**8

390625

## pattern_len = 16

In [14]:
%%time
pattern_len = 16
stride = 16
pattern_counts_16 = count_df_patterns(df_full_train, pattern_len, stride)

100%|██████████| 63017/63017 [56:20<00:00, 18.64it/s]


CPU times: user 56min 21s, sys: 5.27 s, total: 56min 27s
Wall time: 56min 20s


In [15]:
len(pattern_counts_16)

3486651

In [16]:
pattern_counts_16

{'CATGCATTAGTTATTA': 180,
 'ATAGTAATCAATTACG': 448,
 'GGGTCATTAGTTCATA': 446,
 'GCCCATATATGGAGTT': 436,
 'CCGCGTTACATAACTT': 437,
 'ACGGTAAATGGCCCGC': 1134,
 'CTGGCTGACCGCCCAA': 470,
 'CGACCCCCGCCCATTG': 464,
 'ACGTCAATAATGACGT': 479,
 'ATGTTCCCATAGTAAC': 480,
 'GCCAATAGGGACTTTC': 487,
 'CATTGACGTCAATGGG': 2492,
 'TGGAGTATTTACGGTA': 469,
 'AACTGCCCACTTGGCA': 487,
 'GTACATCAAGTGTATC': 486,
 'ATATGCCAAGTACGCC': 527,
 'CCCTATTGACGTCAAT': 589,
 'GACGGTAAATGGCCCG': 575,
 'CCTGGCATTATGCCCA': 573,
 'GTACATGACCTTATGG': 533,
 'GACTTTCCTACTTGGC': 582,
 'AGTACATCTACGTATT': 576,
 'AGTCATCGCTATTACC': 576,
 'ATGGTGATGCGGTTTT': 463,
 'GGCAGTACATCAATGG': 449,
 'GCGTGGATAGCGGTTT': 476,
 'GACTCACGGGGATTTC': 466,
 'CAAGTCTCCACCCCAT': 464,
 'TGACGTCAATGGGAGT': 466,
 'TTGTTTTGGCACCAAA': 459,
 'ATCAACGGGACTTTCC': 461,
 'AAAATGTCGTAACAAC': 414,
 'TCCGCCCCATTGACGC': 416,
 'AAATGGGCGGTAGGCG': 483,
 'TGTACGGTGGGAGGTC': 469,
 'TATATAAGCAGAGCTG': 266,
 'GTTTAGTGAACCGTCA': 416,
 'GATCCGCTAGCGCTAC': 167,
 'CGGTCGCC

In [18]:
3486651/5**16

2.28501159936e-05

#### Saving to disk

In [19]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_16.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_16, fp)

CPU times: user 2.28 s, sys: 48.1 ms, total: 2.33 s
Wall time: 2.78 s


## pattern_len = 32

In [16]:
%%time
pattern_len = 32
stride = 32
pattern_counts_32 = count_df_patterns(df_full_train, pattern_len, stride)

100%|██████████| 50413/50413 [29:58<00:00, 28.04it/s]

CPU times: user 29min 59s, sys: 3.26 s, total: 30min 2s
Wall time: 29min 58s





In [17]:
len(pattern_counts_32)

1801833

In [20]:
pattern_counts_32

{'GACGGATCGGGAGATCTCCCGATCCCCTATGG': 1442,
 'TGCACTCTCAGTACAATCTGCTCTGATGCCGC': 1010,
 'ATAGTTAAGCCAGTATCTGCTCCCTGCTTGTG': 1423,
 'TGTTGGAGGTCGCTGAGTAGTGCGCGAGCAAA': 1424,
 'ATTTAAGCTACAACAAGGCAAGGCTTGACCGA': 1426,
 'CAATTGCATGAAGAATCTGCTTAGGGTTAGGC': 1410,
 'GTTTTGCGCTGCTTCGCGATGTACGGGCCAGA': 1399,
 'TATACGCGTTGACATTGATTATTGACTAGTTA': 1369,
 'TTAATAGTAATCAATTACGGGGTCATTAGTTC': 1489,
 'ATAGCCCATATATGGAGTTCCGCGTTACATAA': 1480,
 'CTTACGGTAAATGGCCCGCCTGGCTGACCGCC': 1505,
 'CAACGACCCCCGCCCATTGACGTCAATAATGA': 1496,
 'CGTATGTTCCCATAGTAACGCCAATAGGGACT': 1496,
 'TTCCATTGACGTCAATGGGTGGAGTATTTACG': 1080,
 'GTAAACTGCCCACTTGGCAGTACATCAAGTGT': 1583,
 'ATCATATGCCAAGTACGCCCCCTATTGACGTC': 1567,
 'AATGACGGTAAATGGCCCGCCTGGCATTATGC': 1486,
 'CCAGTACATGACCTTATGGGACTTTCCTACTT': 1572,
 'GGCAGTACATCTACGTATTAGTCATCGCTATT': 1576,
 'ACCATGGTGATGCGGTTTTGGCAGTACATCAA': 1456,
 'TGGGCGTGGATAGCGGTTTGACTCACGGGGAT': 1465,
 'TTCCAAGTCTCCACCCCATTGACGTCAATGGG': 1490,
 'AGTTTGTTTTGGCACCAAAATCAACGGGACTT': 1444,
 'TCCAAAATG

#### Saving to disk

In [23]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_32.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_32, fp)

CPU times: user 1.23 s, sys: 56.1 ms, total: 1.29 s
Wall time: 1.29 s


## pattern_len = 64

In [25]:
%%time
pattern_len = 64
stride = 64
pattern_counts_64 = count_df_patterns(df_full_train, pattern_len, stride)

100%|██████████| 50413/50413 [21:47<00:00, 38.56it/s]


CPU times: user 21min 47s, sys: 2.9 s, total: 21min 50s
Wall time: 21min 47s


In [26]:
len(pattern_counts_64)

1091497

#### Saving to disk

In [27]:
%%time
string_patterns_dir = "/home/rio/data_sets/genetic_engineering_attribution/string_patterns"
filename = os.path.join(string_patterns_dir,"string_patterns_64.json")
with open(filename, 'w') as fp:
    json.dump(pattern_counts_64, fp)

CPU times: user 870 ms, sys: 52.1 ms, total: 922 ms
Wall time: 921 ms


In [14]:
5**8

390625

In [18]:
len(set(pattern_counts_8.keys()))

625

In [60]:
Counter(defaultdict(lambda: 0))

Counter()

In [64]:
a = defaultdict(lambda:0)
b = defaultdict(lambda:0)
c = defaultdict(lambda:0)

In [65]:
a["a"] = 2
b["a"] = 1
b["b"] = 3
c["c"] = 4
c["b"] = 10
c["d"] += 1

In [66]:
Counter(a) + Counter(b) + Counter(c)

Counter({'a': 3, 'b': 13, 'c': 4, 'd': 1})