# Generating samples (string patterns of length 16)

In this notebook we cut the sequences into fixed-length snippets, add PCA features to them, and save the resulting train, validation and test sets in the format used for training.

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict, defaultdict, Counter
import seaborn as sns
import json
from sklearn.decomposition import PCA
#from joblib import dump, load
import joblib

CPU times: user 551 ms, sys: 64.4 ms, total: 615 ms
Wall time: 614 ms


## Loading data

In [2]:
# Root directory of the data set; the other paths in this notebook are built from it.
# NOTE(review): this is an absolute, machine-specific path - adjust it before
# running elsewhere (the commented line looks like a Colab alternative - confirm).
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

#### Data

In [3]:
%%time
# Builds the paths of the saved train/val/test CSVs and loads them into
# df_train, df_val and df_test (first CSV column is used as the index).
### pca directory
pca_dir = os.path.join(parent_dir,"pca")

### pca engineered data sets
pca_engineered_datasets_dir = os.path.join(parent_dir,"pca_engineered_datasets")
# NOTE(review): the variable is named pca_32_95comp_dir but points at
# "pca_16_48comp", while the recorded outputs further down show
# ".../pca_32_95comp/..." paths - the path was presumably edited after the
# last run; confirm which data set is intended.
pca_32_95comp_dir = os.path.join(pca_engineered_datasets_dir,"pca_16_48comp")

### train/val/test dir
train_val_test_dir = os.path.join(pca_32_95comp_dir,"train_val_test")

### Paths to csvs
train_path = os.path.join(train_val_test_dir,"train.csv")
val_path = os.path.join(train_val_test_dir,"val.csv")
test_path = os.path.join(train_val_test_dir,"test.csv")

### Dataframes
df_train = pd.read_csv(train_path,index_col=0)
df_val = pd.read_csv(val_path,index_col=0)
df_test = pd.read_csv(test_path,index_col=0)

### Printing shapes:
print(f"Shape of df_train: {df_train.shape}")
print(f"Shape of df_val: {df_val.shape}")
print(f"Shape of df_test: {df_test.shape}")

  mask |= (ar1 == a)


Shape of df_train: (1314000, 138)
Shape of df_val: (1175000, 138)
Shape of df_test: (1881600, 138)
CPU times: user 59.7 s, sys: 2.26 s, total: 1min 1s
Wall time: 1min 1s


In [4]:
# Peek at the first rows of the loaded training frame.
df_train.head()

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
2003,Q5MU0,00Q4V31T,ACCGCCTTTGAGTGAGCTGATACCGCTCGCCG,32,-0.754907,0.013641,-0.449087,0.341165,0.200607,1.050232,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2700,HQPH7,00Q4V31T,TTCGTGTCGACACGGCAGACCACGCGTTTATC,32,-0.374325,-0.744543,0.548367,0.285129,0.575248,-0.5502,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2699,OB8FG,00Q4V31T,AGCTGCGGTAAAGCTCATCAGCGTGGTCGTGC,32,-0.576713,0.420651,-0.953987,0.02237,-0.895677,0.148758,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2700,HQPH7,00Q4V31T,TATAACGTTACTGGTTTCACATTCACCACCCT,32,0.454543,-0.389939,0.478635,0.764074,0.59399,0.791779,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2699,OB8FG,00Q4V31T,CGTAGTTATCTACACGACGGGGAGTCAGGCAA,32,-0.136239,0.615637,-0.252784,-0.41725,-0.672729,0.120384,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Generating train, val, test data split by sequence_id

#### Auxiliary function to generate directories

In [5]:
def generate_dir(directory,delete_dir=True):
    """Make sure `directory` exists, optionally wiping it first.

    Parameters
    ----------
    directory : str
        Path of the directory to prepare. Missing parent directories are
        created as well.
    delete_dir : bool, default True
        Only matters when the directory already exists: True removes it with
        all its contents and recreates it empty, False leaves it untouched.

    Returns
    -------
    None
    """
    already_exists = os.path.isdir(directory)
    if already_exists and not delete_dir:
        # Keep whatever is already stored there.
        print(f"Directory {directory} already exists. I will either overwrite or add files to it.")
        return
    if already_exists:
        print(f"Directory {directory} already exists. Deleting an recreating.")
        shutil.rmtree(directory)
    else:
        print(f"Creating directory {directory}")
    os.makedirs(directory, exist_ok=True)

#### Splitting data set by sequence id

In [6]:
def split_by_seq_id(df,seq_id):
    """Return the rows of `df` whose `sequence_id` column equals `seq_id`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `sequence_id` column.
    seq_id : object
        Value to look for in `sequence_id`.

    Returns
    -------
    pandas.DataFrame
        The matching rows with all columns, in their original order.
    """
    return df.loc[df.sequence_id == seq_id, :]

def generate_seq_id_data(df,savedir,delete_dir):
    """Write one CSV file per sequence id found in `df` into `savedir`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `sequence_id` column.
    savedir : str
        Output directory; prepared with `generate_dir(savedir, delete_dir)`.
    delete_dir : bool
        Forwarded to `generate_dir`: whether an existing `savedir` is wiped.

    Returns
    -------
    None
        Each group is written to `<savedir>/<sequence_id>.csv` together with
        its index.

    Notes
    -----
    Rows whose `sequence_id` is missing are not written (pandas `groupby`
    leaves out missing keys by default).
    """
    generate_dir(savedir,delete_dir)
    # One groupby pass instead of re-scanning the whole frame for every id.
    # sort=True visits the ids in ascending order and rows keep their original
    # relative order inside each group.
    grouped = df.groupby("sequence_id", sort=True)
    pbar = tqdm(grouped, total=grouped.ngroups)
    for seq_id, df_seq_id in pbar:
        pbar.set_description(f"Processing {seq_id}")
        # f-string so that non-string ids (e.g. integers) also give a file name.
        savepath = os.path.join(savedir,f"{seq_id}.csv")
        df_seq_id.to_csv(savepath,index=True)


#### Splitting training set by sequence_id

In [7]:
%%time
# Writes one CSV per sequence_id of df_train under
# <pca_32_95comp_dir>/train_val_test_sequence_id/train.
savedir = os.path.join(pca_32_95comp_dir,"train_val_test_sequence_id","train")
delete_dir = True  # an existing output directory is removed and recreated
generate_seq_id_data(df_train,savedir,delete_dir)

Processing 000TM:   0%|          | 0/39285 [00:00<?, ?it/s]

Directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_val_test_sequence_id/train already exists. Deleting an recreating.


Processing ZZX64: 100%|██████████| 39285/39285 [38:26<00:00, 17.03it/s]

CPU times: user 38min 29s, sys: 11.9 s, total: 38min 41s
Wall time: 38min 26s





#### Splitting val set by sequence_id

In [8]:
%%time
# Writes one CSV per sequence_id of df_val under
# <pca_32_95comp_dir>/train_val_test_sequence_id/val.
savedir = os.path.join(pca_32_95comp_dir,"train_val_test_sequence_id","val")
delete_dir = True  # an existing output directory is removed and recreated
generate_seq_id_data(df_val,savedir,delete_dir)

Processing 0092L:   0%|          | 2/11313 [00:00<10:29, 17.97it/s]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_val_test_sequence_id/val


Processing ZZURQ: 100%|██████████| 11313/11313 [11:47<00:00, 15.98it/s]

CPU times: user 11min 46s, sys: 5.19 s, total: 11min 52s
Wall time: 11min 48s





#### Splitting test set by sequence_id

In [9]:
%%time
# Writes one CSV per sequence_id of df_test under
# <pca_32_95comp_dir>/train_val_test_sequence_id/test.
savedir = os.path.join(pca_32_95comp_dir,"train_val_test_sequence_id","test")
delete_dir = True  # an existing output directory is removed and recreated
generate_seq_id_data(df_test,savedir,delete_dir)

Processing 004JQ:   0%|          | 0/18816 [00:00<?, ?it/s]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_val_test_sequence_id/test


Processing ZZZ9J: 100%|██████████| 18816/18816 [25:24<00:00, 12.34it/s]

CPU times: user 25min 25s, sys: 6.78 s, total: 25min 32s
Wall time: 25min 25s





In [14]:
# Sorted list of the distinct sequence ids in the training frame.
# NOTE(review): this displays every id (39285 according to the progress bar of
# the training split); consider showing only a slice.
sorted(list(df_train.sequence_id.unique()))

['000MB',
 '000TM',
 '001KI',
 '001S6',
 '001TE',
 '002AS',
 '006DL',
 '007BG',
 '007EK',
 '007PU',
 '00994',
 '00D36',
 '00D6X',
 '00FTV',
 '00GLM',
 '00IF1',
 '00LG8',
 '00NA2',
 '00OM4',
 '00QQ7',
 '00R6B',
 '00S3F',
 '00SOT',
 '00UTV',
 '00VCD',
 '00WT3',
 '00YDN',
 '00ZHA',
 '010GG',
 '010ZC',
 '0113E',
 '012M3',
 '012XI',
 '013DI',
 '015O8',
 '017F7',
 '017GO',
 '01852',
 '019HJ',
 '019QL',
 '01B3K',
 '01DLH',
 '01DOU',
 '01E6B',
 '01E7V',
 '01HCN',
 '01HDI',
 '01IN1',
 '01JVK',
 '01ML5',
 '01ML8',
 '01MVS',
 '01N25',
 '01NB4',
 '01NSF',
 '01POP',
 '01S14',
 '01SBC',
 '01UPZ',
 '01V48',
 '01VPE',
 '01W5I',
 '01WHM',
 '01X5Q',
 '020U4',
 '02395',
 '023CP',
 '0242O',
 '0269E',
 '027IS',
 '0291L',
 '029EU',
 '029K3',
 '02AR0',
 '02DCT',
 '02H4E',
 '02IHJ',
 '02JJX',
 '02JO3',
 '02L1Z',
 '02L8I',
 '02LCA',
 '02M81',
 '02QD7',
 '02QOK',
 '02R61',
 '02TFA',
 '02TFT',
 '02TQP',
 '02UT2',
 '02UWZ',
 '02W4D',
 '02WEY',
 '02XUT',
 '02ZSC',
 '02ZVA',
 '0320K',
 '032PG',
 '033A3',
 '036AT',


In [15]:
# Show all training rows that belong to one hand-picked sequence id.
mask = df_train.sequence_id == "096H3"
df_train.loc[mask,:]

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
17825,096H3,35MKXPL0,ATGTATTTAGAAAAATAAACAAATAGGGGTTC,32,1.483346,-0.330815,-0.406804,-0.75914,0.201112,0.090069,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17825,096H3,35MKXPL0,TGGGTGGTTTTTGTCACGCGCTATCAGCTCTT,32,-0.026886,0.504211,0.767913,1.106203,-0.491358,0.101932,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17825,096H3,35MKXPL0,TTCGCCATTCAGGCTGCGCAACTGTTGGGAAG,32,-0.330568,0.34875,0.210691,0.200833,0.17389,0.176427,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17825,096H3,35MKXPL0,GGGCTCCCTTTAGGGTTCCGATTTAGTGCTTT,32,-0.088235,-0.173581,-0.288945,1.119342,-0.72404,0.106456,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17825,096H3,35MKXPL0,CCCCGTTGACTGCCTCTTCGCTGTACAGTTCT,32,-0.525726,0.150666,-0.720758,1.260578,-0.207384,-0.917216,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17825,096H3,35MKXPL0,TAGTACGCAGCTTCTTCTAGTTCAATTACACC,32,0.373995,-0.857661,-0.924258,0.723566,1.032072,0.599299,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17825,096H3,35MKXPL0,ACGCTTTTCCCAAACTAGTGTGTTTCAAGAAA,32,0.689562,-0.396472,0.018428,0.304578,0.144895,-0.105203,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17825,096H3,35MKXPL0,CTAACGAATTCGACGAACTGGAAATCCAGGGC,32,0.096901,-0.946846,0.759136,-0.678109,0.685801,-0.073971,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# NOTE(review): `aaaa` is not defined anywhere in the visible notebook, so
# running this cell raises NameError - presumably a deliberate stop that keeps
# "Run All" from continuing past this point; confirm or remove.
aaaa

#### String patterns

In [5]:
%%time
# Loads the JSON file of length-32 string patterns into `string_patterns`.
### directory of string patterns
string_patterns_dir = os.path.join(parent_dir,"string_patterns")

### string patterns jsons
string_patterns_32 = os.path.join(string_patterns_dir,"string_patterns_32.json")

### choose string pattern
string_patterns_file = string_patterns_32
### loading string pattern
with open(string_patterns_file) as json_file:
    string_patterns = json.load(json_file)
print("Length of string_patterns_32: ", len(string_patterns))


Length of string_patterns_32:  1801833
CPU times: user 883 ms, sys: 279 ms, total: 1.16 s
Wall time: 1.82 s


In [6]:
# NOTE(review): this displays the whole dictionary (1801833 entries according
# to the previous cell's output); consider showing only a few items.
string_patterns

{'GACGGATCGGGAGATCTCCCGATCCCCTATGG': 1442,
 'TGCACTCTCAGTACAATCTGCTCTGATGCCGC': 1010,
 'ATAGTTAAGCCAGTATCTGCTCCCTGCTTGTG': 1423,
 'TGTTGGAGGTCGCTGAGTAGTGCGCGAGCAAA': 1424,
 'ATTTAAGCTACAACAAGGCAAGGCTTGACCGA': 1426,
 'CAATTGCATGAAGAATCTGCTTAGGGTTAGGC': 1410,
 'GTTTTGCGCTGCTTCGCGATGTACGGGCCAGA': 1399,
 'TATACGCGTTGACATTGATTATTGACTAGTTA': 1369,
 'TTAATAGTAATCAATTACGGGGTCATTAGTTC': 1489,
 'ATAGCCCATATATGGAGTTCCGCGTTACATAA': 1480,
 'CTTACGGTAAATGGCCCGCCTGGCTGACCGCC': 1505,
 'CAACGACCCCCGCCCATTGACGTCAATAATGA': 1496,
 'CGTATGTTCCCATAGTAACGCCAATAGGGACT': 1496,
 'TTCCATTGACGTCAATGGGTGGAGTATTTACG': 1080,
 'GTAAACTGCCCACTTGGCAGTACATCAAGTGT': 1583,
 'ATCATATGCCAAGTACGCCCCCTATTGACGTC': 1567,
 'AATGACGGTAAATGGCCCGCCTGGCATTATGC': 1486,
 'CCAGTACATGACCTTATGGGACTTTCCTACTT': 1572,
 'GGCAGTACATCTACGTATTAGTCATCGCTATT': 1576,
 'ACCATGGTGATGCGGTTTTGGCAGTACATCAA': 1456,
 'TGGGCGTGGATAGCGGTTTGACTCACGGGGAT': 1465,
 'TTCCAAGTCTCCACCCCATTGACGTCAATGGG': 1490,
 'AGTTTGTTTTGGCACCAAAATCAACGGGACTT': 1444,
 'TCCAAAATG

#### Fitted pca

In [7]:
%%time
# Loads the previously fitted PCA object from disk into `pca_32`.
###  directory with fitted pcas
pca_dir = os.path.join(parent_dir,"pca")

### pca_32_95comp
pca_32_95comp = os.path.join(pca_dir,"pca_32_95comp.joblib")

### loading
# NOTE(review): joblib files are pickle-based, so only load files from a
# trusted source.
pca_32 = joblib.load(pca_32_95comp)

CPU times: user 5.55 ms, sys: 264 µs, total: 5.82 ms
Wall time: 188 ms


## Generating snippets

We now code a function to generate snippets. The snippets are sequences of uniform length and are generated from the original sequences.

In [8]:
def sample_window(array_of_lengths, l):
    """Draw one random window of width `l` inside each sequence.

    Parameters
    ----------
    array_of_lengths : array-like of int
        Length of each sequence.
    l : int
        Width of the window to draw.

    Returns
    -------
    numpy.ndarray
        Integer array of shape (len(array_of_lengths), 2). Row i holds
        (start, stop) with stop - start == l and 0 <= start <= length_i - l,
        i.e. the slice `seq[start:stop]` lies fully inside sequence i.

    Raises
    ------
    ValueError
        If any length is smaller than `l`.
    """
    lengths = np.asarray(array_of_lengths, dtype=int)
    if lengths.size == 0:
        return np.empty((0, 2), dtype=int)
    if np.any(lengths < l):
        raise ValueError(f"every length must be at least the window width {l}")
    # randint excludes its upper bound, hence the +1: without it the last valid
    # start (length - l) could never be drawn and length == l would fail.
    sampled_lower_bounds = np.random.randint(0, lengths - l + 1)
    sampled_upper_bounds = sampled_lower_bounds + l
    sampled_windows = np.stack([sampled_lower_bounds, sampled_upper_bounds], axis=1)
    return sampled_windows
    
def generate_snippets(df, l, min_seq_length=None, size=None, replace = True, p="seq_length"):
    """Sample rows of `df` and cut a random snippet of length `l` from each.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `sequence` and a numeric column
        `seq_length`.
    l : int
        Length of the snippets.
    min_seq_length : int, optional
        Only rows with `seq_length >= min_seq_length` can be sampled.
        Defaults to `l + 1`.
    size : int, optional
        Number of rows to sample. Defaults to the number of eligible rows.
    replace : bool, default True
        Whether rows are sampled with replacement. Forced to True when
        `p == "seq_length"`.
    p : "seq_length", array-like or None, default "seq_length"
        Sampling probabilities of the eligible rows. The string "seq_length"
        means "proportional to the sequence length"; None means uniform.

    Returns
    -------
    pandas.DataFrame
        One row per sampled snippet. `sequence` holds the snippet and
        `seq_length` its length; the other columns and the index labels are
        taken from the sampled rows.
    """
    if min_seq_length is None:
        min_seq_length = l+1
    eligible = df[df.seq_length>=min_seq_length]
    n_eligible = len(eligible)
    if size is None:
        size = n_eligible
    # The string has to be resolved for every `size`; it used to reach
    # np.random.choice unchanged when size was None. The isinstance check keeps
    # an array-valued `p` from being compared element-wise with the string.
    if isinstance(p, str) and p == "seq_length":
        seq_lengths = eligible.seq_length.values
        p = seq_lengths/np.sum(seq_lengths)
        replace = True
    # Sample row positions rather than index labels: with repeated labels
    # `.loc[labels]` would return every row carrying a sampled label. For a
    # unique index this selects exactly the same rows as sampling the labels.
    sampled_positions = np.random.choice(n_eligible, size, replace, p)
    df_sampled = eligible.iloc[sampled_positions].copy()
    array_of_lengths = df_sampled.seq_length.values
    sampled_windows = sample_window(array_of_lengths, l)
    df_sampled.loc[:,"sequence"] = [seq[start:stop] for seq,(start,stop) in zip(df_sampled.sequence.values, sampled_windows)]
    df_sampled.loc[:,"seq_length"] = [len(seq) for seq in df_sampled.sequence.values]
    return df_sampled

def generate_lab_snippets(df, l, min_seq_length=None, size=None, replace = True, p="seq_length",test=False):
    """Generate snippets separately for every lab (or every sequence).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by `generate_snippets` that also has a `lab_id` column
        (used when `test` is False) or a `sequence_id` column (used when
        `test` is True).
    l, min_seq_length, size, replace, p
        Forwarded unchanged to `generate_snippets` for every group.
    test : bool, default False
        False groups the rows by `lab_id`, True groups them by `sequence_id`.

    Returns
    -------
    pandas.DataFrame
        The snippets of all groups concatenated in ascending order of the
        group key. An empty frame with the columns of `df` is returned when
        there is no group at all.

    Notes
    -----
    Rows whose group key is missing are not visited (pandas `groupby` leaves
    out missing keys by default).
    """
    if test:
        group_col, label = "sequence_id", "sequence"
    else:
        group_col, label = "lab_id", "lab"
    # One groupby pass instead of masking the whole frame for every key;
    # sort=True keeps the ascending visiting order of the keys.
    grouped = df.groupby(group_col, sort=True)
    pbar = tqdm(grouped, total=grouped.ngroups)
    concat = []
    for key, df_group in pbar:
        pbar.set_description(f"Processing {label} {key}")
        df_snippets = generate_snippets(df_group, l, min_seq_length, size, replace, p)
        concat.append(df_snippets)
    if not concat:
        # pd.concat raises on an empty list.
        return df.iloc[0:0].copy()
    df_samples = pd.concat(concat)
    return df_samples
    

#### Generating training snippets 

In [9]:
%%time
# Draws `size` length-`l` snippets for every lab in df_train.
np.random.seed(6739)  # fixed seed so the sampling can be repeated
l= 32  # snippet length
min_seq_length = None  # None lets generate_snippets use its default (l + 1)
size = 1000  # snippets drawn per lab
replace = True
p="seq_length"  # sample rows proportionally to their sequence length
test=False  # False: group by lab_id
train_snippets = generate_lab_snippets(df_train,l,min_seq_length,size,replace,p,test)
print("Shape of train_snippets: ", train_snippets.shape)
train_snippets.sample(20)

Processing lab ZZJVE4HO: 100%|██████████| 1314/1314 [00:10<00:00, 121.11it/s]


Shape of train_snippets:  (1314000, 43)
CPU times: user 11.3 s, sys: 402 ms, total: 11.7 s
Wall time: 11.2 s


Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
53832,J7M64,7QF2VB5B,TTAGAGTGGAGGTTTGACAGCCGCCTAGCATT,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5842,NM5ZW,PRU3JF6Y,GTTCTGTAAAAATGCAGCTCAGATTCTTTGTT,32,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18626,RU6DS,5H71LUBY,GCTCACCCAGAAACGCTGGTGAAAGTAAAAGA,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
46928,05D1L,MFZHQ165,AAACAGAATTTGCCTGGCGGCAGTAGCGCGGT,32,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3049,38QN0,GDV3S3ZG,GTTCTTCTGTTACGGTAAATGTCGTAGGATAT,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2611,UVOEL,5Q9ETXJL,TATCAGTGATAGAGATCGTCGACGAGCTCGTT,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6068,MYV0B,WM9JWC4B,TCTCTGGTATCTGAGAAATATACTCTCTGAAG,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
21044,R3BOZ,XCWSW5T9,GACGGTACCGCGGGCCCGGGATCCACCGGATC,32,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
24600,S4UYC,QV09SDY8,TTGCTGGTGACCCAATGCGACCAGATGCTCCA,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30288,3WOJQ,E6G69ESA,TTTCTAGGGTTAAAGAGCTCCAATTCGCCCTA,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Sanity check: the distinct snippet lengths found in the training snippets.
train_snippets.seq_length.unique()

array([32])

#### Generating validation snippets 

In [11]:
%%time
# Draws `size` length-`l` snippets for every lab in df_val.
np.random.seed(8135)  # fixed seed so the sampling can be repeated
l=32  # snippet length
min_seq_length = None  # None lets generate_snippets use its default (l + 1)
size = 1000  # snippets drawn per lab
replace = True
p="seq_length"  # sample rows proportionally to their sequence length
test = False  # False: group by lab_id
val_snippets = generate_lab_snippets(df_val,l,min_seq_length,size,replace,p,test)
print("Shape of val_snippets: ", val_snippets.shape)
val_snippets.sample(10)

Processing lab ZZJVE4HO: 100%|██████████| 1175/1175 [00:07<00:00, 156.11it/s]


Shape of val_snippets:  (1175000, 43)
CPU times: user 7.98 s, sys: 336 ms, total: 8.31 s
Wall time: 7.78 s


Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
3866,TAV6B,Q3O4J4HB,CAACGCGCGGGGAGAGGCGGTTTGCGTATTGG,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5199,S31Z0,IMVSI4VW,ACAGAGCCCCAGTTCCAGCCTGGAGAGAACCT,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
11795,4XZ1F,384ASNLB,CAGCTACAGCACCATCTAGATGATCTCTTAAG,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
34183,6F2B8,EMJXDINV,TATGTGGACTACAGACTGGAAAGAATCAAGGA,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
59113,NP99V,A44GW57T,GACATCGTCGACTACTTCAAGAACTCCTGCCC,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
16447,47TEE,TJLVHJ87,ATAACTACGATACGGGAGGGCTTACCATCTGG,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
23364,S66EU,U5966IDO,AAAATGCCGCAAAAAAGGGAATAAGGGCGACA,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23741,348OY,UBXL2EGE,ACCTTTTTCAACTGAAAAATTGGGAGAAAAAG,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
57562,DOCRS,L76WWQ74,TCTAAAGTATATATGAGTAAACTTGGTCTGAC,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59114,4NNAU,A44GW57T,CGGGCTGAACGGGGGGTTCGTGCACACAGCCC,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [12]:
# Sanity check: the distinct snippet lengths found in the validation snippets.
val_snippets.seq_length.unique()

array([32])

#### Generating test snippets

To generate test snippets, we first "augment" the sequences that are not long enough.

In [13]:
def augment_sequence_by_repetition(df,min_seq_length):
    """Lengthen too-short sequences by repeating them.

    Every row whose `seq_length` is below `min_seq_length` gets its
    `sequence` repeated the smallest whole number of times that makes it at
    least `min_seq_length` characters long; `seq_length` is updated to match.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column `sequence` and a numeric column
        `seq_length`. It is not modified.
    min_seq_length : int
        Minimum length every sequence should reach.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with the short sequences lengthened. Empty sequences
        cannot be lengthened by repetition and are left as they are.
    """
    df = df.copy()
    # The mask is built from the frame passed in, not from a notebook global.
    locs = df.seq_length<min_seq_length
    if not locs.any():
        return df
    repeated = []
    for seq in df.loc[locs,"sequence"]:
        if len(seq) == 0:
            # Repeating an empty string never makes it longer.
            repeated.append(seq)
        else:
            factor = int(np.ceil(min_seq_length/len(seq)))
            repeated.append(seq*factor)
    df.loc[locs,"sequence"] = repeated
    ### updating lengths of sequences
    df.loc[locs,"seq_length"] = [len(seq) for seq in repeated]
    return df

In [14]:
%%time
# Lengthens the test sequences shorter than 32 characters by repetition.
min_seq_length = 32
df_test_augmented = augment_sequence_by_repetition(df_test,min_seq_length)

CPU times: user 5.3 ms, sys: 82 µs, total: 5.39 ms
Wall time: 4.73 ms


#### Checking augmentation

In [15]:
# Collect, by row position, every augmented test row that is still shorter than
# 32 characters or whose recorded seq_length differs from the real length.
# An empty dictionary means the augmentation is consistent.
dci = {
    ix: (seq, seq_len)
    for ix, (seq, seq_len) in enumerate(
        zip(df_test_augmented.loc[:,"sequence"].values, df_test_augmented.loc[:,"seq_length"].values)
    )
    if len(seq)<32 or len(seq)!=seq_len
}
dci

{}

In [16]:
%%time
# Draws `size` length-`l` snippets for every sequence_id of the augmented test set.
np.random.seed(9278)  # fixed seed so the sampling can be repeated
l=32  # snippet length
# NOTE(review): None makes generate_snippets use l + 1 = 33 as threshold, so a
# row whose (augmented) length is exactly 32 is not eligible for sampling -
# confirm that this is intended.
min_seq_length = None
size = 100  # snippets drawn per sequence
replace = True
p="seq_length"  # sample rows proportionally to their sequence length
test = True  # True: group by sequence_id instead of lab_id
test_snippets = generate_lab_snippets(df_test_augmented,l,min_seq_length,size,replace,p,test)
print("Shape of test_snippets: ", test_snippets.shape)
test_snippets.sample(10)

Processing sequence ZZZ9J: 100%|██████████| 18816/18816 [01:02<00:00, 301.45it/s]


Shape of test_snippets:  (1881600, 43)
CPU times: user 1min 12s, sys: 3.55 s, total: 1min 16s
Wall time: 1min 9s


Unnamed: 0,sequence_id,lab_id,sequence,seq_length,bacterial_resistance_ampicillin,bacterial_resistance_chloramphenicol,bacterial_resistance_kanamycin,bacterial_resistance_other,bacterial_resistance_spectinomycin,copy_number_high_copy,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
14673,3T2CT,,AAGTCCGGACTCAGATCTCGAGCTCAAGCTTC,32,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8617,ZXPLV,,AGTCGCAATCGCGAACAATAATGGGGGAAAGC,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14040,QXGFB,,CTCGTGACCACCCTGACCTACGGCGTGCAGTG,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10301,NZ22Z,,CATCTGGCCCCAGTGCTGCAATGATACCGCGA,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11221,MP1PF,,TTGGAACGCGGATTCCCCGTGCCAAGAGTGAC,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2779,7V97R,,TACTCAACCAAGTCATTCTGAGAATAGTGTAT,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14786,O1Y51,,CTTGAAGGGCGACGTGAGCATGTACCTGCTGC,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7596,1NYX8,,AACCCACTCGTGCACCCAACTGATCTTCAGCA,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
10736,W6XDW,,AGGAAAGTCCCATAAGGTCATGTACTGGGCAT,32,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
6206,E6LV8,,ACAGGTTTGCGTTTTTCGCTGGACCTCGGCTA,32,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Sanity check: the distinct snippet lengths found in the test snippets.
test_snippets.seq_length.unique()

array([32])

## Mapping string patterns to vectors

In [18]:
def str2vec(sequence,keywords={"A":[1,0,0,0,0], "T":[0,1,0,0,0], "G":[0,0,1,0,0], "C":[0,0,0,1,0], "N":[0,0,0,0,1]}):
    """Encode a sequence as the concatenation of its per-symbol codes.

    Parameters
    ----------
    sequence : iterable of str
        Symbols to encode, normally a string over the keys of `keywords`.
    keywords : dict
        Maps every symbol to its code (a list of numbers). The default is a
        one-hot code over A, T, G, C, N. The dictionary is only read.

    Returns
    -------
    numpy.ndarray
        1-D array holding the codes of all symbols, in order.

    Raises
    ------
    KeyError
        If a symbol of `sequence` is not a key of `keywords`.
    """
    flat_code = [bit for symbol in sequence for bit in keywords[symbol]]
    return np.array(flat_code)

def str2mat(sequences,keywords={"A":[1,0,0,0,0], "T":[0,1,0,0,0], "G":[0,0,1,0,0], "C":[0,0,0,1,0], "N":[0,0,0,0,1]}):
    """Encode several sequences with `str2vec`, one row per sequence.

    Parameters
    ----------
    sequences : iterable of str
        Sequences to encode; a progress bar is shown while iterating.
    keywords : dict
        Symbol-to-code table handed to `str2vec` for every sequence. The
        default is a one-hot code over A, T, G, C, N.

    Returns
    -------
    numpy.ndarray
        Array built from the encoded sequences; it is two-dimensional when all
        sequences have the same length.
    """
    # Forward the caller's table; it used to be ignored, so str2vec always
    # fell back to its own default.
    mat = np.array([str2vec(seq, keywords) for seq in tqdm(sequences)])
    return mat

## Generating PCA engineered data sets

In [19]:
# Position of the "sequence" column among the columns of the training snippets.
train_snippets.columns.get_loc("sequence")

2

In [20]:
def add_pca_features(df,pca,first_data_col="bacterial_resistance_ampicillin"):
    """Insert PCA features of the encoded sequences into `df`.

    The `sequence` column is encoded with `str2mat`, projected with
    `pca.transform`, and the resulting columns `pca_0`, `pca_1`, ... are
    placed between the columns that come before `first_data_col` and the
    columns from `first_data_col` onwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `sequence` column and a column named `first_data_col`.
    pca : object
        Fitted transformer with a `transform(matrix)` method that returns a
        2-D array with one row per input row.
    first_data_col : str, default "bacterial_resistance_ampicillin"
        Name of the first column that has to come after the PCA columns.
        Assumed to occur exactly once among the columns of `df`.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as `df` and the PCA columns inserted.
    """
    mat = pca.transform(str2mat(df.sequence.values))
    split_at = df.columns.get_loc(first_data_col)
    pca_cols = [f"pca_{ix}" for ix in range(mat.shape[1])]
    df_meta = df.iloc[:, :split_at]
    # Reuse the index of `df` so the three parts line up row by row.
    df_pca = pd.DataFrame(mat,columns=pca_cols,index=df.index)
    df_data = df.iloc[:, split_at:]
    return pd.concat([df_meta,df_pca,df_data],axis=1)

#### Training set

In [21]:
%%time
# Adds the PCA feature columns to the training snippets.
df_train_pca = add_pca_features(train_snippets,pca_32)
print(df_train_pca.shape)

100%|██████████| 1314000/1314000 [00:14<00:00, 89113.72it/s]


(1314000, 138)
CPU times: user 17.8 s, sys: 639 ms, total: 18.4 s
Wall time: 17.8 s


#### Validation set 

In [22]:
%%time
# Adds the PCA feature columns to the validation snippets.
df_val_pca = add_pca_features(val_snippets,pca_32)
print(df_val_pca.shape)

100%|██████████| 1175000/1175000 [00:12<00:00, 91093.29it/s]


(1175000, 138)
CPU times: user 15.4 s, sys: 390 ms, total: 15.8 s
Wall time: 15.2 s


#### Test set

In [23]:
%%time
# Adds the PCA feature columns to the test snippets.
df_test_pca = add_pca_features(test_snippets,pca_32)
print(df_test_pca.shape)

100%|██████████| 1881600/1881600 [00:21<00:00, 89174.29it/s]


(1881600, 138)
CPU times: user 24.7 s, sys: 809 ms, total: 25.5 s
Wall time: 24.6 s


## Saving datasets with PCA engineered features

In [24]:
def save_datasets(savedir,dfs_dict,overwrite=True):
    """Save several data frames as CSV files in one directory.

    Parameters
    ----------
    savedir : str
        Output directory. Missing parent directories are created.
    dfs_dict : dict
        Maps a file stem to a pandas.DataFrame; every frame is written to
        `<savedir>/<stem>.csv` together with its index.
    overwrite : bool, default True
        Only matters when `savedir` already exists: True removes the
        directory with all its contents before writing, False leaves it
        untouched and writes nothing.

    Returns
    -------
    None
    """
    if not os.path.isdir(savedir):
        print(f"Creating directory {savedir}")
        # makedirs also creates missing parent directories.
        os.makedirs(savedir)
    elif overwrite:
        print(f"Directory {savedir} already exists. Overwriting.")
        shutil.rmtree(savedir)
        os.makedirs(savedir)
    else:
        print(f"Directory {savedir} already exists. Skipping.")
        # Really skip: the files used to be written after this message anyway.
        return
    pbar = tqdm(dfs_dict.items(),total=len(dfs_dict))
    for name, df in pbar:
        savepath = os.path.join(savedir,f"{name}.csv")
        df.to_csv(savepath,index=True)
        

#### Saving 

In [25]:
%%time
# Saves the three PCA-augmented data sets as train.csv, val.csv and test.csv.
# NOTE(review): the directory name "pca_32_95comp" is spelled out here, whereas
# the loading cell near the top builds its path from pca_32_95comp_dir - make
# sure both refer to the intended data set.
savedir = os.path.join(pca_engineered_datasets_dir,"pca_32_95comp","train_val_test")
dfs_dict = {"train": df_train_pca, "val": df_val_pca, "test": df_test_pca}
overwrite=True  # an existing output directory is removed first
save_datasets(savedir,dfs_dict,overwrite)

  0%|          | 0/3 [00:00<?, ?it/s]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_val_test


100%|██████████| 3/3 [10:00<00:00, 200.07s/it]

CPU times: user 9min 51s, sys: 5.46 s, total: 9min 56s
Wall time: 10min



