# Generating training sets for multiclassifier approach

In this notebook we split the `lab_ids` into groups and generate training/val/test sets for each of these groups. We have already seen that there are `1314` unique lab ids. Our strategy here is to form `146` subsets of lab ids, each containing `9` lab ids (`146 × 9 = 1314`).

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict, defaultdict, Counter
import seaborn as sns
import json
#from sklearn.decomposition import PCA
#from joblib import dump, load
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score
import shutil

CPU times: user 533 ms, sys: 39.5 ms, total: 572 ms
Wall time: 571 ms


## Loading data

In [2]:
# Root folder of the project data. The default is the original local path;
# set the GEA_DATA_DIR environment variable to point somewhere else without
# editing the notebook (e.g. "/content/genetic_engineering_attribution" on Colab).
parent_dir = os.environ.get("GEA_DATA_DIR", "/home/rio/data_sets/genetic_engineering_attribution")

#### Data

In [3]:
%%time
### Directory layout below parent_dir
pca_dir = os.path.join(parent_dir, "pca")
pca_engineered_datasets_dir = os.path.join(parent_dir, "pca_engineered_datasets")
pca_32_95comp_dir = os.path.join(pca_engineered_datasets_dir, "pca_32_95comp")

### One csv per partition, all stored in pca_32_95comp_dir
train_path, val_path, test_path = [
    os.path.join(pca_32_95comp_dir, csv_name)
    for csv_name in ("train.csv", "val.csv", "test.csv")
]

### Read each csv, using its first column as the index
df_train, df_val, df_test = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_path, val_path, test_path)
]

### Report the shape of every frame, one per line
print(
    f"Shape of df_train: {df_train.shape}",
    f"Shape of df_val: {df_val.shape}",
    f"Shape of df_test: {df_test.shape}",
    sep="\n",
)

  mask |= (ar1 == a)


Shape of df_train: (1314000, 138)
Shape of df_val: (1175000, 138)
Shape of df_test: (1881600, 138)
CPU times: user 56.9 s, sys: 2.18 s, total: 59 s
Wall time: 59 s


In [4]:
# Show eight randomly chosen rows of the training frame as a sanity check
df_train.sample(n=8)

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
21170,1V42L,5H71LUBY,TGTCGGGTTTCGCCACCTCTGACTTGAGCGTC,32,-0.655459,-0.575996,-0.701846,0.865317,0.394443,-0.357696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8650,E4O93,IJEA3NUI,CGCTCCGGCGACGTCGCGCGCGGTGAGCACCG,32,-1.731101,0.675946,0.288456,-0.085549,-0.74327,0.606746,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5336,2O2BC,L78GOBQS,GGGCTGAACTTGCGCTCCATGGTGCTGTTGCT,32,-0.457642,0.275613,1.727689,0.708453,-0.365478,-0.432843,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24960,JQTDQ,9MEFUZQN,TCACGCGTGGTACCTCTAGAGTCGAGCGGGAT,32,-0.491189,-0.016736,0.112171,-0.025907,0.077988,0.085255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
967,AULMU,36XLYYGZ,GTTTGTTTGCCGGATCAAGAGCTACCAACTCT,32,0.090328,0.859326,-0.545769,0.480141,0.397847,-0.459812,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11102,WKEU5,Y81SHRRC,GCCGTTTCTTGTTCTTCCTCTGCTGTTTGCTC,32,-0.246316,-0.849993,-0.928291,2.089029,0.191597,1.500329,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
48167,Z1H23,ETR2SP13,CCACTGGCAGCAGCCACTGGTAACAGGATTAG,32,-0.343082,-0.378656,-0.320363,-0.402903,0.756916,0.654228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
42149,6XGH6,T5R7YFPH,GGGATGGGAAGCTTCAAGTGAAAGAATGTACC,32,0.315009,-0.28404,0.628746,-0.845094,-0.46587,0.094238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Generating train_splits dir

In [5]:
def generate_dir(directory,delete_dir=True):
    """Make sure `directory` exists, optionally emptying it first.

    Parameters
    ----------
    directory : path-like
        Directory to create. Missing parent directories are created as well.
    delete_dir : bool, default True
        Only matters when `directory` already exists: if truthy, the directory
        and everything in it is removed and an empty one is created in its
        place; otherwise the directory is left exactly as it is.

    Returns
    -------
    None
        A one-line status message is printed in every case.
    """
    already_there = os.path.isdir(directory)

    if already_there and not delete_dir:
        # Nothing to do on disk: existing content is kept as is.
        print(f"Directory {directory} already exists. I will either overwrite or add files to it.")
        return

    if already_there:
        print(f"Directory {directory} already exists. Deleting an recreating.")
        shutil.rmtree(directory)
    else:
        print(f"Creating directory {directory}")

    # Reached both for a brand-new directory and for one that was just wiped.
    os.makedirs(directory, exist_ok=True)

In [6]:
#train_splits_dir = os.path.join(os.path.join(pca_32_95comp_dir,"train_splits"))
#print("train_splits_dir: ", train_splits_dir)

In [7]:
#delete_dir = True
#generate_dir(train_splits_dir,delete_dir)

## Generating lab_id split data sets 

In [8]:
def generate_lab_id_datasets(df,n_lab_ids=9,batch_size=1000,impurity_ratio=3,savedir=None,delete_dir=True,verbose=False):
    """Randomly partition the lab ids of `df` into groups and write one CSV per group.

    The unique values of `df.lab_id` are shuffled with the global NumPy RNG
    (call `np.random.seed` beforehand to fix the shuffle) and cut into groups
    of `n_lab_ids`. For every group one CSV is written into `savedir`. It holds

    * the rows of `df` whose lab_id is in the group: all of them when
      `batch_size` is None, otherwise `batch_size` rows sampled without
      replacement from each lab of the group, followed by
    * when `impurity_ratio` is not None, `int(impurity_ratio*batch_size)`
      rows sampled with replacement from the rows whose lab_id is NOT in the
      group. These rows keep their own lab_id values.

    The file is named after the lab ids of the group joined by "_", and the
    index of `df` is written as the first CSV column.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must have a `lab_id` column.
    n_lab_ids : int, default 9
        Number of lab ids per group. Must divide the number of unique lab ids
        in `df`.
    batch_size : int or None, default 1000
        Rows to sample per lab, or None to keep every row. pandas raises a
        ValueError if a lab has fewer than `batch_size` rows.
    impurity_ratio : number or None, default 3
        Multiple of `batch_size` giving the number of out-of-group rows to
        append, or None to append none. Requires `batch_size`.
    savedir : path-like
        Output directory, prepared with `generate_dir(savedir, delete_dir)`.
    delete_dir : bool, default True
        Forwarded to `generate_dir`.
    verbose : bool, default False
        Show a tqdm progress bar over the groups.

    Raises
    ------
    ValueError
        If `savedir` is None, or if `impurity_ratio` is given while
        `batch_size` is None.
    AssertionError
        If `n_lab_ids` does not divide the number of unique lab ids in `df`.
    """
    # Validate every argument BEFORE touching the file system, so that a bad
    # call can never wipe an existing `savedir`.
    if savedir is None:
        raise ValueError("savedir must be a directory path, got None")
    if impurity_ratio is not None and batch_size is None:
        raise ValueError("impurity_ratio needs batch_size, because impurity_ratio*batch_size impurity rows are drawn")
    # Count the lab ids of the frame that was passed in (not of a notebook global).
    unique_lab_ids = np.unique(df.lab_id)
    n_unique = len(unique_lab_ids)
    assert n_unique%n_lab_ids ==0, f"n_lab_ids must be a divisor of the number of unique lab_ids, {n_unique}"

    generate_dir(savedir,delete_dir)

    # One row of this 2-D array per group of lab ids.
    shuffled_lab_ids = np.random.permutation(unique_lab_ids).reshape(-1,n_lab_ids)
    pbar = tqdm(shuffled_lab_ids) if verbose else shuffled_lab_ids
    for batch in pbar:
        batch_names = [str(lab_id) for lab_id in batch]
        if verbose:
            pbar.set_description("Processing batch ["+",".join(batch_names)+"]\r")
        mask = df.lab_id.isin(batch)
        df_batch = df.loc[mask,:]
        if batch_size is not None:
            # Sample each lab separately. Iterating over the groups keeps the
            # original index and the lab_id column without relying on how
            # groupby.apply assembles its result.
            df_batch = pd.concat([lab_rows.sample(n=batch_size) for _, lab_rows in df_batch.groupby('lab_id')])
        ### extracting random sample from out-of-batch classes
        if impurity_ratio is not None:
            n_impurity = int(impurity_ratio*batch_size)
            df_impurity = df.loc[~mask,:].sample(n_impurity,replace=True)
            df_batch = pd.concat([df_batch,df_impurity])
        ### saving df_batch
        filename = "_".join(batch_names)+".csv"
        savepath = os.path.join(savedir,filename)
        if verbose:
            pbar.set_description(f"Writing file {savepath} to disk")
        df_batch.to_csv(savepath,index=True)
    
def generate_splits(df,n_splits,n_lab_ids=9,batch_size=1000,impurity_ratio=3,savedir=None,delete_dir=True,verbose=False):
    """Write `n_splits` independent random lab-id partitions of `df` to disk.

    For s = 0 .. n_splits-1, `generate_lab_id_datasets` is called with the
    output directory `<savedir>/split_<s>/data`, where `<s>` is zero-padded to
    the number of digits of `n_splits-1`. A tqdm progress bar over the splits
    is always shown.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame, forwarded to `generate_lab_id_datasets`.
    n_splits : int
        Number of partitions to generate.
    n_lab_ids, batch_size, impurity_ratio, delete_dir, verbose
        Forwarded unchanged to `generate_lab_id_datasets`.
    savedir : path-like
        Parent directory under which the `split_<s>/data` folders are created.

    Raises
    ------
    ValueError
        If `savedir` is None.
    """
    if savedir is None:
        raise ValueError("savedir must be a directory path, got None")
    # Exact digit count of the largest index. Same widths as ceil(log10(n_splits))
    # for n_splits >= 1, but without floating point and without log10(0).
    width = len(str(max(int(n_splits)-1,0)))
    for s in tqdm(range(n_splits)):
        split_dir = os.path.join(savedir,"split_"+str(s).zfill(width),"data")
        generate_lab_id_datasets(df,n_lab_ids,batch_size,impurity_ratio,split_dir,delete_dir,verbose)
        
            
    

#### Generating lab_id splits

In [9]:
# Parent folder for the generated splits, located inside pca_32_95comp_dir
train_splits_dir = os.path.join(pca_32_95comp_dir, "train_splits")
train_splits_dir

'/home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits'

In [None]:
%%time
# Seed the global NumPy RNG before any shuffling/sampling takes place
np.random.seed(1873)

# Settings for this run
savedir = train_splits_dir
n_splits, n_lab_ids = 100, 9
batch_size, impurity_ratio = 1000, 3
delete_dir, verbose = True, False

generate_splits(
    df_train,
    n_splits,
    n_lab_ids=n_lab_ids,
    batch_size=batch_size,
    impurity_ratio=impurity_ratio,
    savedir=savedir,
    delete_dir=delete_dir,
    verbose=verbose,
)

  0%|          | 0/100 [00:00<?, ?it/s]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_00/data


  1%|          | 1/100 [04:07<6:48:17, 247.45s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_01/data


  2%|▏         | 2/100 [08:14<6:43:52, 247.27s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_02/data


  3%|▎         | 3/100 [12:24<6:41:22, 248.27s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_03/data


  4%|▍         | 4/100 [16:38<6:39:33, 249.72s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_04/data


  5%|▌         | 5/100 [20:43<6:33:33, 248.56s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_05/data


  6%|▌         | 6/100 [24:49<6:28:14, 247.81s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_06/data


  7%|▋         | 7/100 [28:56<6:23:18, 247.29s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_07/data


  8%|▊         | 8/100 [33:05<6:19:58, 247.81s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_08/data


  9%|▉         | 9/100 [37:24<6:21:08, 251.31s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_09/data


 10%|█         | 10/100 [41:40<6:18:58, 252.65s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_10/data


 11%|█         | 11/100 [45:50<6:13:38, 251.89s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_11/data


 12%|█▏        | 12/100 [49:57<6:07:32, 250.59s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_12/data


 13%|█▎        | 13/100 [54:04<6:01:43, 249.46s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_13/data


 14%|█▍        | 14/100 [58:11<5:56:13, 248.53s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_14/data


 15%|█▌        | 15/100 [1:02:17<5:51:06, 247.84s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_15/data


 16%|█▌        | 16/100 [1:06:23<5:46:15, 247.33s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_16/data


 17%|█▋        | 17/100 [1:10:29<5:41:42, 247.02s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_17/data


 18%|█▊        | 18/100 [1:14:35<5:37:13, 246.75s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_18/data


 19%|█▉        | 19/100 [1:18:41<5:32:48, 246.52s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_19/data


 20%|██        | 20/100 [1:22:47<5:28:29, 246.36s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_20/data


 21%|██        | 21/100 [1:26:53<5:24:11, 246.22s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_21/data


 22%|██▏       | 22/100 [1:30:59<5:19:57, 246.13s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_22/data


 23%|██▎       | 23/100 [1:35:07<5:16:23, 246.54s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_23/data


## Now try a simple random forest!