# Generating training sets for multiclassifier approach

In this notebook we split the `lab_ids` into groups and generate train/val/test sets for each of these groups. We have already seen that there are `1314` unique lab ids. Our strategy here will be to separate them into `9` subsets of lab ids, each containing `146` lab_ids.

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict, defaultdict, Counter
import seaborn as sns
import json
#from sklearn.decomposition import PCA
#from joblib import dump, load
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score
import shutil

CPU times: user 533 ms, sys: 39.8 ms, total: 573 ms
Wall time: 572 ms


## Loading data

In [2]:
# Root folder of the project data; the dataset paths built in the following cells hang off it.
# Alternative root kept for reference (presumably for a Google Colab session -- TODO confirm):
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

#### Data

In [3]:
%%time
### pca directory
pca_dir = os.path.join(parent_dir,"pca")

### pca engineered data sets
pca_engineered_datasets_dir = os.path.join(parent_dir,"pca_engineered_datasets")

### pca_16_48comp dir and the folder holding its train/val/test csvs
pca_16_48comp_dir = os.path.join(pca_engineered_datasets_dir,"pca_16_48comp")
train_val_test_dir = os.path.join(pca_16_48comp_dir,"train_val_test")

### paths to csvs
full_train_path = os.path.join(train_val_test_dir,"full_train.csv")
### NOTE: the disabled train/val lines below reference `pca_32_95comp_dir`, which is not
### defined anywhere in this notebook; point them at `train_val_test_dir` before re-enabling.
#train_path = os.path.join(pca_32_95comp_dir,"train.csv")
#val_path = os.path.join(pca_32_95comp_dir,"val.csv")
test_path = os.path.join(train_val_test_dir,"test.csv")

### loading dataframes (the first csv column is used as the DataFrame index)
df_full_train = pd.read_csv(full_train_path,index_col=0)
#df_train = pd.read_csv(train_path,index_col=0)
#df_val = pd.read_csv(val_path,index_col=0)
df_test = pd.read_csv(test_path,index_col=0)

### Printing shapes:
print(f"Shape of df_full_train: {df_full_train.shape}")
#print(f"Shape of df_train: {df_train.shape}")
#print(f"Shape of df_val: {df_val.shape}")
print(f"Shape of df_test: {df_test.shape}")

  mask |= (ar1 == a)


Shape of df_full_train: (1971000, 91)
Shape of df_test: (1881600, 91)
CPU times: user 29.7 s, sys: 1.49 s, total: 31.1 s
Wall time: 31.1 s


In [4]:
# Peek at 8 random rows of the full training frame. No random_state is passed and the
# notebook has not seeded NumPy yet at this point, so the rows shown differ between runs.
df_full_train.sample(8)

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
34884,9ER34,A2A1R52R,AGATTCAGGTTACAAT,16,0.963592,-0.464117,0.052355,-0.013126,0.209852,0.419804,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
30686,QCVF4,2FCX4O0X,CAGCCAGCCAGACGCA,16,-0.522922,-0.131776,0.173052,-0.802706,0.281142,-0.164166,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23030,T9LTO,EKHYS325,ACGAGCAGCGCTTTGC,16,-0.242151,-1.000646,-0.154113,-0.32027,-1.097626,-0.341326,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2404,ETXJT,Q2K8NHZY,TTGACGAGTTCTTCTG,16,0.166205,-0.564415,0.541259,0.673667,-1.054254,-0.281294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1788,QWUAU,JPO7CTQP,TGCCAACCTGCTCATT,16,-0.397965,0.592384,0.558828,0.670752,-0.561487,0.106409,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
34182,FD1GR,EMJXDINV,CCGATTATGCCAGCCT,16,-0.295273,-0.158168,-0.113019,0.241383,0.223386,0.329199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
23148,XRZMO,ICRBJL24,ACGGATGGTGATCCCC,16,-0.742338,0.800034,0.26302,0.116042,-0.457552,-0.036876,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14899,U2VKI,1VPOX8VI,GGGTGCCACCAGAGGA,16,-0.357624,-0.543644,-0.949499,-0.871873,0.608746,0.189836,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Generating train_splits dir

In [5]:
def generate_dir(directory,delete_dir=True):
    """Make sure ``directory`` exists, optionally starting from an empty one.

    Parameters
    ----------
    directory : str or path-like
        Directory to prepare; missing parent directories are created as well.
    delete_dir : bool, default True
        Only matters when ``directory`` already exists. If truthy, the directory
        and everything inside it is removed and an empty one is created in its
        place. If falsy, the existing directory is left exactly as it is.

    Returns
    -------
    None
        The function only touches the filesystem and prints what it did.
    """
    already_there = os.path.isdir(directory)

    # Existing directory that the caller wants to keep: nothing to do on disk.
    if already_there and not delete_dir:
        print(f"Directory {directory} already exists. I will either overwrite or add files to it.")
        return

    if already_there:
        print(f"Directory {directory} already exists. Deleting and recreating.")
        shutil.rmtree(directory)
    else:
        print(f"Creating directory {directory}")

    # Both remaining cases end with a freshly created, empty directory.
    os.makedirs(directory, exist_ok=True)

In [6]:
#train_splits_dir = os.path.join(os.path.join(pca_32_95comp_dir,"train_splits"))
#print("train_splits_dir: ", train_splits_dir)

In [7]:
#delete_dir = True
#generate_dir(train_splits_dir,delete_dir)

## Generating lab_id split data sets 

In [8]:
# Scratch check: pairing each position with its item yields an index -> value mapping.
l = ["a", "b", "c"]
dict(enumerate(l))

{0: 'a', 1: 'b', 2: 'c'}

In [15]:
def generate_lab_id_datasets(df,n_lab_ids=9,batch_size=1000,impurity_ratio=3,savedir=None,delete_dir=True,
                             verbose=False):
    """Write one CSV per random group of lab_ids, plus a JSON index of the groups.

    The unique values of ``df.lab_id`` are shuffled with the global NumPy RNG and
    cut into groups ("batches") of ``n_lab_ids`` lab ids each. For every batch a
    file ``<savedir>/data/batch_<k>.csv`` is written containing the rows of the
    labs in the batch (optionally subsampled per lab), followed by an optional
    random sample of rows belonging to labs outside the batch. Finally
    ``<savedir>/json/batch_names.json`` maps each CSV file name to the
    ``"_"``-joined lab ids of that batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows; must have a ``lab_id`` column of strings (the ids are
        joined into text for the progress bar and the JSON index).
    n_lab_ids : int, default 9
        Number of lab ids per batch. Must divide the number of unique lab ids.
    batch_size : int or None, default 1000
        Rows sampled (without replacement) from each lab of the batch. ``None``
        keeps every row of the in-batch labs.
    impurity_ratio : int or None, default 3
        If not ``None``, ``impurity_ratio * batch_size`` out-of-batch rows are
        sampled with replacement and appended to the batch. ``None`` appends
        nothing.
    savedir : str or path-like
        Output root; ``data`` and ``json`` sub-directories are prepared in it
        with ``generate_dir``.
    delete_dir : bool, default True
        Forwarded to ``generate_dir`` for both sub-directories.
    verbose : bool, default False
        Show a tqdm progress bar over the batches.

    Raises
    ------
    ValueError
        If ``savedir`` is ``None``, or if ``impurity_ratio`` is given while
        ``batch_size`` is ``None`` (the impurity sample size is defined as a
        multiple of ``batch_size``). pandas also raises ``ValueError`` when a
        lab has fewer than ``batch_size`` rows.
    AssertionError
        If ``n_lab_ids`` does not divide the number of unique lab ids.
    """
    ### validating arguments before touching the filesystem
    if savedir is None:
        raise ValueError("savedir must be a directory path, got None")
    if impurity_ratio is not None and batch_size is None:
        raise ValueError("impurity_ratio requires batch_size: the impurity sample size is impurity_ratio*batch_size")

    ### generating directories
    data_dir = os.path.join(savedir,"data")
    generate_dir(data_dir,delete_dir)
    json_dir = os.path.join(savedir,"json")
    generate_dir(json_dir,delete_dir)

    unique_lab_ids = np.unique(df.lab_id)
    n_unique = len(unique_lab_ids)
    assert n_unique%n_lab_ids ==0, f"n_lab_ids must be a divisor of the number of unique lab_ids, {n_unique}"
    ### each row of shuffled_lab_ids is one batch of n_lab_ids lab ids
    shuffled_lab_ids = np.random.permutation(unique_lab_ids).reshape(-1,n_lab_ids)
    n_batches = len(shuffled_lab_ids)
    ### digits needed to print the largest batch index, so all file names share one width
    n_zeros = len(str(max(n_batches-1,0)))
    pbar = enumerate(shuffled_lab_ids)
    batch_names_dict = {}
    if verbose:
        ### enumerate has no length, so tell tqdm how many batches to expect
        pbar = tqdm(pbar,total=n_batches)
    for ix, batch in pbar:
        if verbose:
            pbar.set_description("Processing batch ["+",".join(batch)+"]\r")
        mask = df.lab_id.isin(batch)
        df_batch = df.loc[mask,:]
        if batch_size is not None:
            ### sample each lab separately; iterating the groups (instead of groupby.apply)
            ### keeps the lab_id column and the original row index on every pandas version
            df_batch = pd.concat([lab_rows.sample(n=batch_size) for _, lab_rows in df_batch.groupby('lab_id')])
        ### extracting random sample from out-of-batch classes
        if impurity_ratio is not None:
            df_impurity = df.loc[~mask,:].sample(impurity_ratio*batch_size,replace=True)
            df_batch = pd.concat([df_batch,df_impurity])
        ### saving df_batch
        batch_name = f"batch_{str(ix).zfill(n_zeros)}.csv"
        batch_path = os.path.join(data_dir,batch_name)
        if verbose:
            pbar.set_description(f"Writing file {batch_path} to disk")
        df_batch.to_csv(batch_path,index=True)
        ### building json batch dict
        batch_names_dict[batch_name] = "_".join(batch)
    print("Dumping json with batch names...")
    batch_names_dict_path = os.path.join(json_dir,"batch_names.json")
    print(f"Dumping json with batch names in {batch_names_dict_path}")
    with open(batch_names_dict_path, 'w') as fp:
        json.dump(batch_names_dict, fp)
    print("Done.")
    
def generate_splits(df,n_splits,n_lab_ids=9,batch_size=1000,impurity_ratio=3,savedir=None,delete_dir=True,verbose=False):
    """Generate ``n_splits`` independent lab_id partitions under ``savedir``.

    For each split index ``s`` a directory ``<savedir>/split_<s>`` is filled by
    one call to ``generate_lab_id_datasets``, which draws a fresh random
    grouping of the lab ids each time.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows, forwarded unchanged to ``generate_lab_id_datasets``.
    n_splits : int
        Number of split directories to generate; ``0`` generates nothing.
    n_lab_ids, batch_size, impurity_ratio, delete_dir, verbose
        Forwarded unchanged to ``generate_lab_id_datasets``.
    savedir : str or path-like
        Root directory receiving the ``split_<s>`` sub-directories.

    Raises
    ------
    ValueError
        If ``savedir`` is ``None``.
    """
    if savedir is None:
        raise ValueError("savedir must be a directory path, got None")
    ### digits needed to print the largest split index, so all directory names share one width
    zeros = len(str(max(n_splits-1,0)))
    pbar = tqdm(range(n_splits))
    for s in pbar:
        split_dir = os.path.join(savedir,"split_"+str(s).zfill(zeros))
        ### the data/ and json/ sub-directories of split_dir are created by the callee
        generate_lab_id_datasets(df,n_lab_ids,batch_size,impurity_ratio,split_dir,delete_dir,verbose)
        
            
    

#### Generating lab_id splits

In [16]:
# Root folder that will receive every generated split of the full training set.
full_train_splits_dir = os.path.join(pca_16_48comp_dir, "full_train_splits")
full_train_splits_dir

'/home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits'

In [17]:
%%time
# Seed the global NumPy RNG before generating the splits.
np.random.seed(8773)

# Split configuration: 10 splits, 146 lab ids per batch, 1000 rows sampled per lab,
# plus impurity_ratio * batch_size out-of-batch rows appended to every batch.
n_splits = 10
n_lab_ids = 146
batch_size = 1000
impurity_ratio = 8
savedir = full_train_splits_dir
delete_dir = True
verbose = False

generate_splits(
    df_full_train,
    n_splits,
    n_lab_ids=n_lab_ids,
    batch_size=batch_size,
    impurity_ratio=impurity_ratio,
    savedir=savedir,
    delete_dir=delete_dir,
    verbose=verbose,
)

  0%|          | 0/10 [00:00<?, ?it/s]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_0/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_0/json


 10%|█         | 1/10 [01:39<14:59, 99.93s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_0/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_1/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_1/json


 20%|██        | 2/10 [03:18<13:17, 99.64s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_1/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_2/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_2/json


 30%|███       | 3/10 [04:56<11:32, 98.94s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_2/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_3/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_3/json


 40%|████      | 4/10 [06:34<09:51, 98.65s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_3/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_4/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_4/json


 50%|█████     | 5/10 [08:15<08:17, 99.47s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_4/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_5/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_5/json


 60%|██████    | 6/10 [09:56<06:39, 99.98s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_5/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_6/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_6/json


 70%|███████   | 7/10 [11:37<05:00, 100.33s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_6/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_7/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_7/json


 80%|████████  | 8/10 [13:17<03:20, 100.23s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_7/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_8/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_8/json


 90%|█████████ | 9/10 [14:55<01:39, 99.40s/it] 

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_8/json/batch_names.json
Done.
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_9/data
Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_9/json


100%|██████████| 10/10 [16:32<00:00, 99.26s/it]

Dumping json with batch names...
Dumping json with batch names in /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_16_48comp/full_train_splits/split_9/json/batch_names.json
Done.
CPU times: user 15min 59s, sys: 28.3 s, total: 16min 28s
Wall time: 16min 32s





## Now try a simple random forest!