# Generating training sets for multiclassifier approach

In this notebook we split the `lab_ids` in groups ang generate training/val test sets for each of these groups. We have already seen that there are `1314`unique lab ids. Our strategy here will be to separate `9` subsets of lab ids, each containing `146`lab_ids

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict, defaultdict, Counter
import seaborn as sns
import json
#from sklearn.decomposition import PCA
#from joblib import dump, load
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score
import shutil

CPU times: user 500 ms, sys: 52.3 ms, total: 552 ms
Wall time: 551 ms


## Loading data

In [2]:
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

#### Data

In [3]:
%%time
### pca directory
pca_dir = os.path.join(parent_dir,"pca")

### pca engineered data sets
pca_engineered_datasets_dir = os.path.join(parent_dir,"pca_engineered_datasets")

### pca_32_95comp dir
pca_16_48comp_dir = os.path.join(pca_engineered_datasets_dir,"pca_16_48comp")

### paths to csvs
full_train_path = os.path.join(pca_16_48comp_dir,"full_train.csv")
#train_path = os.path.join(pca_32_95comp_dir,"train.csv")
#val_path = os.path.join(pca_32_95comp_dir,"val.csv")
test_path = os.path.join(pca_16_48comp_dir,"test.csv")

### loading dataframes
df_full_train = pd.read_csv(full_train_path,index_col=0)
#df_train = pd.read_csv(train_path,index_col=0)
#df_val = pd.read_csv(val_path,index_col=0)
df_test = pd.read_csv(test_path,index_col=0)

### Printing shapes:
print(f"Shape of df_train: {df_train.shape}")
print(f"Shape of df_train: {df_train.shape}")
print(f"Shape of df_val: {df_val.shape}")
print(f"Shape of df_test: {df_test.shape}")

  mask |= (ar1 == a)


Shape of df_train: (1314000, 138)
Shape of df_val: (1175000, 138)
Shape of df_test: (1881600, 138)
CPU times: user 1min 8s, sys: 5.51 s, total: 1min 13s
Wall time: 1min 28s


In [4]:
df_train.sample(8)

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
21170,1V42L,5H71LUBY,TGTCGGGTTTCGCCACCTCTGACTTGAGCGTC,32,-0.655459,-0.575996,-0.701846,0.865317,0.394443,-0.357696,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8650,E4O93,IJEA3NUI,CGCTCCGGCGACGTCGCGCGCGGTGAGCACCG,32,-1.731101,0.675946,0.288456,-0.085549,-0.74327,0.606746,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5336,2O2BC,L78GOBQS,GGGCTGAACTTGCGCTCCATGGTGCTGTTGCT,32,-0.457642,0.275613,1.727689,0.708453,-0.365478,-0.432843,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24960,JQTDQ,9MEFUZQN,TCACGCGTGGTACCTCTAGAGTCGAGCGGGAT,32,-0.491189,-0.016736,0.112171,-0.025907,0.077988,0.085255,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
967,AULMU,36XLYYGZ,GTTTGTTTGCCGGATCAAGAGCTACCAACTCT,32,0.090328,0.859326,-0.545769,0.480141,0.397847,-0.459812,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11102,WKEU5,Y81SHRRC,GCCGTTTCTTGTTCTTCCTCTGCTGTTTGCTC,32,-0.246316,-0.849993,-0.928291,2.089029,0.191597,1.500329,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
48167,Z1H23,ETR2SP13,CCACTGGCAGCAGCCACTGGTAACAGGATTAG,32,-0.343082,-0.378656,-0.320363,-0.402903,0.756916,0.654228,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
42149,6XGH6,T5R7YFPH,GGGATGGGAAGCTTCAAGTGAAAGAATGTACC,32,0.315009,-0.28404,0.628746,-0.845094,-0.46587,0.094238,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Generating train_splits dir

In [5]:
def generate_dir(directory,delete_dir=True):
    if not os.path.isdir(directory):
        print(f"Creating directory {directory}")
        os.makedirs(directory, exist_ok=True)
    elif delete_dir:
        print(f"Directory {directory} already exists. Deleting an recreating.")
        shutil.rmtree(directory)
        os.makedirs(directory, exist_ok=True)
    else:
        print(f"Directory {directory} already exists. I will either overwrite or add files to it.")

In [6]:
#train_splits_dir = os.path.join(os.path.join(pca_32_95comp_dir,"train_splits"))
#print("train_splits_dir: ", train_splits_dir)

In [7]:
#delete_dir = True
#generate_dir(train_splits_dir,delete_dir)

## Generating lab_id split data sets 

In [8]:
def generate_lab_id_datasets(df,n_lab_ids=9,batch_size=1000,impurity_ratio=3,savedir=None,delete_dir=True,verbose=False):
    generate_dir(savedir,delete_dir)
    #if not os.path.isdir(savedir):
    #    print(f"Creating directory {savedir}")
    #    os.mkdir(savedir)
    #elif delete_dir:
    #    print(f"Directory {savedir} already exists. Deleting an recreating.")
    #    shutil.rmtree(savedir)
    #    os.mkdir(savedir)
    #else:
    #    print(f"Directory {savedir} already exists. I will either overwrite or add files to it.")
    n_unique = len(df_train.lab_id.unique()) 
    assert n_unique%n_lab_ids ==0, f"n_lab_ids must be a divisor of the number of unique lab_ids, {n_unique}"
    shuffled_lab_ids = np.random.permutation(np.unique(df.lab_id)).reshape(-1,n_lab_ids)
    if verbose:
        pbar = tqdm(shuffled_lab_ids)
    else:
        pbar = shuffled_lab_ids
    for batch in pbar:
        if verbose:
            pbar.set_description("Processing batch ["+",".join(batch)+"]\r")
        mask = df.lab_id.isin(batch)
        df_batch = df.loc[mask,:]
        if batch_size is not None:
            #df_batch = pd.DataFrame({'col':np.random.randn(12000), 'target':np.random.randint(low = 0, high = 2, size=12000)})
            df_batch = df_batch.groupby('lab_id').apply(lambda x: x.sample(n=batch_size))#.reset_index(drop = True)
            df_batch.index = df_batch.index.get_level_values(1) 
        ### extracting random sample from out-of-batch classes
        if impurity_ratio is not None:
            df_impurity = df.loc[~mask,:].sample(impurity_ratio*batch_size,replace=True)
        df_batch = pd.concat([df_batch,df_impurity])
        ### saving df_batch
        filename = "_".join(batch)+".csv"
        savepath = os.path.join(savedir,filename)
        if verbose:
            pbar.set_description(f"Writing file {savepath} to disk")
        df_batch.to_csv(savepath,index=True)
    
def generate_splits(df,n_splits,n_lab_ids=9,batch_size=1000,impurity_ratio=3,savedir=None,delete_dir=True,verbose=False):
    #generate_dir(savedir,delete_dir)
    #if not os.path.isdir(savedir):
    #    print(f"Creating directory {savedir}")
    #    os.mkdir(savedir)
    #elif delete_dir:
    #    print(f"Directory {savedir} already exists. Deleting an recreating.")
    #    shutil.rmtree(savedir)
    #    os.mkdir(savedir)
    #else:
    #    print(f"Directory {savedir} already exists. I will either overwrite or add files to it.")
    pbar = tqdm(range(n_splits))
    zeros = np.ceil(np.log10(n_splits)).astype(int)
    for s in pbar:
        split_dir = os.path.join(savedir,"split_"+str(s).zfill(zeros),"data")
        #generate_dir(split_dir,delete_dir)
        #split_dir = os.path.join(split_dir,"data") ## generating a directory for data. Generate another one for models
        #generate_dir(split_dir,delete_dir)
        generate_lab_id_datasets(df,n_lab_ids,batch_size,impurity_ratio,split_dir,delete_dir,verbose)
        
            
    

#### Generating lab_id splits

In [9]:
train_splits_dir = os.path.join(os.path.join(pca_32_95comp_dir,"train_splits"))
train_splits_dir

'/home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits'

In [10]:
%%time
np.random.seed(1873)
n_splits = 100
n_lab_ids=9
batch_size=1000
impurity_ratio=3
savedir = train_splits_dir
delete_dir = True
verbose=False
generate_splits(df_train,n_splits,n_lab_ids,batch_size,impurity_ratio,savedir,delete_dir,verbose)

  0%|          | 0/100 [00:00<?, ?it/s]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_00/data


  1%|          | 1/100 [04:07<6:48:17, 247.45s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_01/data


  2%|▏         | 2/100 [08:14<6:43:52, 247.27s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_02/data


  3%|▎         | 3/100 [12:24<6:41:22, 248.27s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_03/data


  4%|▍         | 4/100 [16:38<6:39:33, 249.72s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_04/data


  5%|▌         | 5/100 [20:43<6:33:33, 248.56s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_05/data


  6%|▌         | 6/100 [24:49<6:28:14, 247.81s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_06/data


  7%|▋         | 7/100 [28:56<6:23:18, 247.29s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_07/data


  8%|▊         | 8/100 [33:05<6:19:58, 247.81s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_08/data


  9%|▉         | 9/100 [37:24<6:21:08, 251.31s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_09/data


 10%|█         | 10/100 [41:40<6:18:58, 252.65s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_10/data


 11%|█         | 11/100 [45:50<6:13:38, 251.89s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_11/data


 12%|█▏        | 12/100 [49:57<6:07:32, 250.59s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_12/data


 13%|█▎        | 13/100 [54:04<6:01:43, 249.46s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_13/data


 14%|█▍        | 14/100 [58:11<5:56:13, 248.53s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_14/data


 15%|█▌        | 15/100 [1:02:17<5:51:06, 247.84s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_15/data


 16%|█▌        | 16/100 [1:06:23<5:46:15, 247.33s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_16/data


 17%|█▋        | 17/100 [1:10:29<5:41:42, 247.02s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_17/data


 18%|█▊        | 18/100 [1:14:35<5:37:13, 246.75s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_18/data


 19%|█▉        | 19/100 [1:18:41<5:32:48, 246.52s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_19/data


 20%|██        | 20/100 [1:22:47<5:28:29, 246.36s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_20/data


 21%|██        | 21/100 [1:26:53<5:24:11, 246.22s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_21/data


 22%|██▏       | 22/100 [1:30:59<5:19:57, 246.13s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_22/data


 23%|██▎       | 23/100 [1:35:07<5:16:23, 246.54s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_23/data


 24%|██▍       | 24/100 [1:39:16<5:13:30, 247.51s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_24/data


 25%|██▌       | 25/100 [1:43:25<5:09:45, 247.81s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_25/data


 26%|██▌       | 26/100 [1:47:32<5:05:19, 247.57s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_26/data


 27%|██▋       | 27/100 [1:51:39<5:01:00, 247.41s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_27/data


 28%|██▊       | 28/100 [1:55:45<4:56:26, 247.03s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_28/data


 29%|██▉       | 29/100 [1:59:51<4:51:57, 246.73s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_29/data


 30%|███       | 30/100 [2:03:57<4:47:27, 246.40s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_30/data


 31%|███       | 31/100 [2:08:04<4:43:31, 246.54s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_31/data


 32%|███▏      | 32/100 [2:12:11<4:39:42, 246.80s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_32/data


 33%|███▎      | 33/100 [2:16:17<4:35:19, 246.57s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_33/data


 34%|███▍      | 34/100 [2:20:24<4:31:28, 246.80s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_34/data


 35%|███▌      | 35/100 [2:24:31<4:27:23, 246.82s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_35/data


 36%|███▌      | 36/100 [2:28:38<4:23:08, 246.70s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_36/data


 37%|███▋      | 37/100 [2:32:44<4:18:53, 246.56s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_37/data


 38%|███▊      | 38/100 [2:36:51<4:14:47, 246.58s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_38/data


 39%|███▉      | 39/100 [2:40:57<4:10:28, 246.37s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_39/data


 40%|████      | 40/100 [2:45:02<4:06:08, 246.15s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_40/data


 41%|████      | 41/100 [2:49:08<4:02:02, 246.14s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_41/data


 42%|████▏     | 42/100 [2:53:15<3:58:12, 246.42s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_42/data


 43%|████▎     | 43/100 [2:57:22<3:54:16, 246.61s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_43/data


 44%|████▍     | 44/100 [3:01:29<3:50:11, 246.63s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_44/data


 45%|████▌     | 45/100 [3:05:37<3:46:18, 246.89s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_45/data


 46%|████▌     | 46/100 [3:09:43<3:42:10, 246.87s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_46/data


 47%|████▋     | 47/100 [3:13:49<3:37:47, 246.56s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_47/data


 48%|████▊     | 48/100 [3:17:55<3:33:29, 246.34s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_48/data


 49%|████▉     | 49/100 [3:22:00<3:29:07, 246.03s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_49/data


 50%|█████     | 50/100 [3:26:06<3:24:57, 245.95s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_50/data


 51%|█████     | 51/100 [3:30:12<3:20:53, 245.99s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_51/data


 52%|█████▏    | 52/100 [3:34:18<3:16:50, 246.05s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_52/data


 53%|█████▎    | 53/100 [3:38:24<3:12:42, 246.00s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_53/data


 54%|█████▍    | 54/100 [3:42:31<3:08:43, 246.16s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_54/data


 55%|█████▌    | 55/100 [3:46:37<3:04:33, 246.08s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_55/data


 56%|█████▌    | 56/100 [3:50:43<3:00:27, 246.08s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_56/data


 57%|█████▋    | 57/100 [3:54:49<2:56:22, 246.11s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_57/data


 58%|█████▊    | 58/100 [3:58:55<2:52:21, 246.23s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_58/data


 59%|█████▉    | 59/100 [4:03:02<2:48:16, 246.25s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_59/data


 60%|██████    | 60/100 [4:07:09<2:44:18, 246.46s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_60/data


 61%|██████    | 61/100 [4:11:15<2:40:12, 246.48s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_61/data


 62%|██████▏   | 62/100 [4:15:21<2:35:57, 246.24s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_62/data


 63%|██████▎   | 63/100 [4:19:27<2:31:46, 246.11s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_63/data


 64%|██████▍   | 64/100 [4:23:33<2:27:37, 246.04s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_64/data


 65%|██████▌   | 65/100 [4:27:39<2:23:31, 246.05s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_65/data


 66%|██████▌   | 66/100 [4:31:45<2:19:29, 246.17s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_66/data


 67%|██████▋   | 67/100 [4:35:51<2:15:25, 246.23s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_67/data


 68%|██████▊   | 68/100 [4:39:57<2:11:11, 246.00s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_68/data


 69%|██████▉   | 69/100 [4:44:03<2:07:07, 246.05s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_69/data


 70%|███████   | 70/100 [4:48:09<2:03:02, 246.08s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_70/data


 71%|███████   | 71/100 [4:52:16<1:59:01, 246.26s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_71/data


 72%|███████▏  | 72/100 [4:56:22<1:54:53, 246.20s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_72/data


 73%|███████▎  | 73/100 [5:00:28<1:50:46, 246.18s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_73/data


 74%|███████▍  | 74/100 [5:04:35<1:46:43, 246.28s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_74/data


 75%|███████▌  | 75/100 [5:08:40<1:42:33, 246.13s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_75/data


 76%|███████▌  | 76/100 [5:12:47<1:38:27, 246.15s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_76/data


 77%|███████▋  | 77/100 [5:16:53<1:34:21, 246.15s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_77/data


 78%|███████▊  | 78/100 [5:20:59<1:30:17, 246.24s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_78/data


 79%|███████▉  | 79/100 [5:25:05<1:26:11, 246.24s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_79/data


 80%|████████  | 80/100 [5:29:12<1:22:05, 246.27s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_80/data


 81%|████████  | 81/100 [5:33:18<1:18:00, 246.36s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_81/data


 82%|████████▏ | 82/100 [5:37:25<1:13:56, 246.46s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_82/data


 83%|████████▎ | 83/100 [5:41:32<1:09:50, 246.50s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_83/data


 84%|████████▍ | 84/100 [5:45:38<1:05:43, 246.47s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_84/data


 85%|████████▌ | 85/100 [5:49:45<1:01:38, 246.54s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_85/data


 86%|████████▌ | 86/100 [5:53:52<57:32, 246.61s/it]  

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_86/data


 87%|████████▋ | 87/100 [5:57:58<53:26, 246.68s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_87/data


 88%|████████▊ | 88/100 [6:02:04<49:18, 246.50s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_88/data


 89%|████████▉ | 89/100 [6:06:11<45:10, 246.41s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_89/data


 90%|█████████ | 90/100 [6:10:17<41:04, 246.48s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_90/data


 91%|█████████ | 91/100 [6:14:24<36:59, 246.58s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_91/data


 92%|█████████▏| 92/100 [6:18:31<32:53, 246.63s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_92/data


 93%|█████████▎| 93/100 [6:22:38<28:46, 246.65s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_93/data


 94%|█████████▍| 94/100 [6:26:44<24:39, 246.51s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_94/data


 95%|█████████▌| 95/100 [6:30:51<20:33, 246.64s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_95/data


 96%|█████████▌| 96/100 [6:34:58<16:27, 246.89s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_96/data


 97%|█████████▋| 97/100 [6:39:05<12:20, 246.90s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_97/data


 98%|█████████▊| 98/100 [6:43:12<08:13, 246.84s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_98/data


 99%|█████████▉| 99/100 [6:47:19<04:06, 246.81s/it]

Creating directory /home/rio/data_sets/genetic_engineering_attribution/pca_engineered_datasets/pca_32_95comp/train_splits/split_99/data


100%|██████████| 100/100 [6:51:25<00:00, 246.85s/it]

CPU times: user 6h 25min 6s, sys: 25min 53s, total: 6h 51min
Wall time: 6h 51min 25s





## Now try a simple random forest!