# Fisrt shot with xgboost (using pca_32_95comp data)

In this notebook we run our first `xgboost` model on the data located at `.../pca_engineered_datasets/pca32_95comps` . 

In [1]:
%%time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import shutil
import csv
import requests
from tqdm import tqdm
from collections import OrderedDict, defaultdict, Counter
import seaborn as sns
import json
#from sklearn.decomposition import PCA
#from joblib import dump, load
import joblib
import xgboost as xgb
from sklearn.metrics import accuracy_score

CPU times: user 995 ms, sys: 209 ms, total: 1.2 s
Wall time: 3.45 s


## Loading data

In [2]:
#parent_dir = "/content/genetic_engineering_attribution"
parent_dir = "/home/rio/data_sets/genetic_engineering_attribution"

#### Data

In [3]:
%%time
### pca directory
pca_dir = os.path.join(parent_dir,"pca")

### pca engineered data sets
pca_engineered_datasets_dir = os.path.join(parent_dir,"pca_engineered_datasets")

### pca_32_95comp dir
pca_32_95comp_dir = os.path.join(pca_engineered_datasets_dir,"pca_32_95comp")

### paths to csvs
train_path = os.path.join(pca_32_95comp_dir,"train.csv")
val_path = os.path.join(pca_32_95comp_dir,"val.csv")
test_path = os.path.join(pca_32_95comp_dir,"test.csv")

### loading dataframes
df_train = pd.read_csv(train_path,index_col=0)
df_val = pd.read_csv(val_path,index_col=0)
df_test = pd.read_csv(test_path,index_col=0)

### Printing shapes:
print(f"Shape of df_train: {df_train.shape}")
print(f"Shape of df_val: {df_val.shape}")
print(f"Shape of df_test: {df_test.shape}")

  mask |= (ar1 == a)


Shape of df_train: (1314000, 138)
Shape of df_val: (1175000, 138)
Shape of df_test: (1881600, 138)
CPU times: user 1min 6s, sys: 4.02 s, total: 1min 10s
Wall time: 1min 23s


In [4]:
df_train.head()

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
2003,Q5MU0,00Q4V31T,ACCGCCTTTGAGTGAGCTGATACCGCTCGCCG,32,-0.754907,0.013641,-0.449087,0.341165,0.200607,1.050232,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2700,HQPH7,00Q4V31T,TTCGTGTCGACACGGCAGACCACGCGTTTATC,32,-0.374325,-0.744543,0.548367,0.285129,0.575248,-0.5502,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2699,OB8FG,00Q4V31T,AGCTGCGGTAAAGCTCATCAGCGTGGTCGTGC,32,-0.576713,0.420651,-0.953987,0.02237,-0.895677,0.148758,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2700,HQPH7,00Q4V31T,TATAACGTTACTGGTTTCACATTCACCACCCT,32,0.454543,-0.389939,0.478635,0.764074,0.59399,0.791779,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2699,OB8FG,00Q4V31T,CGTAGTTATCTACACGACGGGGAGTCAGGCAA,32,-0.136239,0.615637,-0.252784,-0.41725,-0.672729,0.120384,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


## Selecting features and targets

#### features

In [5]:
pca_0_ix = df_train.columns.get_loc("pca_0")
features = list(df_train.columns[pca_0_ix:])
print("Features to be used: ")
print(features)
print("Number of features: ", len(features))

Features to be used: 
['pca_0', 'pca_1', 'pca_2', 'pca_3', 'pca_4', 'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11', 'pca_12', 'pca_13', 'pca_14', 'pca_15', 'pca_16', 'pca_17', 'pca_18', 'pca_19', 'pca_20', 'pca_21', 'pca_22', 'pca_23', 'pca_24', 'pca_25', 'pca_26', 'pca_27', 'pca_28', 'pca_29', 'pca_30', 'pca_31', 'pca_32', 'pca_33', 'pca_34', 'pca_35', 'pca_36', 'pca_37', 'pca_38', 'pca_39', 'pca_40', 'pca_41', 'pca_42', 'pca_43', 'pca_44', 'pca_45', 'pca_46', 'pca_47', 'pca_48', 'pca_49', 'pca_50', 'pca_51', 'pca_52', 'pca_53', 'pca_54', 'pca_55', 'pca_56', 'pca_57', 'pca_58', 'pca_59', 'pca_60', 'pca_61', 'pca_62', 'pca_63', 'pca_64', 'pca_65', 'pca_66', 'pca_67', 'pca_68', 'pca_69', 'pca_70', 'pca_71', 'pca_72', 'pca_73', 'pca_74', 'pca_75', 'pca_76', 'pca_77', 'pca_78', 'pca_79', 'pca_80', 'pca_81', 'pca_82', 'pca_83', 'pca_84', 'pca_85', 'pca_86', 'pca_87', 'pca_88', 'pca_89', 'pca_90', 'pca_91', 'pca_92', 'pca_93', 'pca_94', 'bacterial_resistance_ampicillin', 'b

#### targets

Below we map our target values to integers, to make things simpler.

In [6]:
target = "lab_id"
target_values = np.sort(df_train.loc[:,target].unique()).tolist()
print("Target values (these are the lab_ids): ")
print(target_values)
print("\n")
print("Number of different target values (lab_ids): ", len(target_values))

Target values (these are the lab_ids): 
['00Q4V31T', '012VT4JK', '028IO5W2', '03GRNN7N', '03Y3W51H', '09MQV1TY', '0A4AHRCT', '0A9M05NC', '0B9GCUVV', '0CL7QVG8', '0CML4B5I', '0DTHTJLJ', '0FFBBVE1', '0HWCWFNU', '0L3Y6ZB2', '0M44GDO8', '0MDYJM3H', '0N3V9P9M', '0NP55E93', '0PJ91ZT6', '0R296F9R', '0T2AZBD6', '0URA80CN', '0VRP2DI6', '0W6O08VX', '0WHP4PPK', '0XPTGGLP', '0XS4FHP3', '0Y24J5G2', '10TEBWK2', '11TTDKTM', '131RRHBV', '13LZE1F7', '14PBN8C2', '15D0Z97U', '15S88O4Q', '18C9J8EH', '19CAUKJB', '1AP294AT', '1B9BJ2IP', '1BE35FI1', '1CIHYCE4', '1DJ9L58E', '1DTDCRUO', '1EDZ6CA7', '1HCQTAYT', '1HK4VXP8', '1IXFZ3HO', '1K11RCST', '1KC6XYO6', '1KNFJ6KQ', '1KZHNVYR', '1LBGAU5Z', '1NXRMDN6', '1OQJ21E9', '1OWZDF82', '1PA232PA', '1PIGWQFY', '1Q1IUY3G', '1S515B69', '1TC200QC', '1TI4HS4X', '1UOA7CA1', '1UREJUSJ', '1UU0CHTK', '1VPOX8VI', '1VQS4WNS', '1X0VC0O1', '1XU60MET', '1ZC8RPN1', '20ABQYHS', '20CEB9KE', '216DWMG6', '21ZFBX5E', '24SL2992', '25UVYUID', '26KK8UM5', '27OS3BTP', '28D4D4QM', '298AMR5C',

In [7]:
target_dict = {t: ix for ix, t in enumerate(target_values)}
print("target_dict: ")
print(target_dict)

target_dict: 
{'00Q4V31T': 0, '012VT4JK': 1, '028IO5W2': 2, '03GRNN7N': 3, '03Y3W51H': 4, '09MQV1TY': 5, '0A4AHRCT': 6, '0A9M05NC': 7, '0B9GCUVV': 8, '0CL7QVG8': 9, '0CML4B5I': 10, '0DTHTJLJ': 11, '0FFBBVE1': 12, '0HWCWFNU': 13, '0L3Y6ZB2': 14, '0M44GDO8': 15, '0MDYJM3H': 16, '0N3V9P9M': 17, '0NP55E93': 18, '0PJ91ZT6': 19, '0R296F9R': 20, '0T2AZBD6': 21, '0URA80CN': 22, '0VRP2DI6': 23, '0W6O08VX': 24, '0WHP4PPK': 25, '0XPTGGLP': 26, '0XS4FHP3': 27, '0Y24J5G2': 28, '10TEBWK2': 29, '11TTDKTM': 30, '131RRHBV': 31, '13LZE1F7': 32, '14PBN8C2': 33, '15D0Z97U': 34, '15S88O4Q': 35, '18C9J8EH': 36, '19CAUKJB': 37, '1AP294AT': 38, '1B9BJ2IP': 39, '1BE35FI1': 40, '1CIHYCE4': 41, '1DJ9L58E': 42, '1DTDCRUO': 43, '1EDZ6CA7': 44, '1HCQTAYT': 45, '1HK4VXP8': 46, '1IXFZ3HO': 47, '1K11RCST': 48, '1KC6XYO6': 49, '1KNFJ6KQ': 50, '1KZHNVYR': 51, '1LBGAU5Z': 52, '1NXRMDN6': 53, '1OQJ21E9': 54, '1OWZDF82': 55, '1PA232PA': 56, '1PIGWQFY': 57, '1Q1IUY3G': 58, '1S515B69': 59, '1TC200QC': 60, '1TI4HS4X': 61, '1U

In [8]:
reverse_target_dict = {ix: t for ix, t in enumerate(target_values)}
print("reverse_target_dict: ")
print(reverse_target_dict)

reverse_target_dict: 
{0: '00Q4V31T', 1: '012VT4JK', 2: '028IO5W2', 3: '03GRNN7N', 4: '03Y3W51H', 5: '09MQV1TY', 6: '0A4AHRCT', 7: '0A9M05NC', 8: '0B9GCUVV', 9: '0CL7QVG8', 10: '0CML4B5I', 11: '0DTHTJLJ', 12: '0FFBBVE1', 13: '0HWCWFNU', 14: '0L3Y6ZB2', 15: '0M44GDO8', 16: '0MDYJM3H', 17: '0N3V9P9M', 18: '0NP55E93', 19: '0PJ91ZT6', 20: '0R296F9R', 21: '0T2AZBD6', 22: '0URA80CN', 23: '0VRP2DI6', 24: '0W6O08VX', 25: '0WHP4PPK', 26: '0XPTGGLP', 27: '0XS4FHP3', 28: '0Y24J5G2', 29: '10TEBWK2', 30: '11TTDKTM', 31: '131RRHBV', 32: '13LZE1F7', 33: '14PBN8C2', 34: '15D0Z97U', 35: '15S88O4Q', 36: '18C9J8EH', 37: '19CAUKJB', 38: '1AP294AT', 39: '1B9BJ2IP', 40: '1BE35FI1', 41: '1CIHYCE4', 42: '1DJ9L58E', 43: '1DTDCRUO', 44: '1EDZ6CA7', 45: '1HCQTAYT', 46: '1HK4VXP8', 47: '1IXFZ3HO', 48: '1K11RCST', 49: '1KC6XYO6', 50: '1KNFJ6KQ', 51: '1KZHNVYR', 52: '1LBGAU5Z', 53: '1NXRMDN6', 54: '1OQJ21E9', 55: '1OWZDF82', 56: '1PA232PA', 57: '1PIGWQFY', 58: '1Q1IUY3G', 59: '1S515B69', 60: '1TC200QC', 61: '1TI4HS

#### Generating x_train, x_val, x_test, y_train, y_val, y_test

In [9]:
%%time
### x_train, x_val, x_test
x_train = df_train.loc[:,features]
x_val = df_val.loc[:,features]
x_test = df_test.loc[:,features]

### y_train, y_val
y_train = np.array([target_dict[t] for t in df_train.loc[:,target]])
y_val = np.array([target_dict[t] for t in df_val.loc[:,target]])

### printing shapes
print("Shape of x_train: ", x_train.shape)
print("Shape of y_train: ", y_train.shape)
print("\n")
print("Shape of x_val: ", x_val.shape)
print("Shape of y_val: ", y_val.shape)
print("\n")
print("Shape of x_test: ", x_test.shape)


Shape of x_train:  (1314000, 134)
Shape of y_train:  (1314000,)


Shape of x_val:  (1175000, 134)
Shape of y_val:  (1175000,)


Shape of x_test:  (1881600, 134)
CPU times: user 768 ms, sys: 276 ms, total: 1.04 s
Wall time: 1.04 s


In [10]:
len(np.unique(y_train))

1314

## Fitting xgboost model

In [14]:
y_ix

array([ True,  True,  True, ..., False, False, False])

In [43]:
y_ix_t = (y_train == 10) | (y_train == 15) | (y_train == 32)
x = x_train[y_ix_t]
y = y_train[y_ix_t]

In [44]:
y_ix = (y_val == 10) | (y_val == 15) | (y_val == 32)
x_v = x_val[y_ix]
y_v = y_val[y_ix]

In [45]:
y_v[y_v==10] = 10
y_v[y_v==15] = 15
y_v[y_v==32] = 32

In [46]:
%%time
max_depth = 15
subsample = 0.2
objective = 'binary:logistic'
n_estimators = 100
learning_rate = 0.01
n_jobs = 6
verbosity = 1
model =  xgb.XGBClassifier(max_depth=max_depth,
                        subsample=subsample,
                        objective=objective,
                        n_estimators=n_estimators,
                        learning_rate = learning_rate,
                        n_jobs = n_jobs,
                        verbosity=verbosity)

CPU times: user 55 µs, sys: 4 µs, total: 59 µs
Wall time: 93.9 µs


#### Fitting on train, evaluating on val

In [47]:
%%time
eval_set = [(x_v, y_v)]
val_metrics = ["error", "logloss"]

verbose = True
early_stopping_rounds=5
model.fit(x,y,eval_set=eval_set, verbose=verbose, early_stopping_rounds=early_stopping_rounds)

[0]	validation_0-merror:0.04100
Will train until validation_0-merror hasn't improved in 5 rounds.
[1]	validation_0-merror:0.00267
[2]	validation_0-merror:0.00267
[3]	validation_0-merror:0.00267
[4]	validation_0-merror:0.00267
[5]	validation_0-merror:0.00267
[6]	validation_0-merror:0.00267
Stopping. Best iteration:
[1]	validation_0-merror:0.00267

CPU times: user 531 ms, sys: 11.9 ms, total: 543 ms
Wall time: 98.5 ms


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.01, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=6, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.2,
              tree_method='exact', validate_parameters=1, verbosity=1)

In [27]:
df_val.iloc[y_ix,:].sample(20)

Unnamed: 0,sequence_id,lab_id,sequence,seq_length,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,...,species_budding_yeast,species_fly,species_human,species_mouse,species_mustard_weed,species_nematode,species_other,species_rat,species_synthetic,species_zebrafish
38904,WWX1H,3XE0BJDW,GGGCTTACCATCTGGCCCCCCAAGGCGGTAAT,32,-0.670194,-0.124826,0.964169,0.118969,0.517596,0.148056,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
29073,VPUJL,3XE0BJDW,CCGCCTCCGTCTGAATTTTTGCTTTCGGTTTG,32,-0.133009,0.046507,-0.437236,1.587674,0.341267,-0.133127,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2002,RQCWJ,00Q4V31T,GTAAAAAGGCCGCGTTGCTGGCGTTTTTCCAT,32,-0.053634,-0.69157,0.057415,0.423896,-0.809367,-0.387825,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29073,VPUJL,3XE0BJDW,GTAAACTTGGTCTGACAGTTACCAATGCTTAA,32,0.707724,0.325097,0.520116,0.194114,0.089968,-0.367062,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2006,E96O7,00Q4V31T,ATTTTAACAAAATATTAACGTTTACAATTTCG,32,1.670937,-0.089839,-0.915875,0.340197,-0.011248,-0.777935,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29073,VPUJL,3XE0BJDW,AAGGCCAGCAAAAGGCCAGGAACCGTAAAAAG,32,0.133783,-0.668931,0.142749,-1.596903,0.406815,0.258442,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
32700,DYNYF,3XE0BJDW,CGCCCGGGTACCCGTATTCCCAATAAAGCCTC,32,-0.592256,-0.195067,-0.121973,0.250136,0.553952,-0.062066,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
32700,DYNYF,3XE0BJDW,AGCTGGGGAGAATTGTGAAATTGTTATCCGCT,32,0.374222,-0.397397,-0.378129,0.025693,-0.963564,-0.149413,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2006,E96O7,00Q4V31T,GTTGGAGTCCACGTTCTTTAATAGTGGACTCT,32,0.308361,0.783453,-0.114933,0.72476,-0.379844,0.213713,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2006,E96O7,00Q4V31T,CGCGGCCTTTTTACGGTTCCTGGGCTTTTGCT,32,-0.445155,0.540189,0.066569,1.44073,-0.404966,0.028601,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
y_train == 1

In [None]:
%%time
xgb_params = {"objective": "multi:softprob", "max_depth": 8, "silent": False, "num_class":len(target_values)}
num_rounds = 1000
verbose_eval = True 
dtrain = xgb.DMatrix(x_train, y_train)    #training data
dvalid = xgb.DMatrix(x_val, y_val)    #validation data
thisxgb = xgb.train(xgb_params, dtrain, num_rounds, \
                    [(dtrain,'train'),(dvalid,'val')], \
                    early_stopping_rounds=10,\
                    verbose_eval=verbose_eval
                   )    #stop if no improvement in 10 rounds


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




#### Predicting on training set

In [None]:
#y_train_pred = model.predict(x_train)