### Importing Modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle
from joblib import Parallel, delayed
from model_trainer import train_svms

# Cross-validation
from sklearn.model_selection import StratifiedKFold

# Classifiers
from sklearn.svm import SVC

# Silence all warnings (errors are still raised)
import warnings
warnings.simplefilter('ignore')

In [2]:
def predict_species(idx, x, y_pred, y, model):
    """Second-stage prediction for one sample of the current fold.

    Reads the module-level arrays ``X_train`` / ``X_test``, which must already
    be defined when the function is called.

    Parameters
    ----------
    idx : row index into ``X_train`` or ``X_test``.
    x : ``'train'`` or ``'test'``; picks which of the two arrays is indexed.
    y_pred : label produced by the first-stage classifier.
    y : reference label; also the key used to look up the model.
    model : mapping from label to an object exposing ``predict``.

    Returns
    -------
    ``'xyz'`` when ``y_pred`` differs from ``y``; otherwise the first element
    of ``model[y].predict(row)`` converted to ``str``. Any other value of
    ``x`` yields ``None``.
    """
    # First stage got the label wrong: emit the sentinel, skip the second stage.
    if y_pred != y:
        return 'xyz'

    if x == 'train':
        features = X_train
    elif x == 'test':
        features = X_test
    else:
        return None

    # predict() is handed a single-row 2-D array.
    row = features[idx, :].reshape(1, -1)
    return str(model[y].predict(row)[0])


# Element-wise wrapper so the function can be called with whole arrays.
predict_species = np.vectorize(predict_species)

### Building the Combined Data Set Across All Classes

In [3]:
# Load one CSV per taxonomic class, bring each to the column layout
# [class, <second label column>, <remaining columns...>] and stack them.
kingdom_names = ['Aves', 'Chiroptera', 'Rodentia', 'Polypodiopsida', 'Pucciniomycetes']
before_combine_df = []
# The loop variable is deliberately not called `data`: that name is bound to
# the combined DataFrame at the end of this cell.
for class_name in kingdom_names:
    if class_name in ('Aves', 'Chiroptera', 'Rodentia'):
        # These files carry no class column, so one is prepended here.
        csv_file = pd.read_csv("{}.Cleaned.k{}.csv".format(class_name, 6))
        # `columns=` instead of a positional axis: DataFrame.drop no longer
        # accepts `axis` positionally in pandas >= 2.0.
        csv_file = csv_file.drop(columns=['Unnamed: 0', 'nucleotides'])
        taxonomies = pd.DataFrame({"class": [class_name] * csv_file.shape[0]})
        taxonomy_class_added = pd.concat([taxonomies, csv_file], axis=1)
        before_combine_df.append(taxonomy_class_added)
    else:
        csv_file = pd.read_csv("{1}kmerized.{0}.csv".format(class_name, 6))
        # Move 'class' and 'genus name' (renamed to 'genus_name') to the front.
        # NOTE(review): `.iloc[:, :-2]` assumes those two are the last two
        # columns of the file — confirm against the CSV layout.
        csv_file = pd.concat(
            [
                csv_file[['class']],
                csv_file[['genus name']].rename(columns={'genus name': 'genus_name'}),
                csv_file.iloc[:, :-2],
            ],
            axis=1,
        )
        before_combine_df.append(csv_file)

# Row-wise stack; the per-file indices are kept, so index labels repeat.
data = pd.concat(before_combine_df, axis=0)

### Finding hierarchical predictions

In [4]:
# Two-stage evaluation under 10-fold cross-validation: the 'All' model predicts
# the class of every sample, then the model of that class predicts the
# second-level label for the samples whose class was predicted correctly.
# Each fold appends (reference labels, predicted labels) to all_train / all_test.
all_train, all_test = [], []
std = StandardScaler()
X = np.array(data.iloc[:,2:]) # features: every column from the third onward
# NOTE(review): the scaler is fitted on the full data set before the folds are
# split, so test-fold statistics leak into the training features. Left as-is so
# stored results stay comparable; consider fitting on X_train inside each fold.
X = std.fit_transform(X)
species = np.ravel(data.iloc[:,1]) # second column: the second-level label
y = np.ravel(data.iloc[:,0]) # y is the taxonomic classes

# Folds are stratified on the second-level label, not on the class.
kf = StratifiedKFold(10, random_state=42, shuffle=True)
for counter, (train_index, test_index) in enumerate(kf.split(X, species), start=1):
    # predict_species reads X_train / X_test as globals, so these names must
    # stay bound at module level.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    species_train, species_test = species[train_index], species[test_index]

    # Per-class training sets: [features, second-level labels] of that class.
    kingdom_names = ['Aves', 'Chiroptera', 'Rodentia', 'Polypodiopsida', 'Pucciniomycetes']
    kingdoms = {
        kgdm: [X_train[y_train == kgdm, :], species_train[y_train == kgdm]]
        for kgdm in kingdom_names
    }

    # 'All' is the first-stage problem: every training row, labelled by class.
    kingdom_names.append('All')
    kingdoms['All'] = [X_train, y_train]

    # One training job per entry, results in the order of kingdom_names.
    # The original line had an unbalanced parenthesis and referred to
    # `taxonomies` / `taxonomy_names` instead of the `kingdoms` dict and
    # `kingdom_names` list built above.
    # NOTE(review): presumably train_svms(name, kingdoms) fits on kingdoms[name]
    # and returns the fitted model — confirm in model_trainer.
    with Parallel(n_jobs=6) as parallel:
        trained_models = parallel(delayed(train_svms)(name, kingdoms) for name in kingdom_names)

    # Classifiers, keyed by name; 'All' (the last job) is the class-level model.
    kingdom_models = dict(zip(kingdom_names, trained_models))
    svm_taxonomy = kingdom_models['All']

    y_pred_train = svm_taxonomy.predict(X_train)
    y_pred_test = svm_taxonomy.predict(X_test)

    # Second stage; rows whose class was mispredicted come back as 'xyz'.
    predictions_train = np.ravel(predict_species(np.arange(X_train.shape[0]), 'train', y_pred_train, y_train, kingdom_models))
    predictions_test = np.ravel(predict_species(np.arange(X_test.shape[0]), 'test', y_pred_test, y_test, kingdom_models))

    all_train.append((species_train, predictions_train))
    all_test.append((species_test, predictions_test))
    print('fold {} is done!'.format(counter))

fold 1 is done!
fold 2 is done!
fold 3 is done!
fold 4 is done!
fold 5 is done!
fold 6 is done!
fold 7 is done!
fold 8 is done!
fold 9 is done!
fold 10 is done!


### Storing the Predictions

In [5]:
# Persist the per-fold (reference, predicted) label pairs for train and test
# as one pickled two-element list.
predictions_path = 'hierarchical_svm_predicitions_k{}.db'.format(6)
with open(predictions_path, 'wb') as score:
    pickle.dump([all_train, all_test], score)