## Importing Modules

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle
from joblib import Parallel, delayed
from model_trainer import train_svms

# Cross-validation
from sklearn.model_selection import StratifiedKFold

# Classifiers
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# Suppressing warnings
import warnings
warnings.simplefilter('ignore')

In [2]:
def predict_species(idx, x, y_pred, y, model):
    """Predict the species of one sample with the classifier of its taxonomy.

    The function is wrapped with ``np.vectorize`` right after its definition,
    so callers may pass arrays (scalars are broadcast) and get an array back.

    Parameters
    ----------
    idx : int
        Row index of the sample inside the feature matrix selected by ``x``.
    x : str
        ``'train'`` reads the row from the notebook-level ``X_train``;
        ``'test'`` reads it from the notebook-level ``X_test``. Both names
        must already be defined when this function is called.
    y_pred
        Predicted taxonomy label of the sample.
    y
        Reference taxonomy label of the sample; also used as the key into
        ``model``.
    model : dict
        Maps a taxonomy label to a fitted classifier exposing ``predict``.

    Returns
    -------
    str
        ``'xyz'`` when ``y_pred`` differs from ``y``; otherwise the first
        element returned by ``model[y].predict`` converted to ``str``.

    Raises
    ------
    ValueError
        If ``x`` is neither ``'train'`` nor ``'test'``. (Previously this case
        silently returned ``None``.)
    """
    # Taxonomy was mispredicted: return the placeholder instead of a species.
    # NOTE(review): presumably 'xyz' never collides with a real species name.
    if y_pred != y:
        return 'xyz'

    if x == 'train':
        features = X_train
    elif x == 'test':
        features = X_test
    else:
        raise ValueError("x must be 'train' or 'test', got {!r}".format(x))

    # predict() expects a 2-D array, hence the reshape of the single row.
    y_pred_species = model[y].predict(features[idx, :].reshape(1, -1))
    return str(y_pred_species[0])


predict_species = np.vectorize(predict_species)

In [9]:
# def return_preditions(idx, x, y_pred, y, model, X_train, X_test):
#     return predict_species(idx, x, y_pred, y, model, X_train, X_test)

In [3]:
def train_rfs(name, taxonomies, n_estimators=10, random_state=42):
    """Fit a random forest on the data stored under ``taxonomies[name]``.

    Parameters
    ----------
    name
        Key selecting one entry of ``taxonomies``.
    taxonomies : dict
        Maps a name to a two-item sequence ``[features, labels]``.
    n_estimators : int, optional
        Number of trees in the forest (default 10, the previously
        hard-coded value).
    random_state : int or None, optional
        Seed forwarded to the forest so repeated runs give the same model.
        Pass ``None`` to get the previous unseeded behaviour.

    Returns
    -------
    RandomForestClassifier
        The fitted classifier.
    """
    features, labels = taxonomies[name][0], taxonomies[name][1]
    hierarchical_rf = RandomForestClassifier(
        n_estimators=n_estimators,
        n_jobs=-1,
        random_state=random_state,
    )
    hierarchical_rf.fit(features, labels)
    return hierarchical_rf

In [4]:
k_number = 10
data = pd.read_csv("7kmerized.mutated{}.csv".format(k_number))
# Use the explicit `columns=` keyword: passing the axis positionally
# (`drop([...], 1)`) was deprecated in pandas 1.3 and removed in pandas 2.0.
# Rows containing any missing value are discarded afterwards.
data = data.drop(columns=["mutated nucleotide"]).dropna(axis=0)

## Model training and prediction

In [5]:
# Per-fold (true species, predicted species) pairs for every model variant.
hierarchical_svm_train, hierarchical_svm_test = [], []
hierarchical_rf_train, hierarchical_rf_test = [], []
single_rf_train, single_rf_test = [], []

# Data Preprocessing
# X is kept unscaled here; scaling is fitted inside each fold (see below).
X = np.array(data.iloc[:, 2:])  # X is the barcode sequences
species = np.ravel(data.iloc[:, 1])
y = np.ravel(data.iloc[:, 0])  # y is the taxonomic classes

kf = StratifiedKFold(10, random_state=42, shuffle=True)

# Taxonomies that get their own species-level classifier. 'All' denotes the
# top-level classifier that predicts the taxonomy itself.
group_names = ['Aves', 'Chiroptera', 'Rodentia', 'Polypodiopsida', 'Pucciniomycetes']
taxonomy_names = group_names + ['All']

for counter, (train_index, test_index) in enumerate(kf.split(X, species), start=1):
    # Fit the scaler on the training fold only and reuse its statistics for
    # the held-out fold, so no information from the test samples leaks into
    # the preprocessing step.
    # X_train / X_test must stay notebook-level names: predict_species reads
    # them from the global namespace.
    std = StandardScaler()
    X_train = std.fit_transform(X[train_index])
    X_test = std.transform(X[test_index])
    y_train, y_test = y[train_index], y[test_index]
    species_train, species_test = species[train_index], species[test_index]

    # Training Taxonomy models and Species models for prediction
    taxonomies = {}
    for txnmy in group_names:
        in_group = y_train == txnmy
        taxonomies[txnmy] = [X_train[in_group, :], species_train[in_group]]
    taxonomies['All'] = [X_train, y_train]

    with Parallel(n_jobs=6) as parallel:
        trained_svm_models = parallel(delayed(train_svms)(name, taxonomies) for name in taxonomy_names)
    trained_rf_models = [train_rfs(name, taxonomies) for name in taxonomy_names]

    # Flat baseline: one forest predicting the species directly.
    # Seeded so that repeated runs of the notebook give the same baseline.
    single_rf = RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=42)
    single_rf.fit(X_train, species_train)

    # Organizing Classifiers
    taxonomy_models = {
        'svms': dict(zip(taxonomy_names, trained_svm_models)),
        'rfs': dict(zip(taxonomy_names, trained_rf_models)),
    }

    # Taxonomy Predictions
    svm_ypred_train = taxonomy_models["svms"]["All"].predict(X_train)
    svm_ypred_test = taxonomy_models["svms"]["All"].predict(X_test)

    rf_ypred_train = taxonomy_models["rfs"]["All"].predict(X_train)
    rf_ypred_test = taxonomy_models["rfs"]["All"].predict(X_test)

    singlerf_ypred_train = single_rf.predict(X_train)
    singlerf_ypred_test = single_rf.predict(X_test)

    # Predict Species
    X_train_range = np.arange(X_train.shape[0])
    X_test_range = np.arange(X_test.shape[0])

    svm_predictions_train = np.ravel(predict_species(X_train_range, 'train', svm_ypred_train, y_train, taxonomy_models['svms']))
    svm_predictions_test = np.ravel(predict_species(X_test_range, 'test', svm_ypred_test, y_test, taxonomy_models['svms']))

    rf_predictions_train = np.ravel(predict_species(X_train_range, 'train', rf_ypred_train, y_train, taxonomy_models['rfs']))
    rf_predictions_test = np.ravel(predict_species(X_test_range, 'test', rf_ypred_test, y_test, taxonomy_models['rfs']))

    # Appending each fold to the corresponding data structure
    hierarchical_svm_train.append((species_train, svm_predictions_train))  # true, predicted
    hierarchical_svm_test.append((species_test, svm_predictions_test))

    hierarchical_rf_train.append((species_train, rf_predictions_train))  # true, predicted
    hierarchical_rf_test.append((species_test, rf_predictions_test))

    single_rf_train.append((species_train, singlerf_ypred_train))  # true, predicted
    single_rf_test.append((species_test, singlerf_ypred_test))
    print('Fold {} is done!'.format(counter))


# NOTE: the 'linearsvm' key holds the hierarchical SVM results.
mutation_predictions = {'linearsvm': {'train': hierarchical_svm_train, 'test': hierarchical_svm_test},
                        'singlerf': {'train': single_rf_train, 'test': single_rf_test},
                        'hierarchicalrf': {'train': hierarchical_rf_train, 'test': hierarchical_rf_test}}

Fold 1 is done!
Fold 2 is done!
Fold 3 is done!
Fold 4 is done!
Fold 5 is done!
Fold 6 is done!
Fold 7 is done!
Fold 8 is done!
Fold 9 is done!
Fold 10 is done!


### Storing the Predictions

In [6]:
# Serialize the collected per-fold predictions to a pickle file whose name
# carries the current k_number.
output_path = 'mutation_predictions{}.db'.format(k_number)
with open(output_path, 'wb') as handle:
    pickle.dump(mutation_predictions, handle)