In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.simplefilter('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score

In [3]:
def get_acc_scores(X, y, random_state=0):
    """Cross-validate three classifiers and collect per-fold micro-F1 scores.

    The data is split into 10 stratified, shuffled folds. On every fold an
    RBF-kernel SVC, a linear-kernel SVC and a 10-tree random forest are
    fitted on the training part and scored (``f1_score(average='micro')``)
    on both the held-out part and the training part.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.DataFrame, pandas.Series or array-like
        Class labels, one per row of ``X``. A single-column DataFrame (what
        ``main`` passes) is flattened to 1-D before use.
    random_state : int, default 0
        Seed for the fold shuffling and for the random forest. The default
        keeps the fold assignment used previously (seed 0) and additionally
        makes the random-forest scores repeatable between runs.

    Returns
    -------
    dict of str -> list of float
        Keys ``'svmrad test'``, ``'svmrad train'``, ``'svmlin test'``,
        ``'svmlin train'``, ``'rf test'``, ``'rf train'`` (in that order),
        each holding one score per fold.
    """
    # scikit-learn expects 1-D targets; a column DataFrame only works through
    # an implicit conversion that emits DataConversionWarning on every fit.
    labels = np.ravel(y)

    model_names = ('svmrad', 'svmlin', 'rf')
    acc_scores = {}
    for model_name in model_names:
        acc_scores[model_name + ' test'] = []
        acc_scores[model_name + ' train'] = []

    skf = StratifiedKFold(n_splits=10, random_state=random_state, shuffle=True)
    for train_index, test_index in skf.split(X, labels):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = labels[train_index], labels[test_index]

        classifiers = {
            # NOTE(review): gamma is 1 / (number of training rows), not
            # 1 / (number of features) -- confirm this is the intended scale.
            'svmrad': SVC(gamma=1/len(X_train.index), kernel='rbf'),
            # gamma has no effect on a linear kernel, so it is not passed.
            'svmlin': SVC(kernel='linear'),
            # Seeded so repeated runs give the same forest.
            'rf': RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=random_state),
        }
        for model_name in model_names:
            clf = classifiers[model_name]
            clf.fit(X_train, y_train)
            test_score = f1_score(y_test, clf.predict(X_test), average='micro')
            train_score = f1_score(y_train, clf.predict(X_train), average='micro')
            # Plain floats keep the result trivially JSON-serialisable.
            acc_scores[model_name + ' test'].append(float(test_score))
            acc_scores[model_name + ' train'].append(float(train_score))
    return acc_scores

In [4]:
# Classes whose CSVs get an 'Unnamed: 0' column prepended and their
# 'nucleotide' / 'genus name' columns renamed before use.
# NOTE(review): presumably these files were exported without the index column
# and with different header spellings -- confirm against the data.
_SCHEMA_FIXUP_CLASSES = ('Polypodiopsida', 'Pucciniomycetes')


def _load_class_frame(class_name, k, data_dir):
    """Read ``<data_dir>/<class_name>.Cleaned.k<k>.csv`` and label its rows."""
    frame = pd.read_csv(data_dir + '/' + class_name + '.Cleaned.k' + str(k) + '.csv')
    if class_name in _SCHEMA_FIXUP_CLASSES:
        frame.insert(0, 'Unnamed: 0', np.arange(len(frame)))
        frame = frame.rename(columns={'nucleotide': 'nucleotides', 'genus name': 'genus_name'})
    # The label always sits at column position 3; later code selects it by position.
    frame.insert(3, 'class', class_name)
    return frame


def combine_dataset(classes=('Chiroptera', 'Rodentia', 'Aves'), data_dir='../datasets', k_values=range(1, 8)):
    """Build one merged, class-labelled DataFrame per k value.

    For every ``k`` in ``k_values`` the file
    ``<data_dir>/<class>.Cleaned.k<k>.csv`` is read for each entry of
    ``classes``, a ``'class'`` column holding the class name is inserted at
    position 3, and the per-class frames are stacked in the order given with
    a fresh 0..n-1 index.

    Parameters
    ----------
    classes : sequence of str
        Class names to merge. The default is the three classes that were
        actually merged before. Polypodiopsida and Pucciniomycetes used to be
        read and reshaped on every iteration but were never added to the
        merged frame; they are no longer read unless listed here.
    data_dir : str
        Directory containing the CSV files.
    k_values : iterable of int
        The k values to load; defaults to 1 through 7.

    Returns
    -------
    list of pandas.DataFrame
        One merged frame per k value, in iteration order of ``k_values``.
    """
    return [
        pd.concat(
            [_load_class_frame(class_name, k, data_dir) for class_name in classes],
            ignore_index=True,
        )
        for k in k_values
    ]

In [5]:
def main(class_, name):
    """Score the classifiers on every frame in ``class_``.

    Each frame is split positionally: column 3 is taken as the label and
    columns 4 onward as the features. The features are standardised with a
    fresh ``StandardScaler`` and handed, together with the labels, to
    ``get_acc_scores``.

    Parameters
    ----------
    class_ : sequence of pandas.DataFrame
        One frame per k value, in order; the first frame is reported as k1.
    name : str
        Prefix for the result keys.

    Returns
    -------
    dict
        Maps ``name + '.k' + str(position)`` (positions start at 1) to the
        score dictionary returned by ``get_acc_scores`` for that frame.
    """
    clfs_acc = {}
    for k, frame in enumerate(class_, start=1):
        raw_features = frame.iloc[:, 4:]
        # NOTE(review): the scaler is fitted on every row before the folds are
        # drawn in get_acc_scores, so held-out rows influence the scaling.
        scaled_values = StandardScaler().fit_transform(raw_features)
        X = pd.DataFrame(scaled_values, columns=raw_features.columns)
        # Labels stay a one-column DataFrame, as get_acc_scores indexes them in 2-D.
        y = pd.DataFrame(frame.iloc[:, 3])
        result_key = name + '.k' + str(k)
        clfs_acc[result_key] = get_acc_scores(X, y)
    return clfs_acc

In [6]:
import json

In [7]:
# Run the whole evaluation first and open the output file only afterwards:
# opening with 'w' truncates any existing results, so doing it before the
# long-running computation would destroy them if that computation failed.
merged_dfs = combine_dataset()
f1_results = main(merged_dfs, 'merged dataset f1_score')
with open('non-hierarchical class_pred f1_scores.json', 'w') as out_file:
    json.dump(f1_results, out_file)