In [1]:
import pandas as pd
import json
import numpy as np
import warnings
warnings.simplefilter('ignore')

# Data Preparation 

## Reading datasets

In [2]:
def read_csv(class_, data_dir='../datasets/', k_values=range(1, 8)):
    """Load the per-k cleaned CSV tables for one dataset.

    Files are expected at ``<data_dir>/<class_>.Cleaned.k<k>.csv``.

    Parameters
    ----------
    class_ : str
        Dataset name used as the file-name prefix (e.g. ``'Aves'``).
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to ``'../datasets/'``.
    k_values : iterable of int, optional
        The k suffixes to load, in order. Defaults to 1 through 7.

    Returns
    -------
    list of pandas.DataFrame
        One frame per entry of ``k_values``, in the same order.

    Raises
    ------
    FileNotFoundError
        If any expected CSV file is missing (raised by ``pd.read_csv``).
    """
    import os  # local import so this cell does not depend on a later edit of the import cell

    return [
        pd.read_csv(os.path.join(data_dir, '{}.Cleaned.k{}.csv'.format(class_, k)))
        for k in k_values
    ]

### Chiroptera Class: k=1, 2, ..., 7

In [3]:
# Load the Chiroptera tables: read_csv returns a list with one DataFrame per k, in ascending k order.
chirop_dfs = read_csv('Chiroptera')

### Rodentia Class: k=1, 2, ..., 7

In [4]:
# Load the Rodentia tables: read_csv returns a list with one DataFrame per k, in ascending k order.
rodent_dfs = read_csv('Rodentia')

### Aves Class: k=1, 2, ..., 7

In [5]:
# Load the Aves tables: read_csv returns a list with one DataFrame per k, in ascending k order.
aves_dfs = read_csv('Aves')

### Polypodiopsida Class: k=1, 2, ..., 7

In [6]:
# Load the Polypodiopsida tables: read_csv returns a list with one DataFrame per k, in ascending k order.
polypod_dfs = read_csv('Polypodiopsida')

### Pucciniomycetes Class: k=1, 2, ..., 7

In [7]:
# Load the Pucciniomycetes tables: read_csv returns a list with one DataFrame per k, in ascending k order.
pucci_dfs = read_csv('Pucciniomycetes')

In [8]:
# Prepend a positional 'Unnamed: 0' column (0..n-1) to every Pucciniomycetes
# and Polypodiopsida frame, shifting all existing columns one place right.
# NOTE(review): presumably the other three datasets already carry such a
# leading column in their CSVs, so this makes the positional slicing in
# main() (label at column 2, features from column 3) line up — confirm.
#
# The membership guard keeps the cell idempotent: DataFrame.insert raises
# ValueError when the column already exists, so re-running this cell without
# reloading the data would otherwise fail.
for df in pucci_dfs + polypod_dfs:
    if 'Unnamed: 0' not in df.columns:
        df.insert(0, 'Unnamed: 0', np.arange(len(df)))

# Learning Models

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import f1_score

In [10]:
def get_acc_scores(X, y):
    """Cross-validate three classifiers and collect per-fold micro-F1 scores.

    Runs a shuffled, stratified 10-fold split (seed 0). In every fold an
    RBF-kernel SVM, a linear-kernel SVM and a 10-tree random forest are fitted
    on the training part and scored on both the training part and the
    held-out part with micro-averaged F1 (which, for single-label
    classification, coincides with accuracy).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix, one row per sample (rows are selected with ``.iloc``).
    y : pandas.DataFrame, pandas.Series or array-like
        Class labels, one per row of ``X``. A single-column 2-D input is
        flattened to 1-D before use.

    Returns
    -------
    dict
        Keys ``'svmrad test'``, ``'svmrad train'``, ``'svmlin test'``,
        ``'svmlin train'``, ``'rf test'`` and ``'rf train'``; each value is a
        list holding one float score per fold.
    """
    # scikit-learn estimators expect 1-D targets; handing them an (n, 1)
    # column triggers a DataConversionWarning on every fit.
    labels = np.asarray(y)
    if labels.ndim == 2 and labels.shape[1] == 1:
        labels = labels.ravel()

    model_names = ('svmrad', 'svmlin', 'rf')
    acc_scores = {}
    for model_name in model_names:
        acc_scores[model_name + ' test'] = []
        acc_scores[model_name + ' train'] = []

    skf = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
    for train_index, test_index in skf.split(X, labels):
        X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
        y_train, y_test = labels[train_index], labels[test_index]

        models = {
            # NOTE(review): gamma is 1 / (number of training rows); sklearn's
            # built-in 'auto' setting is 1 / n_features — confirm this is intended.
            'svmrad': SVC(gamma=1.0 / len(X_train.index), kernel='rbf'),
            # gamma is not used by the linear kernel, so it is not passed here.
            'svmlin': SVC(kernel='linear'),
            # Seeded so that repeated runs of the notebook give the same scores.
            'rf': RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=0),
        }

        for model_name in model_names:
            clf = models[model_name]
            clf.fit(X_train, y_train)
            test_score = f1_score(y_test, clf.predict(X_test), average='micro')
            train_score = f1_score(y_train, clf.predict(X_train), average='micro')
            # Plain floats keep the result directly JSON-serialisable.
            acc_scores[model_name + ' test'].append(float(test_score))
            acc_scores[model_name + ' train'].append(float(train_score))

    return acc_scores

In [11]:
def main(class_, name):
    """Score every per-k table of one dataset.

    For each frame in ``class_`` the columns from position 3 onward are taken
    as features and standardised with ``StandardScaler``; the column at
    position 2 is taken as the label. Both are handed to ``get_acc_scores``.

    NOTE(review): the scaler is fitted on the full table before the
    cross-validation split performed inside ``get_acc_scores``, so test-fold
    statistics influence the scaling — confirm this is acceptable.

    Parameters
    ----------
    class_ : list of pandas.DataFrame
        The per-k tables of one dataset, ordered by k starting at 1.
    name : str
        Dataset name used as the prefix of the result keys.

    Returns
    -------
    dict
        Maps ``'<name>.k<k>'`` to the score dictionary returned by
        ``get_acc_scores`` for that table.
    """
    clfs_acc = {}
    for k, frame in enumerate(class_, start=1):
        raw_features = frame.iloc[:, 3:]
        target = pd.DataFrame(frame.iloc[:, 2])
        scaled_values = StandardScaler().fit_transform(raw_features)
        # Re-wrap the scaled array so the original column names are kept.
        scaled_features = pd.DataFrame(scaled_values, columns=raw_features.columns)
        result_key = name + '.k' + str(k)
        clfs_acc[result_key] = get_acc_scores(scaled_features, target)
    return clfs_acc

In [12]:
def dump_results():
    """Evaluate all five datasets and save the scores as JSON.

    Calls ``main`` once per dataset, using the module-level ``*_dfs`` lists,
    and writes the combined dictionary (keyed by dataset name) to
    ``'non-hierarchical f1_scores.json'`` in the current working directory.
    Returns nothing.
    """
    # (result key, per-k frames) pairs, in the order they appear in the output.
    datasets = (
        ('Pucciniomycetes', pucci_dfs),
        ('Polypodiopsida', polypod_dfs),
        ('Aves', aves_dfs),
        ('Chiroptera', chirop_dfs),
        ('Rodentia', rodent_dfs),
    )
    results = {}
    for label, frames in datasets:
        results[label] = main(frames, label)
    with open('non-hierarchical f1_scores.json', 'w') as out_file:
        json.dump(results, out_file)

In [13]:
# Run the full evaluation and write the scores to 'non-hierarchical f1_scores.json'.
dump_results()