In [4]:
import os
import ipynb
import warnings
from sklearn.exceptions import FitFailedWarning, ConvergenceWarning 

import numpy as np
import pandas as pd
import seaborn as sns

import statistics
from scipy.stats import loguniform

from matplotlib import pyplot as plt
from matplotlib.ticker import MaxNLocator

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, RandomizedSearchCV

from sklearn.svm import SVC

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.utils._testing import ignore_warnings 
from sklearn.metrics import classification_report

from ipynb.fs.full.preprocessing import preprocessing #import preprocessing class from preprocessing.ipynb

In [5]:
#option 1: call preprocessing class to generate dataframes
#the data directory can be overridden with the OVARIAN_DATA_DIR environment variable so the notebook
#is not tied to one machine; the default is the original location, so existing runs are unaffected
DATA_DIR = os.environ.get('OVARIAN_DATA_DIR', '/Users/jhbyun/Documents/685-Pr/ovarian-cancer-classification/data')

prep_dwt = preprocessing(file_path = os.path.join(DATA_DIR, 'DWT.csv')) #initialize class for DWT_8702
dwt = prep_dwt.label_df() #generate dataframe with labels

prep_wang = preprocessing(file_path = os.path.join(DATA_DIR, 'Wang.csv')) #initialize class for WPD_Wang_8702
wang = prep_wang.label_df() #generate dataframe with labels

prep_jones = preprocessing(file_path = os.path.join(DATA_DIR, 'Jones.csv')) #initialize class for the Jones feature set (Jones.csv)
jones = prep_jones.label_df() #generate dataframe with labels


#NOTE: the block below is an inert string literal (it is never executed); it only documents an alternative way to load the data
'''
#option 2: restore dataframes if previously stored in 00_EDA.ipynb
%store -r dwt8 
dwt8 = dwt8

%store -r wp8
wp8 = wp8
'''

'\n#option 2: restore dataframes if previously stored in 00_EDA.ipynb\n%store -r dwt8 \ndwt8 = dwt8\n\n%store -r wp8\nwp8 = wp8\n'

## SVM (balanced sampling, default parameters)

In [10]:
def svm(df, n_iter=1000, random_state=None):
    """Evaluate a default-parameter SVC on a class-balanced subsample of ``df``.

    The larger of the two classes (``state == 0`` / ``state == 1``) is randomly
    down-sampled once so that both classes have the same number of rows. The
    balanced data is then split into train/test sets ``n_iter`` times; on each
    split the features are standardised (scaler fitted on the training part
    only), an ``SVC()`` is fitted, and the classification report is recorded.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``state`` column holding the labels 0 and 1; every other
        column is used as a feature.
    n_iter : int, default 1000
        Number of random train/test splits to evaluate.
    random_state : int or None, default None
        Seed for the down-sampling and the train/test splits. ``None`` keeps
        the previous behaviour (NumPy's global random state is used).

    Returns
    -------
    pandas.DataFrame
        One row per split with per-class precision/recall/f1-score/support on
        the test set, test accuracy (``accuracy``) and training accuracy
        (``accuracy_train``).
    """
    #a single RandomState shared by all random steps makes a seeded run reproducible end to end
    rng = None if random_state is None else np.random.RandomState(random_state)

    controls = df[df['state'] == 0]
    cases = df[df['state'] == 1]

    #down-sample the larger class to the size of the smaller one instead of relying on a hard-coded count
    n_per_class = min(len(controls), len(cases))
    if len(cases) > n_per_class:
        cases = cases.sample(n=n_per_class, random_state=rng)
    elif len(controls) > n_per_class:
        controls = controls.sample(n=n_per_class, random_state=rng)
    balanced = pd.concat([controls, cases])

    x = balanced.drop(columns='state') #features
    y = balanced['state'] #supervisor

    class_labels = ('0', '1') #keys used by classification_report(output_dict=True) for integer labels 0/1
    class_metrics = ('precision', 'recall', 'f1-score', 'support')
    columns = [f'{label}_{metric}' for label in class_labels for metric in class_metrics]
    columns += ['accuracy', 'accuracy_train']

    records = [] #one dictionary of metrics per train/test split
    for _ in range(n_iter):

        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.67, random_state=rng)

        #fit the scaler on the training part only so no test information leaks into training
        scaler = StandardScaler().fit(x_train)
        x_train_scaled = scaler.transform(x_train)
        x_test_scaled = scaler.transform(x_test)

        svm_model = SVC().fit(x_train_scaled, y_train)

        report = classification_report(y_test, svm_model.predict(x_test_scaled), output_dict=True)
        report_train = classification_report(y_train, svm_model.predict(x_train_scaled), output_dict=True)

        record = {
            f'{label}_{metric}': report[label][metric]
            for label in class_labels
            for metric in class_metrics
        }
        record['accuracy'] = report['accuracy']
        record['accuracy_train'] = report_train['accuracy']
        records.append(record)

    #passing the column list keeps the column layout stable even when n_iter is 0
    return pd.DataFrame(records, columns=columns)

In [23]:
#evaluate the default-parameter SVM on each feature set (same order as before: DWT, Wang, Jones)
acc_df_dwt, acc_df_wang, acc_df_jones = [
    svm(feature_df, n_iter=1000) for feature_df in (dwt, wang, jones)
]

In [25]:
#mean test accuracy over all splits, one line per feature set
for caption, metrics_df in (
    ('DWT Method Accuracy ', acc_df_dwt),
    ('Wang Method Accuracy ', acc_df_wang),
    ('Jones Method Accuracy ', acc_df_jones),
):
    print(caption, round(metrics_df['accuracy'].mean(), 4))

DWT Method Accuracy  0.942
Wang Method Accuracy  0.9594
Jones Method Accuracy  0.8771


## SVM hyperparameter search (balanced sampling, kernel and C tuned by cross-validation)

In [15]:
@ignore_warnings(category=(ConvergenceWarning, FitFailedWarning, UserWarning)) #silence warnings raised while the candidate models are fitted during the search
def svm_search(df, n_iter = 100, random_state=None):
    """Evaluate a cross-validation-tuned SVC on a class-balanced subsample of ``df``.

    The larger of the two classes (``state == 0`` / ``state == 1``) is randomly
    down-sampled once so that both classes have the same number of rows. The
    balanced data is then split into train/test sets ``n_iter`` times; on each
    training part a scaler + SVC pipeline is tuned over kernel and C with
    10-fold stratified cross-validation, and the refitted best pipeline is
    scored on the held-out test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``state`` column holding the labels 0 and 1; every other
        column is used as a feature.
    n_iter : int, default 100
        Number of random train/test splits to evaluate (this is not the number
        of hyperparameter candidates).
    random_state : int or None, default None
        Seed for the down-sampling, the train/test splits, the CV folds and the
        candidate sampling. ``None`` keeps the previous behaviour (NumPy's
        global random state is used).

    Returns
    -------
    pandas.DataFrame
        One row per split with per-class precision/recall/f1-score/support on
        the test set, test accuracy (``accuracy``) and training accuracy
        (``accuracy_train``).
    """
    #a single RandomState shared by all random steps makes a seeded run reproducible end to end
    rng = None if random_state is None else np.random.RandomState(random_state)

    controls = df[df['state'] == 0]
    cases = df[df['state'] == 1]

    #down-sample the larger class to the size of the smaller one instead of relying on a hard-coded count
    n_per_class = min(len(controls), len(cases))
    if len(cases) > n_per_class:
        cases = cases.sample(n=n_per_class, random_state=rng)
    elif len(controls) > n_per_class:
        controls = controls.sample(n=n_per_class, random_state=rng)
    balanced = pd.concat([controls, cases])

    x = balanced.drop(columns='state') #features
    y = balanced['state'] #supervisor

    param_grid = {
        'svm__kernel': ['linear', 'rbf', 'poly'],
        'svm__C':[1, 10, 100, 1000]
    }

    #the grid holds only 3 x 4 = 12 combinations; asking for more candidates than exist makes
    #RandomizedSearchCV warn on every fit and fall back to the full grid, so cap the request
    grid_size = int(np.prod([len(values) for values in param_grid.values()]))
    n_candidates = min(50, grid_size)

    #the search objects do not depend on the split, so build them once; fit() clones the pipeline each time
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=rng)
    pipeline = Pipeline(steps=[("scaler", StandardScaler()), ("svm", SVC())])
    search = RandomizedSearchCV(pipeline, param_grid, n_iter=n_candidates, scoring='accuracy', n_jobs=-1, cv=cv, random_state=rng) #n_iter = number of parameter settings that are sampled

    class_labels = ('0', '1') #keys used by classification_report(output_dict=True) for integer labels 0/1
    class_metrics = ('precision', 'recall', 'f1-score', 'support')
    columns = [f'{label}_{metric}' for label in class_labels for metric in class_metrics]
    columns += ['accuracy', 'accuracy_train']

    records = [] #one dictionary of metrics per train/test split
    for _ in range(n_iter):

        x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.67, random_state=rng)

        #scaling happens inside the pipeline, so it is re-fitted on the training folds only
        result = search.fit(x_train, y_train)

        report = classification_report(y_test, result.predict(x_test), output_dict=True)
        report_train = classification_report(y_train, result.predict(x_train), output_dict=True)

        record = {
            f'{label}_{metric}': report[label][metric]
            for label in class_labels
            for metric in class_metrics
        }
        record['accuracy'] = report['accuracy']
        record['accuracy_train'] = report_train['accuracy']
        records.append(record)

    #passing the column list keeps the column layout stable even when n_iter is 0
    return pd.DataFrame(records, columns=columns)

In [16]:
#evaluate the tuned SVM on each feature set (same order as before: DWT, Wang, Jones)
acc_df_dwt, acc_df_wang, acc_df_jones = [
    svm_search(feature_df, n_iter=500) for feature_df in (dwt, wang, jones)
]

In [17]:
#mean test accuracy over all splits, one line per feature set
for caption, metrics_df in (
    ('DWT Method Accuracy ', acc_df_dwt),
    ('Wang Method Accuracy ', acc_df_wang),
    ('Jones Method Accuracy ', acc_df_jones),
):
    print(caption, round(metrics_df['accuracy'].mean(), 4))

DWT Method Accuracy  0.9259
Wang Method Accuracy  0.9515
Jones Method Accuracy  0.871
