# 1. Classification

## Compare the performance of 8 classifiers across 8 classification datasets.

### Import Required Libraries

In [2]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

from scipy.io.arff import loadarff

### Setup

In [2]:
# Fraction of every dataset that goes into the training split.
TRAIN_SIZE = 0.8

# Display labels for the classifiers, in the same order generate_models() builds them.
# The first entry is LogisticRegression (a classifier), not linear regression.
MODEL_TYPES = [
    "Logistic Regression",
    "SVM",
    "Decision Tree",
    "Random Forest",
    "K-nearest Neighbors",
    "AdaBoost",
    "Gaussian Naive Bayes",
    "Neural Network",
]

# Display labels for the datasets, in the order their cells call store_data().
DATA_SOURCES = [
    "Diabetic Retinopathy",
    "Default of Credit Card Clients",
    "Breast Cancer Wisconsin",
    "Statlog",
    "Adult",
    "Yeast",
    "Thoracic Surgery",
    "Seismic_Bumps",
]

# Store data sets in order of DATA_SOURCES.
X_training_sets = []
X_testing_sets = []
y_training_sets = []
y_testing_sets = []

def store_data(X, y):
    """Split one dataset into train/test parts, impute missing values, and store them.

    The four resulting arrays are appended to the module-level lists
    X_training_sets, X_testing_sets, y_training_sets and y_testing_sets, so
    the call order determines each dataset's index in those lists.

    Parameters
    ----------
    X : array-like
        Feature table; converted with np.array() before splitting.
    y : array-like
        Target values aligned with the rows of X.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        np.array(X), np.array(y), train_size=TRAIN_SIZE, random_state=0
    )
    imp = SimpleImputer(strategy='mean')
    X_training_sets.append(imp.fit_transform(X_train))
    # Impute the test split with the means learned from the training split.
    # Re-fitting on the test data would leak test statistics into preprocessing
    # and could drop a different set of all-missing columns than in training.
    X_testing_sets.append(imp.transform(X_test))
    y_training_sets.append(y_train)
    y_testing_sets.append(y_test)

## Dataset 1: Diabetic Retinopathy

In [3]:
# Load the ARFF file and swap its numbered attribute names ('0'..'18') for
# readable labels; the list position of each label is the attribute number.
raw_data = loadarff('datasets/diabetic_messidor_features.arff')
diabetes_df = pd.DataFrame(raw_data[0]).rename(
    columns={
        str(position): label
        for position, label in enumerate([
            'Quality',
            'Pre Screening Result',
            '#MAs at alpha=0.5',
            '#MAs at alpha=0.6',
            '#MAs at alpha=0.7',
            '#MAs at alpha=0.8',
            '#MAs at alpha=0.9',
            '#MAs at alpha=1',
            '#Exudates 1',
            '#Exudates 2',
            '#Exudates 3',
            '#Exudates 4',
            '#Exudates 5',
            '#Exudates 6',
            '#Exudates 7',
            '#Exudates 8',
            'euclidean distance of the center of the macula and the center of the optic disc',
            'diameter of the optic disc',
            'AM/FM Classification',
        ])
    }
)

In [4]:
# Target is the 'AM/FM Classification' column; the features are every other
# column except 'Class'.
# NOTE(review): 'Class' is discarded rather than used as the label -- confirm
# that predicting 'AM/FM Classification' is the intended task.
Y = diabetes_df['AM/FM Classification']
X = diabetes_df.drop(columns=['AM/FM Classification', 'Class'])
store_data(X, Y)

## Dataset 2: Default of credit card clients

In [5]:
# Load the credit-card table, reading every column as an integer.
credit_df = pd.read_csv('datasets/defaultofcreditcardclients.csv', sep=',', dtype=int)

# Target is the default flag; 'ID' is removed from the features along with it.
Y = credit_df['default payment next month']
X = credit_df.drop(columns=['default payment next month', 'ID'])
store_data(X, Y)

## Dataset 3: Breast Cancer Wisconsin

In [6]:
# The data file has no header row; these are the 11 columns listed in its .names file.
names = ["ID",
         "Clump Thickness",
         "Uniformity of Cell Size",
         "Uniformity of Cell Shape",
         "Marginal Adhesion",
         "Single Epithelial Cell Size",
         "Bare Nuclei",
         "Bland Chromatin",
         "Normal Nucleoli",
         "Mitoses",
         "Class"]
# Missing values are written as '?'. Parsing them as NaN while reading keeps
# the affected columns numeric instead of leaving them as strings that would
# have to be coerced later; store_data() fills the NaNs by mean imputation.
cancer_df = pd.read_csv("datasets/breast-cancer-wisconsin.data", names=names, na_values='?')

# Target is 'Class'; 'ID' is removed from the features along with it.
Y = cancer_df['Class']
X = cancer_df.drop('Class', axis=1).drop('ID', axis=1)
store_data(X, Y)

## Dataset 4: Statlog (German credit data)

In [7]:
# Import data. The file is whitespace-delimited with no header row, so columns
# are numbered from 0. Assume last col (24) is assessment.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
statlog_df = pd.read_csv('datasets/german.data-numeric', sep=r'\s+', header=None)

Y = statlog_df[24]
X = statlog_df.drop(24, axis=1)
store_data(X, Y)

## Dataset 5: Adult

In [8]:
# Load the headerless Adult file (columns are numbered) and one-hot encode the
# text columns in a single step.
adult_df = pd.get_dummies(pd.read_csv('datasets/adult.data', header=None))

# Target is the '>50K' indicator produced from column 14. Its complementary
# '<=50K' indicator is dropped from the features as well, since it would
# reveal the label.
Y = adult_df['14_ >50K']
X = adult_df.drop(columns=['14_ >50K', '14_ <=50K'])
store_data(X, Y)

## Dataset 6: Yeast

In [9]:
# Import data. The file is whitespace-delimited with no header row.
# sep=r'\s+' is the documented equivalent of the deprecated delim_whitespace=True.
yeast_df = pd.read_csv('datasets/yeast.data', sep=r'\s+', header=None)
# Ignore Sequence Name (column 0) as it's unique
yeast_df = yeast_df.drop(0, axis=1)
# Convert word data into cols of yes/no
yeast_df = pd.get_dummies(yeast_df)

# Binary task: cytosolic (CYT) or not. Predicting the other localisation
# classes too would make this a multi-class problem.
Y = yeast_df['9_CYT']
# The first 8 columns are taken as features; the '9_*' indicator columns
# created by get_dummies are left out.
X = yeast_df.iloc[:, :8]
store_data(X, Y)


## Dataset 7: Thoracic Surgery Data

In [10]:
# Import data
raw_data = loadarff('datasets/ThoraricSurgery.arff')
thoracic_df = pd.DataFrame(raw_data[0])
# Encode each T/F attribute as ONE integer 0/1 column. Replacing the bytes with
# the strings '0'/'1' would leave the columns as object dtype, so get_dummies
# below would still split each of them into two complementary indicator
# columns and double its weight.
tf_codes = {b'F': 0, b'T': 1}
for column in thoracic_df.select_dtypes(include='object').columns:
    if thoracic_df[column].isin(list(tf_codes)).all():
        thoracic_df[column] = thoracic_df[column].map(tf_codes).astype(int)

# Target is the 'Risk1Yr' flag (now integer 0/1); the remaining non-numeric
# columns are one-hot encoded.
Y = thoracic_df['Risk1Yr']
thoracic_df = thoracic_df.drop('Risk1Yr', axis=1)
X = pd.get_dummies(thoracic_df)
store_data(X, Y)

## Dataset 8: Seismic-Bumps

In [11]:
# Load the ARFF file and turn the byte values b'0' / b'1' into the plain
# strings '0' / '1'.
raw_data = loadarff('datasets/seismic-bumps.arff')
seismic_df = pd.DataFrame(raw_data[0]).replace([b'0'], '0').replace([b'1'], '1')

# Target is the 'class' column; everything else is passed through get_dummies
# to form the features.
Y = seismic_df['class']
seismic_df = seismic_df.drop(columns=['class'])
X = pd.get_dummies(seismic_df)
store_data(X, Y)

### Model Phase

In [12]:

def generate_models():
    """Return a new list of unfitted classifiers, one per entry of MODEL_TYPES.

    Fresh instances are created on every call, so each dataset can be fitted
    with its own independent set of models. Every estimator that accepts a
    seed is given random_state=0.
    """
    return [
        LogisticRegression(fit_intercept=False, random_state=0),  # Logistic Regression
        LinearSVC(C=100, random_state=0),                         # SVM
        DecisionTreeClassifier(max_depth=100, random_state=0),    # Decision Tree
        RandomForestClassifier(random_state=0),                   # Random Forest
        KNeighborsClassifier(n_neighbors=8),                      # K-nearest Neighbors
        AdaBoostClassifier(random_state=0, n_estimators=100),     # AdaBoost
        GaussianNB(),                                             # Gaussian Naive Bayes
        MLPClassifier(random_state=0),                            # Neural Network
    ]

### Training Phase

In [13]:
# trained_models[i] holds the fitted classifiers for DATA_SOURCES[i], in the
# order returned by generate_models().
trained_models = []
for index, source in enumerate(DATA_SOURCES):
    print(f"Training: {source} with index {index}")
    train_X, train_y = X_training_sets[index], y_training_sets[index]
    # fit() returns the estimator itself, so the list collects the fitted models.
    trained_models.append([model.fit(train_X, train_y) for model in generate_models()])

Training: Diabetic Retinopathy with index 0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training: Default of Credit Card Clients with index 1


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training: Breast Cancer Wisconsin with index 2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training: Statlog with index 3




Training: Adult with index 4




Training: Yeast with index 5


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Training: Thoracic Surgery with index 6




Training: Seismic_Bumps with index 7


### Testing Phase

In [14]:
# Flat score buffer: 8 datasets x 8 models, stored dataset by dataset.
results = np.zeros(8 * 8)
for index_src, source in enumerate(DATA_SOURCES):
    print(f"Testing: {source}")
    row_start = index_src * 8
    test_X, test_y = X_testing_sets[index_src], y_testing_sets[index_src]
    for index, model in enumerate(trained_models[index_src]):
        # score() is evaluated on the held-out split of the same dataset.
        results[row_start + index] = model.score(test_X, test_y)
print("Done")

Testing: Diabetic Retinopathy
Testing: Default of Credit Card Clients
Testing: Breast Cancer Wisconsin
Testing: Statlog
Testing: Adult
Testing: Yeast
Testing: Thoracic Surgery
Testing: Seismic_Bumps
Done


In [15]:
# One row per dataset, one column per model.
results = results.reshape(8, 8)

# Header line: two leading tabs, then the model labels.
print('\t', end='\t')
print('\t'.join(MODEL_TYPES))

# Body: dataset label followed by its tab-separated scores.
for source, row in zip(DATA_SOURCES, results):
    print(source, end='\t')
    print('\t'.join(str(score) for score in row))

		Linear Regression	SVM	Decision Tree	Random Forest	K-nearest Neighbors	AdaBoost	Gaussian Naive Bayes	Neural Network
Diabetic Retinopathy	0.7056277056277056	0.7316017316017316	0.7316017316017316	0.7619047619047619	0.7272727272727273	0.7186147186147186	0.7229437229437229	0.7229437229437229
Default of Credit Card Clients	0.7838333333333334	0.7828333333333334	0.7376666666666667	0.8216666666666667	0.7791666666666667	0.826	0.36466666666666664	0.784
Breast Cancer Wisconsin	0.8642857142857143	0.9714285714285714	0.9357142857142857	0.9785714285714285	0.9785714285714285	0.9714285714285714	0.9571428571428572	0.9714285714285714
Statlog	0.755	0.51	0.67	0.735	0.725	0.76	0.71	0.745
Adult	0.7930293259634577	0.7858129894058038	0.8136035621065562	0.8484569322892677	0.785659450330109	0.8601259020420697	0.7936434822662367	0.2838937509596192
Yeast	0.6531986531986532	0.6632996632996633	0.6868686868686869	0.734006734006734	0.696969696969697	0.7239057239057239	0.35353535353535354	0.6902356902356902
Thoracic S