# Classification

Classifier | Dataset | Score | Rank
--- | --- | --- | ---
SVM/C | Cleaned+MinMaxScaled | `0.968750000000` | 1
RandomForest | Cleaned+MinMaxScaled | `0.942708333333` | 2
RandomForest | Cleaned | `0.942708333333` | 2
SVM/C | Cleaned | `0.942708333333` | 2
RandomForest | Full | `0.937500000000` | 5
SVM/C | Full | `0.932291666667` | 6

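Notes:

- Score is the value returned by each classifier's `score(X_test, Y_test)`, i.e. the mean accuracy on the test split.
- RandomForest with `n_estimators=50`, `oob_score=True` and `random_state=123456`.
- SVM/C with `kernel='linear'`, `C=2.2`.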

In [1]:
# IMPORTS AND NOTEBOOK SETUP
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# LOAD THE DATASETS
# Two CSV variants are read; judging by the file names, the second is presumably
# a manually cleaned version of the first (confirm against the data pipeline).
# NOTE(review): the preview below shows an `Unnamed: 0` column. If that is a row
# index written out by `to_csv`, it will be treated as a feature downstream --
# confirm, and consider `index_col=0` if so.
data_full = pd.read_csv('../dataset-numpy/dataset.csv')
data_clean_manual = pd.read_csv('../dataset-numpy/dataset-clean-manual.csv')

# Preview the first five rows of the cleaned frame.
data_clean_manual.head()

Unnamed: 0,area,contours,radius,hull_radius,centroid_x,centroid_y,weight_0_0,weight_0_1,weight_0_2,weight_0_3,...,weight_2_0,weight_2_1,weight_2_2,weight_2_3,weight_3_0,weight_3_1,weight_3_2,weight_3_3,num_holes,label
0,288.0,36,11.535813,12.633042,14.199074,19.315972,0,38,0,0,...,13,55,54,6,1,46,45,5,1,6
1,348.5,39,10.709997,13.732948,15.272119,14.962219,3,38,51,2,...,0,38,34,0,0,36,45,0,2,8
2,345.0,48,11.186796,14.444992,14.761353,15.188406,0,33,56,14,...,15,64,29,0,13,61,26,0,0,1
3,243.0,42,10.376487,12.915362,18.580247,10.78738,0,37,64,30,...,0,1,39,0,0,20,20,0,0,9
4,213.0,48,9.848953,13.695002,16.989828,15.674491,7,47,28,0,...,0,10,36,13,0,37,50,6,0,3


## Normalization

In [3]:
# Columns handed to the scaler: five named columns plus the 4x4 grid of
# `weight_<x>_<y>` columns, in row-major order.
# NOTE(review): `contours` and `num_holes` appear in the frame preview but are
# not listed here, so they are left unscaled -- confirm this is intended.
columns = ['area', 'radius', 'hull_radius', 'centroid_x', 'centroid_y']
columns += ['weight_%d_%d' % (x, y) for x in range(4) for y in range(4)]
        
def scale(scaler):
    """Fit ``scaler`` on the selected feature columns and return them rescaled.

    Reads the module-level ``data_clean_manual`` frame and ``columns`` list.

    Parameters
    ----------
    scaler : object
        Anything exposing ``fit_transform(frame)``, e.g. a scikit-learn scaler.

    Returns
    -------
    pandas.DataFrame
        The transformed values, labelled with ``columns`` and carrying the same
        row index as ``data_clean_manual``.
    """
    features = data_clean_manual[columns]
    # Reattach the source row index to the transformed values: a freshly built
    # frame would otherwise get 0..n-1 labels, and assigning it back into a
    # frame with any other index would misalign rows (yielding NaN).
    return pd.DataFrame(scaler.fit_transform(features),
                        columns=columns,
                        index=features.index)

### MinMaxScaler

In [4]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the selected feature columns, then write them into a copy of the
# cleaned frame so `data_clean_manual` itself stays untouched.
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split below, so the test rows influence the scaling -- confirm this is acceptable.
scaled_features = scale(MinMaxScaler())
minmaxscaled = data_clean_manual.copy()
minmaxscaled[columns] = scaled_features

### Other scalers (TODO)

## Splitting the dataset

In [5]:
def split_data(data, split):
    """Split a frame into ordered train/test parts and separate features from labels.

    The first ``int(len(data) * split)`` rows become the training part and the
    remaining rows the test part; rows are taken in their existing order (no
    shuffling). Within each part the last column is returned as the target and
    every other column as the features.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose last column holds the label.
    split : float
        Fraction of rows, between 0 and 1 inclusive, assigned to training.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test, Y_test)``.

    Raises
    ------
    ValueError
        If ``split`` lies outside ``[0, 1]``.
    """
    # A length check on the two slices can never fail (they are complementary
    # by construction), so validate the fraction itself instead.
    if not 0 <= split <= 1:
        raise ValueError("Invalid split! Expected a fraction in [0, 1], got %r" % (split,))

    border = int(len(data) * split)
    # .iloc makes the positional intent explicit regardless of the index type.
    train_data = data.iloc[:border]
    test_data = data.iloc[border:]

    X_train = train_data.iloc[:, :-1]
    Y_train = train_data.iloc[:, -1]

    X_test = test_data.iloc[:, :-1]
    Y_test = test_data.iloc[:, -1]
    return X_train, Y_train, X_test, Y_test

# Train on the first 90% of the min-max-scaled rows; hold out the last 10% for scoring.
X_train, Y_train, X_test, Y_test = split_data(data=minmaxscaled, split=0.9)

## Try Random Forest Classifier

In [6]:
# RANDOM FOREST
from sklearn.ensemble import RandomForestClassifier

# A fixed random_state keeps the fitted forest, and so the reported score,
# repeatable across runs. oob_score=True additionally makes fit() compute an
# out-of-bag estimate (rf.oob_score_), which this cell does not display.
rf = RandomForestClassifier(n_estimators=50, oob_score=True, random_state=123456)
rf.fit(X_train, Y_train)

# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('Score: %s' % rf.score(X_test, Y_test))

Score: 0.942708333333


## Try Support Vector Machine

In [7]:
# SVM
from sklearn import svm

# Linear-kernel support vector classifier with the C value quoted in the
# summary table at the top of the notebook.
svc = svm.SVC(kernel='linear', C=2.2)
svc.fit(X_train, Y_train)

# A single pre-formatted argument prints the same text under the Python 2
# print statement and the Python 3 print function.
print('Score: %s' % svc.score(X_test, Y_test))

Score: 0.96875


### Finding the optimal C value

In [None]:
# Sweep C over [1, 3) in steps of 0.01, refitting a linear SVC for each value
# and recording its score on the test split.
# NOTE(review): every candidate is scored on X_test/Y_test, so choosing C from
# this sweep uses the test data for model selection; a score reported on the
# same split afterwards is optimistic.
Cs = np.arange(1, 3, 0.01)
scores = np.zeros(len(Cs))
for index, c in enumerate(Cs):
    svc = svm.SVC(kernel='linear', C=c)
    svc.fit(X_train, Y_train)
    scores[index] = svc.score(X_test, Y_test)

In [None]:
# Report the best score found by the sweep together with the C that produced it
# (argmax returns the first such C when several tie).
best_index = scores.argmax()
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Max: %s' % scores.max())
print('Best C: %s' % Cs[best_index])

# Score as a function of C; labelled so the figure can be read on its own.
plt.plot(Cs, scores)
plt.xlabel('C')
plt.ylabel('Score')
plt.show()