# Assignment 3

In [1]:
import preprocess
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

## Get Preprocessed Data

In [2]:
# Run both CSV files through the project's preprocess helper. It hands back
# five objects, unpacked here as: training labels, training features,
# test features, and the training / test data frames.
preprocess_options = dict(limit=None, remove_low_variance=True, remove_outliers=True)
(train_target,
 train_data,
 test_data,
 train_df,
 test_df) = preprocess.preprocess("TrainingSet.csv", 'TestSet.csv', **preprocess_options)

In [3]:
# Hold out 30% of the labelled data for a final check of each model.
# A fixed seed makes the split (and every score derived from it) repeatable
# across kernel restarts; stratifying on the target keeps the class ratio the
# same in the training part and the held-out part.
SPLIT_SEED = 42
X_g_train, X_g_test, y_g_train, y_g_test = train_test_split(
    train_data, train_target, test_size=0.30, random_state=SPLIT_SEED, stratify=train_target)

## Initialise containers for the per-model predictions

In [4]:
# Per-model predictions, keyed by model name:
#   result_predict -> predictions for test_data (the rows that end up in the submission)
#   test_predict   -> predictions for the held-out split X_g_test
result_predict, test_predict = {}, {}

## Random Forest
### Random Forest with the previously selected parameters

In [6]:
cv = StratifiedKFold(n_splits=5)

# Cross-validate on the training part of the hold-out split only. Folding over
# the full train_data would also train on the X_g_test rows and make the final
# hold-out score below meaningless.
# Labels are converted to a plain array so the positional fold indices work
# even if the split returned a pandas object with a shuffled index.
y_cv = np.asarray(y_g_train)

model = RandomForestClassifier(n_estimators=128, criterion='entropy', n_jobs=-1)
for train_idx, test_idx in cv.split(X_g_train, y_cv):
    X_train, y_train = X_g_train[train_idx], y_cv[train_idx]
    X_test, y_test = X_g_train[test_idx], y_cv[test_idx]

    # Oversample with SMOTE inside the fold, on the training rows only, so the
    # validation rows of this fold stay untouched.
    # fit_resample is the current imbalanced-learn method name (fit_sample was
    # a deprecated alias that newer releases no longer provide).
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    # Fit and predict
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    # NOTE: the AUC is computed from hard class labels, not probability scores.
    print(f'auc: {roc_auc_score(y_test, y_pred)}')

# `model` now holds the fit from the last fold; score it on the hold-out split.
y_predict = model.predict(X_g_test)

print(roc_auc_score(y_g_test, y_predict))

In [7]:
# Record the random forest's hard predictions in both stores: first for the
# real test data, then for the held-out split.
for store, features in ((result_predict, test_data), (test_predict, X_g_test)):
    store['RandomForest'] = np.array(model.predict(features))

## k-nearest neighbour
### k-nearest neighbour with the previously selected parameters

In [9]:
cv = StratifiedKFold(n_splits=5)

# Cross-validate on the training part of the hold-out split only. Folding over
# the full train_data would also train on the X_g_test rows and make the final
# hold-out score below meaningless.
# Labels are converted to a plain array so the positional fold indices work
# even if the split returned a pandas object with a shuffled index.
y_cv = np.asarray(y_g_train)

model = KNeighborsClassifier(n_neighbors=20, weights='uniform', n_jobs=-1)
for train_idx, test_idx in cv.split(X_g_train, y_cv):
    X_train, y_train = X_g_train[train_idx], y_cv[train_idx]
    X_test, y_test = X_g_train[test_idx], y_cv[test_idx]

    # Oversample with SMOTE inside the fold, on the training rows only, so the
    # validation rows of this fold stay untouched.
    # fit_resample is the current imbalanced-learn method name (fit_sample was
    # a deprecated alias that newer releases no longer provide).
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    # Fit and predict
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    # NOTE: the AUC is computed from hard class labels, not probability scores.
    print(f'auc: {roc_auc_score(y_test, y_pred)}')

# `model` now holds the fit from the last fold; score it on the hold-out split.
y_predict = model.predict(X_g_test)

print(roc_auc_score(y_g_test, y_predict))

In [10]:
# Record the k-nearest-neighbour hard predictions in both stores: first for
# the real test data, then for the held-out split.
for store, features in ((result_predict, test_data), (test_predict, X_g_test)):
    store['KNeighbors'] = np.array(model.predict(features))

## SVM
### SVM with the previously selected parameters

In [12]:
# Train and score on the explicit hold-out split rather than on fold variables
# (X_train, y_train, X_test, y_test) left behind by an earlier cross-validation
# cell, so the result does not depend on which cell happened to run last.
# NOTE(review): unlike the other models, the SVM is fit without SMOTE
# oversampling -- confirm that this is intended.
model = SVC(gamma='auto')
model.fit(X_g_train, y_g_train)
y_predict = model.predict(X_g_test)

# NOTE: the AUC is computed from hard class labels, not decision scores.
print(roc_auc_score(y_g_test, y_predict))

In [13]:
# Record the SVM's hard predictions in both stores: first for the real test
# data, then for the held-out split.
for store, features in ((result_predict, test_data), (test_predict, X_g_test)):
    store['SVM'] = np.array(model.predict(features))

## Neural Network
### Neural network with the previously selected parameters

In [None]:
cv = StratifiedKFold(n_splits=5)

# Cross-validate on the training part of the hold-out split only. Folding over
# the full train_data would also train on the X_g_test rows and make the final
# hold-out score below meaningless.
# Labels are converted to a plain array so the positional fold indices work
# even if the split returned a pandas object with a shuffled index.
y_cv = np.asarray(y_g_train)

model = MLPClassifier(solver='adam', alpha=0.001, learning_rate_init=0.0001,
                      hidden_layer_sizes=(25, 10, 10), max_iter=1000)
for train_idx, test_idx in cv.split(X_g_train, y_cv):
    X_train, y_train = X_g_train[train_idx], y_cv[train_idx]
    X_test, y_test = X_g_train[test_idx], y_cv[test_idx]

    # Oversample with SMOTE inside the fold, on the training rows only, so the
    # validation rows of this fold stay untouched.
    # fit_resample is the current imbalanced-learn method name (fit_sample was
    # a deprecated alias that newer releases no longer provide).
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    # Fit and predict
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    # NOTE: the AUC is computed from hard class labels, not probability scores.
    print(f'auc: {roc_auc_score(y_test, y_pred)}')

# `model` now holds the fit from the last fold; score it on the hold-out split.
y_predict = model.predict(X_g_test)

print(roc_auc_score(y_g_test, y_predict))

In [None]:
# Record the neural network's hard predictions in both stores: first for the
# real test data, then for the held-out split.
for store, features in ((result_predict, test_data), (test_predict, X_g_test)):
    store['mlpNetwork'] = np.array(model.predict(features))

## Combine the models by majority vote and report the rows where they disagree

In [None]:
def majority_vote(predictions):
    """Combine per-model predictions into a single majority-vote frame.

    Parameters
    ----------
    predictions : dict
        Maps a model name to that model's array of predicted labels. All
        arrays must have the same length. The vote is a plain row sum, so the
        labels are assumed to be 0/1 -- TODO confirm against the target column.

    Returns
    -------
    votes_df : pandas.DataFrame
        One column per model, plus 'Sum' (how many models predicted 1) and
        'Final' (1 only when strictly more than half of the models predicted
        1, so a tie resolves to 0).
    unsure_pct : float
        Percentage of rows on which the models do not all agree
        (0.0 when there are no rows).
    """
    votes_df = pd.DataFrame(predictions)
    n_models = votes_df.shape[1]  # taken before the helper columns are added
    votes_df['Sum'] = votes_df.sum(axis=1)

    # A row is "unsure" when some, but not all, of the models voted 1.
    unsure = (votes_df['Sum'] > 0) & (votes_df['Sum'] < n_models)

    # Unanimous rows fall out of the same comparison: Sum == 0 gives 0 and
    # Sum == n_models gives 1.
    votes_df['Final'] = (votes_df['Sum'] > n_models / 2).astype(int)

    unsure_pct = float(unsure.mean() * 100) if len(votes_df) else 0.0
    return votes_df, unsure_pct


# Vote on the real test data; 'Final' is what gets written to the submission.
result_df, result_unsure_pct = majority_vote(result_predict)
print(str(result_unsure_pct) + '% Unsure')

# Same vote on the held-out split, where the true labels are known.
t_df, holdout_unsure_pct = majority_vote(test_predict)
print(str(holdout_unsure_pct) + '% Unsure')


# AUC of the voted hard labels on the held-out split.
roc_auc_score(y_g_test, list(t_df['Final']))

### Save Result to file

In [None]:

# Attach the voted predictions by position. result_df carries a fresh 0..n-1
# index, so wrapping it in pd.Series(..., index=test_df.index) would re-align
# by label and silently produce NaN wherever test_df's index differs.
# Assumes the rows of test_data are in the same order as test_df -- TODO
# confirm against preprocess.
final_predictions = result_df['Final'].values
if len(final_predictions) != len(test_df):
    raise ValueError(
        f'Got {len(final_predictions)} predictions for {len(test_df)} test rows')
test_df['QuoteConversion_Flag'] = final_predictions

# Keep only the two columns written to the submission file.
submission_columns = ['Quote_ID', 'QuoteConversion_Flag']
todrop = [col for col in test_df.columns if col not in submission_columns]
test_df.drop(columns=todrop, inplace=True)
test_df.to_csv('Kaggle_Submission.csv', index=False)
test_df.describe()