# Assignment 3

In [None]:
import preprocess
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

## Get Preprocessed Data

In [None]:
# Run the project's preprocessing step on the training and test CSV files.
# The call returns five objects, unpacked here in the order it yields them.
train_target, train_data, test_data, train_df, test_df = preprocess.preprocess(
    "TrainingSet.csv",
    'TestSet.csv',
    remove_outliers=True,
    remove_low_variance=True,
    limit=None,
)

In [None]:
# Hold back 30% of the labelled data as a final evaluation set.
# - stratify keeps the class ratio identical in both parts, so the hold-out
#   always contains both classes (roc_auc_score needs that).
# - a fixed random_state makes the split, and every score derived from it,
#   reproducible between notebook runs.
X_g_train, X_g_test, y_g_train, y_g_test = train_test_split(
    train_data,
    train_target,
    test_size=0.30,
    stratify=train_target,
    random_state=42,
)

## Init some variables for later use

In [None]:
# Per-model prediction stores, keyed by model name:
#   result_predict -> predictions for the submission test set
#   test_predict   -> predictions for the local hold-out set
result_predict = {}
test_predict = {}

## Random Forest

### Find best parameters for Random Forest

In [None]:
# Grid-search the Random Forest hyper-parameters once per outer CV fold and
# print the best combination found on that fold.
cv = StratifiedKFold(n_splits=5)

# The n_estimators / criterion given here are only starting values;
# GridSearchCV overrides them with every combination from `parameters`.
model = RandomForestClassifier(n_estimators=162, criterion='entropy', n_jobs=-1)

# The grid is the same for every fold, so build it once.
parameters = {'n_estimators': range(100, 200), 'criterion': ('entropy', 'gini')}

for train_idx, _ in cv.split(train_data, train_target):
    X_train, y_train = train_data[train_idx], train_target[train_idx]

    # Use SMOTE to oversample the dataset for better training accuracy.
    # fit_resample is the current name of the former fit_sample method.
    # NOTE(review): the oversampling happens before GridSearchCV makes its own
    # inner split, so synthetic samples can end up in the inner validation
    # folds and flatter the scores.
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    gridSearch = GridSearchCV(model, parameters, cv=5)
    gridSearch.fit(X_train_oversampled, y_train_oversampled)

    print(gridSearch.best_params_)
    


### Use Random Forest with found parameters

In [None]:
# Cross-validate the tuned Random Forest, then score it on the hold-out set.
cv = StratifiedKFold(n_splits=5)

model = RandomForestClassifier(n_estimators=162, criterion='entropy', n_jobs=-1)

# Cross-validate on the training part of the hold-out split only. Splitting
# the full train_data here would train the model on rows that are also in
# X_g_test and make the hold-out score below meaningless.
# np.asarray makes the positional fold indices valid for the labels whatever
# container y_g_train is.
y_fit = np.asarray(y_g_train)
for train_idx, test_idx in cv.split(X_g_train, y_fit):
    X_train, y_train = X_g_train[train_idx], y_fit[train_idx]
    X_test, y_test = X_g_train[test_idx], y_fit[test_idx]

    # Use SMOTE to oversample the training fold only; the validation fold
    # keeps its original class balance.
    # fit_resample is the current name of the former fit_sample method.
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    # Fit and predict
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    print(f'auc: {roc_auc_score(y_test, y_pred)}')

# `model` now holds the fit from the last fold; score it on unseen rows.
y_predict = model.predict(X_g_test)

print(roc_auc_score(y_g_test, y_predict))

In [None]:
# Store the Random Forest votes for the final ensemble.
result_predict['RandomForest'] = np.array(model.predict(test_data))
# Predict from the hold-out FEATURES; the labels (y_g_test) are only used
# later to score these predictions.
test_predict['RandomForest'] = np.array(model.predict(X_g_test))

## k-nearest neighbour
### Find best parameters for k-nearest neighbour

In [None]:
#model = KNeighborsClassifier()
#parameters = {'n_neighbors':range(1,20), 'weights':('uniform', 'distance')}
#gridSearch = GridSearchCV(model, parameters, cv=5)
#gridSearch.fit(X_train, y_train)

# NOTE(review): the k-nearest-neighbour search in this cell is commented out,
# so `gridSearch` is whatever search object an earlier cell left behind (the
# Random Forest search when the cells are run top to bottom). The values
# below are therefore not KNN results.
# As bare expressions, presumably only the last one is echoed by the notebook.
gridSearch.best_params_
gridSearch.best_score_ 
gridSearch.cv_results_['params'][gridSearch.best_index_]


### Use k-nearest neighbour with found parameters

In [None]:
# Cross-validate the k-nearest-neighbour model, then score it on the hold-out.
cv = StratifiedKFold(n_splits=5)

model = KNeighborsClassifier(n_neighbors=20, weights='uniform', n_jobs=-1)

# Cross-validate on the training part of the hold-out split only. Splitting
# the full train_data here would train the model on rows that are also in
# X_g_test and make the hold-out score below meaningless.
# np.asarray makes the positional fold indices valid for the labels whatever
# container y_g_train is.
y_fit = np.asarray(y_g_train)
for train_idx, test_idx in cv.split(X_g_train, y_fit):
    X_train, y_train = X_g_train[train_idx], y_fit[train_idx]
    X_test, y_test = X_g_train[test_idx], y_fit[test_idx]

    # Use SMOTE to oversample the training fold only; the validation fold
    # keeps its original class balance.
    # fit_resample is the current name of the former fit_sample method.
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    # Fit and predict
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    print(f'auc: {roc_auc_score(y_test, y_pred)}')

# `model` now holds the fit from the last fold; score it on unseen rows.
y_predict = model.predict(X_g_test)

print(roc_auc_score(y_g_test, y_predict))

In [None]:
# Store the k-nearest-neighbour votes for the final ensemble.
result_predict['KNeighbors'] = np.array(model.predict(test_data))
# Predict from the hold-out FEATURES; the labels (y_g_test) are only used
# later to score these predictions.
test_predict['KNeighbors'] = np.array(model.predict(X_g_test))

## SVM
### Find best parameters for SVM

In [None]:
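# NOTE: no SVM search is implemented yet; the commented-out lines below are a
# copy of the k-nearest-neighbour search and still need to be replaced.
#model = KNeighborsClassifier()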
#parameters = {'n_neighbors':[1,20], 'weights':('uniform', 'distance')}
#gridSearch = GridSearchCV(model, parameters, cv=5)
#gridSearch.fit(X_train, y_train)

#gridSearch.cv_results_['params'][gridSearch.best_index_]

### Use SVM with found parameters

In [None]:
#KNeighbors = KNeighborsClassifier(n_neighbors=20, weights='uniform')
#KNeighbors.fit(X_train, y_train)
#y_predict = KNeighbors.predict(X_test)

#print(roc_auc_score(y_test, y_predict))

In [None]:
#result_predict['SVM'] = np.array(model.predict(test_data))
#test_predict['SVM'] = np.array(model.predict(X_g_test))

## Neural Network
### Find best parameters for neural network

In [None]:
# Grid-search alpha and the hidden-layer layout of the MLP.
model = MLPClassifier(max_iter=500)

# Candidate layouts: every hidden layer has the same width n.
# The ranges reproduce exactly the entries that were active in the former
# hand-edited list, in the same order:
#   2 layers: n = 6..10
#   3 layers: n = 6..35 and n = 66..77
#   4 layers: n = 28..100
hidden_layer_sizes = (
    [(n,) * 2 for n in range(6, 11)]
    + [(n,) * 3 for n in range(6, 36)]
    + [(n,) * 3 for n in range(66, 78)]
    + [(n,) * 4 for n in range(28, 101)]
)

parameters = {
    'solver': ('adam',),
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
    'hidden_layer_sizes': hidden_layer_sizes,
}

gridSearch = GridSearchCV(model, parameters, cv=5)
# Fit on the explicit training split instead of the X_train / y_train names
# left over from whichever cross-validation cell ran last; this also keeps the
# hold-out rows (X_g_test) out of the hyper-parameter search.
gridSearch.fit(X_g_train, y_g_train)

gridSearch.cv_results_['params'][gridSearch.best_index_]

In [None]:
#a = [2,3,4]
#b= range(1,101)
#for i in a:
#    for j in b:
#        print('(' + ((str(j) + ',') * i)[:-1] + '),' )

### Use neural network with found parameters

In [None]:
# Cross-validate the MLP, then score it on the hold-out set.
cv = StratifiedKFold(n_splits=5)

model = MLPClassifier(solver='adam', alpha=0.001, learning_rate_init=0.0001,
                      hidden_layer_sizes=(25, 10, 10), max_iter=1000)

# Cross-validate on the training part of the hold-out split only. Splitting
# the full train_data here would train the model on rows that are also in
# X_g_test and make the hold-out score below meaningless.
# np.asarray makes the positional fold indices valid for the labels whatever
# container y_g_train is.
y_fit = np.asarray(y_g_train)
for train_idx, test_idx in cv.split(X_g_train, y_fit):
    X_train, y_train = X_g_train[train_idx], y_fit[train_idx]
    X_test, y_test = X_g_train[test_idx], y_fit[test_idx]

    # Use SMOTE to oversample the training fold only; the validation fold
    # keeps its original class balance.
    # fit_resample is the current name of the former fit_sample method.
    sm = SMOTE()
    X_train_oversampled, y_train_oversampled = sm.fit_resample(X_train, y_train)

    # Fit and predict
    model.fit(X_train_oversampled, y_train_oversampled)
    y_pred = model.predict(X_test)

    print(f'auc: {roc_auc_score(y_test, y_pred)}')

# `model` now holds the fit from the last fold; score it on unseen rows.
y_predict = model.predict(X_g_test)

print(roc_auc_score(y_g_test, y_predict))


In [None]:
# Store the neural-network votes for the final ensemble.
result_predict['mlpNetwork'] = np.array(model.predict(test_data))
# Predict from the hold-out FEATURES; the labels (y_g_test) are only used
# later to score these predictions.
test_predict['mlpNetwork'] = np.array(model.predict(X_g_test))

## Combine the model predictions by majority vote and report the rows where the models disagree

In [None]:
def _majority_vote(predictions):
    """Combine per-model 0/1 predictions into one majority-vote column.

    Parameters
    ----------
    predictions : dict
        Maps a model name to its sequence of 0/1 predictions; all sequences
        must have the same length.

    Returns
    -------
    pandas.DataFrame
        One column per model, plus 'Sum' (number of models voting 1) and
        'Final' (1 when more than half of the models vote 1, else 0; a tie
        resolves to 0).

    Also prints the percentage of rows on which the models disagree.
    """
    votes = pd.DataFrame(predictions)
    n_models = votes.shape[1]
    votes['Sum'] = votes.sum(axis=1)

    # A row is "unsure" when the models are not unanimous.
    unsure = (votes['Sum'] > 0) & (votes['Sum'] < n_models)
    votes['Final'] = (votes['Sum'] > n_models / 2).astype(int)

    print(str(int(unsure.sum()) / len(votes) * 100) + '% Unsure')
    return votes


# Votes for the submission test set.
result_df = _majority_vote(result_predict)

# Votes for the local hold-out set. These must come from test_predict (the
# hold-out predictions); result_predict holds predictions for a different set
# of rows and cannot be compared with y_g_test.
t_df = _majority_vote(test_predict)


roc_auc_score(y_g_test, list(t_df['Final']))

### Save Result to file

In [None]:

# Attach the ensemble prediction to the test rows and write the submission.
# Assign by position (.values): wrapping the Series in pd.Series(..., index=...)
# would re-align it by label and produce NaN wherever test_df's index is not
# the default 0..n-1 range.
test_df['QuoteConversion_Flag'] = result_df['Final'].values

# Keep only the two columns written to the submission file.
keep = ['Quote_ID', 'QuoteConversion_Flag']
test_df.drop(columns=[col for col in test_df.columns if col not in keep], inplace=True)
test_df.to_csv('Kaggle_Submission.csv', index=False)
test_df.describe()