# Assignment 3

In [1]:
import preprocess
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

## Get Preprocessed Data

In [2]:
train_target, train_data, test_data, train_df, test_df = preprocess.preprocess(
    "TrainingSet.csv", 'TestSet.csv', limit=2000, remove_low_variance=True, remove_outliers=True)

Limited Sample: 2000
Remove Field_info2 with variance of 0.0013437892283028393
Remove Field_info4 with variance of 0.07113917319019872
Remove Personal_info1 with variance of 0.0044842240940289975
Remove Personal_info4 with variance of 0.0005002501250625312
Remove Property_info1 with variance of 0.12158932319012358
Remove Property_info2 with variance of 0.0
Remove Geographic_info4 with variance of 0.02392337309796038
DataFrame shape after feature selection:(1999, 25)
DataFrame shape after outlier removal:(1832, 25)


In [3]:
X_g_train, X_g_test, y_g_train, y_g_test = train_test_split(train_data, train_target, test_size=0.30)
print(f'Trainset has {train_target.sum()} times 1')

Trainset has 378 times 1


## Init some variables for later use

In [4]:
result_predict = dict()
test_predict = dict()

def kFoldModel(model, X, y):
    cv = StratifiedKFold(n_splits=5)
    for train_idx, test_idx, in cv.split(X, y):
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]

        # Use SMOTE to oversample the dataset for better training accuracy
        sm = SMOTE()
        X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)

        # Fit and predict
        model.fit(X_train_oversampled, y_train_oversampled)
        y_pred = model.predict(X_test)

        print(f'auc: {roc_auc_score(y_test, y_pred)}')
    return model

## Random Forest
### Use Random Forest with found parameters

In [5]:
print('Start Random Forest')
model = RandomForestClassifier(n_estimators=128, criterion='entropy', n_jobs=-1)
model = kFoldModel(model, train_data, train_target)
y_predict = model.predict(X_g_test)
auc_score = roc_auc_score(y_g_test, y_predict)
print(auc_score)

result_predict['RandomForest'] = np.array(model.predict(test_data))
test_predict['RandomForest'] = np.array(model.predict(X_g_test))

Start Random Forest
auc: 0.7150479291011032
auc: 0.6984536082474228
auc: 0.7888406583468982
auc: 0.6710652920962199
auc: 0.7532183908045977
0.9344985875706214


## k-nearest neighbour
### Use k-nearest neighbour with found parameters

In [6]:
print('Start k-nearest neighbour')
model = KNeighborsClassifier(n_neighbors=20, weights='uniform', n_jobs=-1)
model = kFoldModel(model, train_data, train_target)
y_predict = model.predict(X_g_test)
auc_score = roc_auc_score(y_g_test, y_predict)
print(auc_score)

result_predict['KNeighbors'] = np.array(model.predict(test_data))
test_predict['KNeighbors'] = np.array(model.predict(X_g_test))

Start k-nearest neighbour
auc: 0.7065699041417977
auc: 0.7191852052812444
auc: 0.6977301501175619
auc: 0.6907903780068729
auc: 0.670344827586207
0.7327565913370998


## SVM
### Use SVM with found parameters

In [7]:
print('SVM')
model = SVC(gamma='auto', kernel='rbf')
model = kFoldModel(model, train_data, train_target)
y_predict = model.predict(X_g_test)
auc_score = roc_auc_score(y_g_test, y_predict)
print(auc_score)

result_predict['SVM'] = np.array(model.predict(test_data))
test_predict['SVM'] = np.array(model.predict(X_g_test))

SVM
auc: 0.7122671369144511
auc: 0.71545487429915
auc: 0.7225764152649665
auc: 0.6510652920962199
auc: 0.6489655172413793
0.6907172002510985


In [8]:
## Linear SVM
print('Linear SVM')
model = LinearSVC(random_state=0, tol=1e-5, dual=True, loss='squared_hinge')
model = kFoldModel(model, train_data, train_target)
y_predict = model.predict(X_g_test)
auc_score = roc_auc_score(y_g_test, y_predict)
print(auc_score)

result_predict['LinearSVM'] = np.array(model.predict(test_data))
test_predict['LinearSVM'] = np.array(model.predict(X_g_test))

Linear SVM
auc: 0.7604223186833062
auc: 0.7080846446011937
auc: 0.747852233676976
auc: 0.6453608247422681
auc: 0.694712643678161
0.7016046767106089


## Neural Network
### Use neural network with found parameters

In [9]:
print('Start MLPClassifier')
model = MLPClassifier(solver='adam', alpha=0.0001, learning_rate_init=0.001,
                      hidden_layer_sizes=(24, 23, 22), max_iter=1000)
model = kFoldModel(model, train_data, train_target)
y_predict = model.predict(X_g_test)
auc_score = roc_auc_score(y_g_test, y_predict)
print(auc_score)

result_predict['mlpNetwork'] = np.array(model.predict(test_data))
test_predict['mlpNetwork'] = np.array(model.predict(X_g_test))

Start MLPClassifier
auc: 0.7722011213601013
auc: 0.6972327726532825
auc: 0.7810408753843372
auc: 0.7012371134020619
auc: 0.7126436781609194
0.7771892655367231


## Do stuff with unsure rows

In [27]:
result_df = pd.DataFrame(result_predict)
result_df['Sum'] = result_df.sum(axis=1)
final = []
width = len(result_df.keys()) - 1
count = []
# init counter
for i in range(0,width+1):
    count.append(0)
for row in result_df['Sum']:
    if 0 < row < width:
        final.append(1 if row > ((width / 2)+1) else 0)
        count[row] += 1
    else:
        count[row] += 1
        final.append(0 if row == 0 else 1)
result_df['Final'] = final
for i in range(0,width+1):
    print(str(count[i]) + ' times ' + str(i) + ' on TrainTest')
print(str(sum(count[1:-1]) / len(result_df) * 100) + '% Unsure on TrainSet')

print('/n Test:')
# Do it for test to
t_df = pd.DataFrame(test_predict)
t_df['Sum'] = t_df.sum(axis=1)
final = []
width = len(t_df.keys()) - 1
count = []
# init counter
for i in range(0,width+1):
    count.append(0)
for row in t_df['Sum']:
    if 0 < row < width:
        final.append(1 if row > ((width / 2)+1) else 0)
        count[row] += 1
    else:
        count[row] += 1
        final.append(0 if row == 0 else 1)
t_df['Final'] = final
for i in range(0,width+1):
    print(str(count[i]) + ' times ' + str(i) + ' on TrainTest')
print(str(sum(count[1:-1]) / len(t_df) * 100) + '% Unsure on TrainSet')
print(f'AUC: {roc_auc_score(y_g_test, list(t_df["Final"]))}')


0 times 0 on TrainTest
9 times 1 on TrainTest
4898 times 2 on TrainTest
8466 times 3 on TrainTest
9327 times 4 on TrainTest
3376 times 5 on TrainTest
87.05322902285627% Unsure on TrainSet
/n Test:
212 times 0 on TrainTest
60 times 1 on TrainTest
54 times 2 on TrainTest
69 times 3 on TrainTest
77 times 4 on TrainTest
78 times 5 on TrainTest
47.27272727272727% Unsure on TrainSet
AUC: 0.8007297551789078


### Save Result to file

In [None]:
test_df['QuoteConversion_Flag'] = pd.Series(result_df['Final'], index=test_df.index)

todrop = []
for col in test_df.columns:
    if col not in ['Quote_ID', 'QuoteConversion_Flag']:
        todrop.append(col)
test_df.drop(columns=todrop, inplace=True)
test_df.to_csv('Kaggle_Submission.csv', index=False)
print('Written to file')