In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
# import xgboost as xgb
from keras import Model
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from keras.optimizers import Adam
from keras.regularizers import l2, l1
from imblearn.combine import SMOTETomek
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from keras.layers import (
    Bidirectional,
    LSTM,
    Dense,
    Dropout,
    BatchNormalization,
    Input,
    Activation,
)

## Training Data

In [None]:
# Read the cleaned training set and turn boolean values into 1/0 integers
# in a single chained expression.
df_train = pd.read_csv("./clean_train.csv").replace({True: 1, False: 0})
df_train.head()

In [None]:
# Bar chart of how many rows fall into each Target class.
df_train["Target"].value_counts().plot.bar()

In [None]:
# Separate the label column from the remaining feature columns.
targets = df_train["Target"]
train = df_train.drop(columns="Target")

train.shape, targets.shape

## Class Imbalance

In [None]:
# def min_max_scaling(column):
#     min_val = column.min()
#     max_val = column.max()
#     scaled_column = (column - min_val) / (max_val - min_val)
#     return scaled_column

# # scale all columns [0-1]
# train = train.iloc[:, :].apply(min_max_scaling)
# train.head()

In [None]:
# Drop the four "*_nan" columns from the feature matrix
# (presumably missing-value dummies — confirm against the cleaning step).
train = train.drop(
    columns=["Round_nan", "Status_nan", "Geography_nan", "Province_nan"]
)
train.head()

In [None]:
# Rebalance the classes with imbalanced-learn's SMOTETomek; the fixed seed
# keeps the resampled set reproducible between runs.
# Alternative tried earlier: TomekLinks(sampling_strategy='majority')
tl_us = SMOTETomek(sampling_strategy=1.0, random_state=42)
x_resample, y_resample = tl_us.fit_resample(train, targets)

# Resampled feature-matrix shape and per-class counts.
x_resample.shape, y_resample.value_counts()

In [None]:
# Class counts after resampling, as text and as a small count plot.
print(y_resample.value_counts())

plt.figure(figsize=(5, 3))
sns.countplot(x=y_resample, ax=plt.gca())
plt.show()

In [None]:
# Hold out 20% of the resampled data for evaluation (fixed seed).
# NOTE(review): the data is resampled *before* this split, so rows generated
# by the resampler can end up in x_test while the rows they were derived from
# sit in x_train. Scores on this hold-out set are therefore likely optimistic;
# consider splitting first and resampling only the training portion.
x_train, x_test, y_train, y_test = train_test_split(
    x_resample,
    y_resample,
    test_size=0.2,
    random_state=42,
)
x_train.shape, y_train.shape

## Test Data

In [None]:
# clean data
# Read the cleaned test set and encode booleans as 1/0, mirroring the
# treatment of the training file.
test_data = pd.read_csv("./clean_test.csv").replace({True: 1, False: 0})
test_data.head()

In [None]:
# Remove the same four "*_nan" columns that were dropped from the training
# features so both frames keep matching columns.
test_data = test_data.drop(
    columns=["Round_nan", "Status_nan", "Geography_nan", "Province_nan"]
)
test_data.head()

In [None]:
# test_data = test_data.iloc[:, :].apply(min_max_scaling)
# test_data.head()

## Models

#### Naive Bayes Gaussian

In [None]:
# # NB
# grid_search_params = {"var_smoothing": [1e-9, 1e-8, 1e-7, 1e-6, 1e-5]}

# cls_gnb = GaussianNB()

# grid_search = GridSearchCV(
#     estimator=cls_gnb, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(x_train, y_train)

# print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [None]:
# # perform cross val on data
# model = GaussianNB(var_smoothing=1e-07)
# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)
# scores = cross_val_score(model, x_train, y_train, cv=cv, scoring='accuracy')
# print(f"mean score: {scores.mean()}")

In [None]:
# model.fit(x_train, y_train)

#### SVM - SVC - Slow

In [None]:
# # SVC takes a long time to fit
# # "C": [0.08, 0.05, 0.03, 0.01, 0.1, 1, 3, 5, 7, 10],
# grid_search_params = {
#     "C": [0.01, 0.05, 0.1, 1, 5, 10],
#     "kernel": ["linear", "rbf", "poly", "sigmoid"],
#     "degree": [2, 3, 4, 5],
#     "gamma": ["scale", "auto"] + [0.001, 0.01, 0.1, 1],
# }

# cls_svc = SVC()

# grid_search = GridSearchCV(
#     estimator=cls_svc, param_grid=grid_search_params, cv=5, scoring="accuracy"
# )

# grid_search.fit(x_train, y_train)

# print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [None]:
# # perform cross val on data
# model = SVC()
# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)
# scores = cross_val_score(model, x_train, y_train, cv=cv, scoring='accuracy')
# print(f"mean score: {scores.mean()}")

In [None]:
# model.fit(x_train, y_train)

#### Deep Learning

In [None]:
# # x_train.reshape(x_train.shape[0], 23, 3)
# x_train = np.array(x_train).reshape(x_train.shape[0], 23, 3)

In [None]:
# # remove province nan
# dl_data = np.array( train.drop('Province_nan', axis=1)).astype(np.float32)
# dl_data = dl_data.reshape(dl_data.shape[0], 9, -1 )
# dl_data.shape 

In [None]:
# x_test = np.array(x_test).reshape(x_test.shape[0], 23, 3)
# y_test = np.asarray(y_test).astype('int').reshape((-1,1))
# x_test.shape, y_test.shape

In [None]:
# y_train = np.array(y_train)
# y_train

In [None]:
# input_shape = x_train.shape[1: ]

# input_layer = Input(shape=input_shape)

# # units = [256, 128, 64, 32]
# # x = Bidirectional(LSTM(128, return_sequences=True, kernel_regularizer=(l2(0.01)) ))(input_layer)
# x = LSTM(128, return_sequences=True, kernel_regularizer=(l2(0.01)))(input_layer)
# # x = BatchNormalization()(x)
# x = Activation("relu")(x)
# x = Dropout(0.5)(x)
# #
# # x = Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=(l2(0.01))))(x)
# # # x = LSTM(32, return_sequences=True, kernel_regularizer=(l2(0.01)))(x)
# # # x = BatchNormalization()(x)
# # x = Activation("relu")(x)
# # x = Dropout(0.5)(x)
# #
# # x = Bidirectional(LSTM(16, return_sequences=True, kernel_regularizer=(l2(0.01))))(x)
# # # x = LSTM(16, return_sequences=True, kernel_regularizer=(l2(0.01)))(x)
# # # x = BatchNormalization()(x)
# # x = Activation("relu")(x)
# # x = Dropout(0.5)(x)

# output_layer = Dense(1, activation="sigmoid")(x)

# model = Model(inputs=input_layer, outputs=output_layer)

# #
# learning_rate = 0.01
# optim_ = Adam(learning_rate=learning_rate)

# #

# model.compile(optimizer=optim_, loss="binary_crossentropy", metrics=["accuracy"],)

# model.summary()

In [None]:
# y_train = np.asarray(y_train).astype('int').reshape((-1,1))
# x_train.shape, y_train.shape

In [None]:
# epochs= 10
# batch_size = 32
# class_weights = {0: 1.0, 1: 1.5}
# model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.2)

In [None]:
# # remove province nan
# dl_test_data = np.array( test_data.drop('Province_nan', axis=1)).astype(np.float32)
# dl_test_data = dl_test_data.reshape(dl_test_data.shape[0], 9, -1 )
# dl_test_data.shape

# y_pred = model.predict(dl_test_data)

In [None]:
# # y_pred = np.squeeze(y_pred)
# # predictions= ( y_pred >= .5).astype('int')[:,0]
# # predictions
# test_data = np.array(test_data).reshape(test_data.shape[0], 23, 3)
# test_data.shape

#### Multinomial NB

In [None]:
# Multinomial Naive Bayes: grid-search the smoothing strength (`alpha`) and
# whether class priors are learned (`fit_prior`), using 30-fold
# cross-validation scored on accuracy.
grid_search_params = {
    "alpha": [0.1, 0.5, 1.0, 2.0],
    "fit_prior": [True, False],
}

cls_mnb = MultinomialNB()

grid_search = GridSearchCV(
    estimator=cls_mnb, param_grid=grid_search_params, cv=30, scoring="accuracy"
)

grid_search.fit(x_train, y_train)

print(f"Best Params: {grid_search.best_params_}")
print(f"Best Estimator: {grid_search.best_estimator_}")
print(f"Best Score: {grid_search.best_score_}")

# The "Model Evaluation" and "Save to file" cells call `model.predict(...)`,
# but every cell that used to assign `model` is commented out, so a fresh
# Restart & Run All failed with NameError. Bind `model` to the winning
# estimator here; with GridSearchCV's default refit=True it has already been
# refit on all of x_train using the best parameters.
model = grid_search.best_estimator_

In [None]:
# cls_mnb.fit(x_train, y_train)

In [None]:
# predictions = cls_mnb.predict(test_data)
# predictions

#### Random Forest Classifier

In [None]:
# grid_search_params = {
#     'n_estimators': [50, 100, 200],  # Number of trees in the forest
#     'max_depth': [None, 10, 20, 30],  # Maximum depth of each tree
#     'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
#     'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
#     'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider for the best split
# }

# cls_rf = RandomForestClassifier(random_state=42)

# grid_search = GridSearchCV(
#     estimator=cls_rf, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# # print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [None]:
# rf = RandomForestClassifier(
#     random_state=42, max_depth=30, min_samples_leaf=2, min_samples_split=5
# )

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(rf, train, targets, cv=cv, scoring='accuracy')

# print(f"mean score: {scores.mean()}")

In [None]:
# rf.fit(train, targets)
# predictions = rf.predict(test_data)
# predictions

#### KNN

In [None]:
# grid_search_params = {
#     "n_neighbors": [3, 5, 7, 9],
#     "weights": ["uniform", "distance"],
#     "p": [1, 2],
#     # "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
# }

# cls_knn = KNeighborsClassifier()

# grid_search = GridSearchCV(
#     estimator=cls_knn, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# # print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [None]:
# knn = KNeighborsClassifier( n_neighbors=5, weights="distance", p=2 )

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(knn, train, targets, cv=cv, scoring='accuracy')

# print(f"mean score: {scores.mean()}")

In [None]:
# knn.fit(train, targets)
# predictions = knn.predict(test_data)
# predictions

#### DecisionTreeClassifier

In [None]:
# grid_search_params = {
#     "criterion": ["gini", "entropy"],
#     "max_depth": [None, 5, 10, 15, 20],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 2, 4],
# }

# cls_dt = DecisionTreeClassifier()

# grid_search = GridSearchCV(
#     estimator=cls_dt, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [None]:
# dt = DecisionTreeClassifier(
#     criterion="gini", max_depth=10, min_samples_leaf=4, min_samples_split=5
# )
# # dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5)

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(dt, train, targets, cv=cv, scoring="accuracy")

# print(f"mean score: {scores.mean()}")

In [None]:
# dt.fit(train, targets)
# predictions = dt.predict(test_data)
# predictions

#### SGDClassifier

In [None]:
# grid_search_params = {
#     "loss": ["hinge", "log", "modified_huber"],
#     "penalty": ["none", "l1", "l2", "elasticnet"],
#     "alpha": [0.0001, 0.001, 0.01, 0.1],
#     "max_iter": [1000, 2000, 3000],
# }

# cls_sdg = SGDClassifier()

# grid_search = GridSearchCV(
#     estimator=cls_sdg, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# # # print(f"{grid_search.best_params_}\n\n")
# # print(f"{grid_search.best_estimator_}")

In [None]:
# sgd = SGDClassifier(alpha=0.001, loss='modified_huber', max_iter=3000,penalty='elasticnet')

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(sgd, train, targets, cv=cv, scoring='accuracy')

# print(f"mean score: {scores.mean()}")

In [None]:
# sgd.fit(train, targets)
# predictions = sgd.predict(test_data)
# predictions

#### DecisionTreeRegressor

In [None]:
# grid_search_params = {
#     "max_depth": [None, 10, 20, 30],
#     "min_samples_split": [2, 5, 10],
#     "min_samples_leaf": [1, 2, 4],
# }

# cls_dt_reg = DecisionTreeRegressor()

# grid_search = GridSearchCV(
#     estimator=cls_dt_reg, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [None]:
# dt_reg = DecisionTreeRegressor()
# # dt = DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, min_samples_split=5)

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(dt_reg, train, targets, cv=cv, scoring="accuracy")

# print(f"mean score: {scores.mean()}")

In [None]:
# dt_reg.fit(train, targets)
# predictions = dt_reg.predict(test_data)
# predictions

#### MLPClassifier

In [None]:
# grid_search_params = {
#     "hidden_layer_sizes": [(64,), (128,), (256,)],
#     "activation": ["relu", "tanh"],
#     "alpha": [0.0001, 0.001, 0.01],
# }


# cls_mlp = MLPClassifier()

# grid_search = GridSearchCV(
#     estimator=cls_mlp, param_grid=grid_search_params, cv=10, scoring="accuracy"
# )

# grid_search.fit(train, targets)

# print(f"{grid_search.best_params_}\n\n")
# print(f"{grid_search.best_estimator_}")

In [None]:
# # mlp = MLPClassifier(alpha=0.01, hidden_layer_sizes=(64,))
# mlp = MLPClassifier(activation='relu', alpha=0.0001, hidden_layer_sizes=(256,), random_state=1)
# # mlp = MLPClassifier()

# cv = StratifiedKFold(random_state=42, n_splits=10, shuffle=True)

# scores = cross_val_score(mlp, train, targets, cv=cv, scoring="accuracy")

# print(f"mean score: {scores.mean()}")

In [None]:
# mlp.fit(train, targets)
# predictions = mlp.predict(test_data)
# predictions

## Model Evaluation

In [None]:
# Predict on the held-out split produced by train_test_split.
# `model` must already be bound to a fitted estimator by one of the model cells.
y_preds = model.predict(x_test)

In [None]:
# Sanity check: predictions and true labels should have matching lengths.
y_preds.shape, y_test.shape

In [None]:
# F1 score on the held-out split.
f1_score(y_true=y_test, y_pred=y_preds)

In [None]:
# Per-class precision/recall/F1; zero_division=1 reports 1 instead of warning
# when a class receives no predictions.
print(classification_report(y_true=y_test, y_pred=y_preds, zero_division=1))

In [None]:
# Confusion matrix for the held-out split (rows: actual, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=y_preds)
cm

In [None]:
# Render the confusion matrix as an annotated heatmap; labels and title are
# set on the Axes that seaborn returns.
plt.figure(figsize=(5, 3))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues").set(
    xlabel="Predicted",
    ylabel="Actual",
    title="Confusion Matrix",
)
plt.show()

## Save to file

In [None]:
# test_data.shape

In [None]:
# Predict labels for the competition test set with the selected model.
predictions = model.predict(test_data)
# The two commented lines below post-process array output (0.5 threshold, then
# flatten) — presumably for the Keras model variant, which is disabled above.
# predictions = ( predictions >= .5).astype('int')[:,0]
# predictions = predictions.flatten()
predictions

In [None]:
# # unclean data
# Load the sample submission file; its Person_id column is reused when the
# submission frame is built.
submit_sample = pd.read_csv("./SampleSubmission.csv")
submit_sample.head()

In [None]:
# Pair each Person_id from the sample file with its predicted label, cast to
# int. Assumes `predictions` is in the same row order as the sample file —
# TODO confirm.
df_submission = pd.DataFrame(
    {
        "Person_id": submit_sample["Person_id"],
        "Target": predictions.astype(int),
    }
)
df_submission.head()

In [None]:
# Write the submission file without the DataFrame index column.
df_submission.to_csv("submission.csv", index=False)