# Import necessary libraries

In [146]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os

from sklearn.feature_selection import *
from sklearn.model_selection import *

from sklearn.ensemble import *

# Work from the project data directory so the relative file names used in the
# cells below resolve. Set the ML_PRJ_DIR environment variable to run the
# notebook on another machine; the original location stays the default.
os.chdir(os.environ.get("ML_PRJ_DIR", "/home/sieu/PycharmProjects/ML-Vincent-Ng/prj_new"))

# Dataset import

In [None]:
# attr.txt is parsed as colon-separated "attr:range" pairs; the 'attr' column
# supplies the column labels for the whitespace-separated data files.
# NOTE: `usecols=all` passes the builtin `all` as a per-column-name predicate;
# it is truthy for every string name, so every column is kept.
column_name = pd.read_table('attr.txt', sep=":", usecols=all, names = ['attr', 'range'])
# Raw string for the regex separator: "\s" in a plain literal is an invalid
# escape sequence and triggers a warning on current Python versions.
df = pd.read_table('train.txt', sep=r"\s+", usecols=all, names = list(column_name['attr']))
# Add validation set. After this concat the rows of dfX_pred are part of df,
# so anything fitted on df has already seen the validation rows.
dfX_pred = pd.read_table('prelim.txt', sep=r"\s+", usecols=all, names = list(column_name['attr']))
df = pd.concat((df,dfX_pred))

In [None]:
# Re-base YEAR to "years since 2000" in both the combined frame and the
# separately kept validation frame.
# Re-running this cell subtracts the offset again.
for frame in (df, dfX_pred):
    frame["YEAR"] = frame["YEAR"] - 2000

In [None]:
# Display (n_rows, n_columns) of the combined train + validation frame.
df.shape

# Build histogram

In [None]:
# Draw one histogram per column of df on a single 90x90 inch figure.
df.hist(figsize=(90, 90))
plt.show()

# Preprocessing

In [None]:
# Separate the class label from the attributes.
# The attributes are every column except the last one; the label is read by
# name from the "Class" column.
X = df.iloc[:, :-1]
attr_lst = X.columns
y = df["Class"].to_numpy()

In [None]:
# Remove all constant-valued features (VarianceThreshold with its default
# threshold keeps only columns whose variance is non-zero).
sel = VarianceThreshold()
# feature_names_in_ is a fitted attribute that fit() (re)sets from the input,
# so assigning it before fitting has no effect and is omitted here.
X = sel.fit_transform(X)
# Map the surviving columns back to their names.
attr_lst = sel.get_feature_names_out(attr_lst)
print(attr_lst)
print("Number of features: ", len(attr_lst))
print(X.shape)

# Select good features

In [None]:
# Forward sequential feature selection down to one third of the current
# feature count, scored with a HistGradientBoosting classifier.
n_fet_to_sel = len(attr_lst) // 3

print("BEGIN: HistGrad classifier feature selection")
# NOTE: the variable is called berNB but holds a SequentialFeatureSelector
# wrapping a HistGradientBoostingClassifier.
berNB = SequentialFeatureSelector(
    estimator=HistGradientBoostingClassifier(
        max_leaf_nodes=60,
        max_iter=3000,
        learning_rate=0.06,
        l2_regularization=0.15,
        max_depth=8,
        max_bins=24,
        early_stopping=True,
        random_state=0,
    ),
    n_features_to_select=n_fet_to_sel,
    direction="forward",
)
berNB.fit(X, y)
# X is a plain array here, so the names are attached by hand before asking
# the selector which ones it kept.
berNB.feature_names_in_ = attr_lst
attr_lst = berNB.get_feature_names_out(attr_lst)
print(attr_lst)
print("Done HistGrad classifier feature selection")

In [None]:
# Persist the selected feature names so later cells can reload them from disk.
with open("histgrad.pkl", 'wb') as fh:
    pickle.dump(attr_lst, fh)

In [None]:
# Display the currently selected feature names.
attr_lst

# Load good features

In [None]:
# Load the good feature list saved by the selection step.
# pickle executes code from the file on load; only open files you produced.
with open("histgrad.pkl", 'rb') as fh:
    stored_names = pickle.load(fh)
# Cast to a string dtype array.
attr_lst = stored_names.astype(str)
print(attr_lst)
print(len(attr_lst))

In [None]:
# Alternative to loading from file: paste a previously printed feature list below and uncomment (use either this cell or the file-loading cell, not both)
# attr_lst = ['B1', 'B3', 'C1', 'C2', 'C3', 'C4', 'C6', 'C7', 'C8', 'C12', 'C14',
#        'C23', 'C32', 'C39', 'C40', 'C41', 'C45', 'C46', 'C51', 'C54',
#        'C77', 'C91', 'C92', 'C99', 'C101', 'C104', 'C105', 'C111', 'C116',
#        'C137', 'C141', 'CT18', 'CT19', 'CT22']
# attr_lst=np.array(attr_lst)

In [None]:
# Build boolean masks over the selected feature names by substring match.
# np.char is the public namespace for the vectorised string helpers;
# np.core.defchararray reaches into a private module and is deprecated as an
# access path in recent NumPy releases.
has_C = np.char.find(attr_lst, "C") != -1
has_CT = np.char.find(attr_lst, "CT") != -1
has_CH = np.char.find(attr_lst, "CH") != -1

# all_C_mask = and(C, not(CT or CH))
all_C_mask = has_C & ~(has_CT | has_CH)
# non_C_mask = not(all_C_mask)
non_C_mask = ~all_C_mask
# final_mask = and(non_C_mask, not(CH_mask))
# i.e. names without any "C", plus names containing "CT" but not "CH".
final_mask = non_C_mask & ~has_CH

# Data Augmentation

In [None]:
# Feature matrix restricted to the selected columns, as a NumPy array.
X_new=df[attr_lst].to_numpy()

In [None]:
# Validation features (selected columns only) and validation labels.
# NOTE: the dataset-import cell also concatenates dfX_pred into df, so these
# rows are part of the training data as well.
X_pred = dfX_pred[attr_lst].to_numpy()
y_pred = dfX_pred["Class"].to_numpy()

In [None]:
# Augment the data with copies of selected stratified folds in which a random
# subset of cells is replaced by NaN. Fold i is corrupted with per-cell
# probability damage*i/100.
damage = 1 #5
threshold = (15,35) #(2,12)  -- inclusive range of fold indices to augment
rng = np.random.default_rng(12345)
# Seed the shuffle too; otherwise fold membership changes on every run even
# though the corruption RNG is seeded.
kfold = sklearn.model_selection.StratifiedKFold(n_splits=100, shuffle=True, random_state=12345)

# Rebuild the base matrix here so re-running this cell does not stack new
# augmentations on an already augmented X_new. A float dtype is required
# because NaN cannot be stored in an integer array.
X_base = df[attr_lst].to_numpy(dtype=float)
y_base = y
X_parts, y_parts = [X_base], [y_base]

for i, (_, test_index) in enumerate(kfold.split(X_base, y_base)):
    if not (threshold[0] <= i <= threshold[1]):
        continue
    # Indexing with an index array returns copies, so the base rows stay intact.
    X_new_aug, y_new_aug = X_base[test_index], y_base[test_index]
    # The mask must be boolean: an integer 0/1 array would be interpreted as
    # row indices and blank out whole rows 0 and 1 instead of random cells.
    aug_mask = rng.random(X_new_aug.shape) < (damage * i) / 100
    X_new_aug[aug_mask] = np.nan
    X_parts.append(X_new_aug)
    y_parts.append(y_new_aug)

# Concatenate once instead of re-copying the growing arrays on every fold.
X_new = np.concatenate(X_parts, axis=0)
y_new = np.concatenate(y_parts)
print((X_new.shape, y_new.shape))

# Training and validation

In [None]:
# Best model validation: stratified 5-fold cross-validation on X_new / y_new.
# NOTE(review): if X_new contains NaN-corrupted copies of original rows, a
# copy can land in the test fold while its source row is in the training
# fold, so these scores are likely optimistic.
# random_state makes the fold assignment (and therefore the printed scores)
# reproducible, matching the seeded classifier.
kfold = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
arr = []
#clf = BernoulliNB()
clf = HistGradientBoostingClassifier(max_leaf_nodes=60,max_iter=3000,learning_rate=0.06,l2_regularization=0.15, max_depth=8, max_bins=24, early_stopping=True, categorical_features=final_mask, random_state=0)
# clf.fit(X_new, y_new)
# val_set_score = clf.score(X_pred, y_pred)
# print(val_set_score)
for train_index , test_index in kfold.split(X_new, y_new):
    X_train, X_test = X_new[train_index], X_new[test_index]
    y_train, y_test = y_new[train_index], y_new[test_index]
    # The same estimator object is refitted on every fold; after the loop clf
    # holds the model trained on the last fold's training split only.
    clf.fit(X_train, y_train)
    cv_set_score = clf.score(X_test, y_test)
    print(cv_set_score)
    arr.append(cv_set_score)
print("Done")
#print("mean acc: ", np.mean(arr))

In [None]:
# Save the current classifier object to model.pkl.
# NOTE(review): when run right after the validation loop, clf was last fitted
# on a single fold's training split, not on all of X_new -- confirm that is
# the model intended for saving.
with open("model.pkl", 'wb') as file:
    pickle.dump(clf, file)

# GridSearch for best model

In [None]:
# Normal Grid
# Exhaustive search over the Cartesian product of the ranges below: on the
# order of 10^5 parameter combinations, each cross-validated by GridSearchCV,
# so expect a very long run.
params = {'max_leaf_nodes':np.arange(10,90, 10), 'l2_regularization':np.arange(0.05,0.4,0.01), 'learning_rate':np.arange(0.01,0.1,0.01), 'max_depth':np.arange(6, 12, 1), 'max_bins':np.arange(16, 34, 2)}
#params = {'max_iter':np.arange(50,200,100),'max_leaf_nodes':np.arange(100,500, 10), 'l2_regularization':np.arange(0,1,0.05)}
# random_state=0 matches the other HistGradientBoostingClassifier instances in
# this notebook; with early_stopping=True the internal validation split is
# random, so an unseeded estimator makes the search non-reproducible.
search = GridSearchCV(estimator=HistGradientBoostingClassifier(max_iter=3000, categorical_features=final_mask, early_stopping=True, random_state=0),
                                param_grid=params, n_jobs=-1).fit(X_new,y_new)

In [None]:
# Score the refitted best estimator on the data it was searched on.
# The labels must be y_new: X_new includes the augmented rows, so the
# un-augmented y has fewer entries and scoring against it raises a
# sample-count mismatch error.
# This is a training-set score, not a held-out estimate.
search.score(X_new, y_new)

In [None]:
# Persist the fitted search object (including its results) to disk.
with open("hyperparam_search.pkl", 'wb') as fh:
    pickle.dump(search, fh)

In [None]:
# Restore a previously saved search object.
# pickle executes code from the file on load; only open files you produced.
with open("hyperparam_search.pkl", 'rb') as fh:
    search = pickle.load(fh)

In [None]:
# Display the parameter combination selected by the search.
search.best_params_

# Evaluation 

In [None]:
# Column names come from attr.txt, parsed as colon-separated "attr:range" pairs.
attr = pd.read_table('attr.txt', sep=":", usecols=all, names = ['attr', 'range'])
# The last attribute name is dropped, so the file read here is expected to
# have no label column.
# NOTE(review): the dataset-import cell reads this same 'prelim.txt' WITH the
# full name list (including the label) -- confirm which file/schema is
# intended for the final test set.
# Raw string for the regex separator ("\s" is an invalid escape otherwise).
dfX_test = pd.read_table('prelim.txt', sep=r"\s+", usecols=all, names = list(attr['attr'])[:-1])
# Apply the same YEAR re-basing that the training and validation frames get;
# without it the model would see YEAR values on a different scale than it was
# trained on (assuming the loaded model was trained by this notebook).
dfX_test["YEAR"] = dfX_test["YEAR"] - 2000

In [None]:
# Reload the selected feature names and display them as a plain list.
# pickle executes code from the file on load; only open files you produced.
with open("histgrad.pkl", 'rb') as fh:
    attr_lst = pickle.load(fh)
list(attr_lst)

In [None]:
# Test feature matrix restricted to the selected columns, as a NumPy array.
X_test = dfX_test[attr_lst].to_numpy()

In [None]:
# Load the trained model, predict the test set, and write one integer label
# per line to prediction.txt.
# NOTE: the file loaded here is not the "model.pkl" written earlier in this
# notebook; it must already exist in the working directory.
# pickle executes code from the file on load; only open files you produced.
with open("77_alldataset.pkl", 'rb') as fh:
    model = pickle.load(fh)
predictions = model.predict(X_test)
np.savetxt(fname="prediction.txt", X=predictions, fmt='%d')