# Import necessary libraries

In [1]:
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import numpy as np
import pickle
import os

from sklearn.feature_selection import *
from sklearn.model_selection import *

from sklearn.ensemble import *

os.chdir("/home/sieu/PycharmProjects/ML-Vincent-Ng/prj_new")

# Dataset import

In [2]:
column_name = pd.read_table('attr.txt', sep=":", usecols=all, names = ['attr', 'range'])
df = pd.read_table('train.txt', sep="\s+", usecols=all, names = list(column_name['attr']))
# Add validation set
dfX_pred = pd.read_table('prelim.txt', sep="\s+", usecols=all, names = list(column_name['attr']))
df = pd.concat((df,dfX_pred))

In [3]:
# Adjust year
df.eval('YEAR = YEAR - 2000', inplace=True)
dfX_pred.eval('YEAR = YEAR - 2000', inplace=True)

In [4]:
df.shape

(17000, 177)

# Build histogram

In [None]:
df.hist(figsize=(90, 90))
plt.show()

# Preprocessing

In [5]:
# Seperate class from attributes
y = df["Class"].to_numpy()
X = df[df.columns[:-1]]
attr_lst = X.columns

In [6]:
# Remove all constant-valued features
sel = VarianceThreshold()
sel.feature_names_in_= attr_lst
X=sel.fit_transform(X)
attr_lst=sel.get_feature_names_out(attr_lst)
print(attr_lst)
print("Number of features: ", len(attr_lst))
print(X.shape)

['B1' 'B2' 'B3' 'C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8' 'C9' 'C10' 'C11'
 'C12' 'C13' 'C14' 'C15' 'C16' 'C17' 'C18' 'C19' 'C20' 'C21' 'C22' 'C23'
 'C24' 'C25' 'C26' 'C27' 'C28' 'C29' 'C30' 'C31' 'C32' 'C33' 'C34' 'C35'
 'C36' 'C37' 'C38' 'C39' 'C40' 'C41' 'C42' 'C43' 'C44' 'C45' 'C46' 'C47'
 'C48' 'C49' 'C50' 'C51' 'C52' 'C53' 'C54' 'C55' 'C56' 'C57' 'C58' 'C59'
 'C60' 'C61' 'C62' 'C63' 'C64' 'C65' 'C66' 'C67' 'C68' 'C69' 'C70' 'C71'
 'C72' 'C73' 'C74' 'C75' 'C76' 'C77' 'C78' 'C79' 'C80' 'C81' 'C82' 'C83'
 'C84' 'C85' 'C86' 'C87' 'C88' 'C89' 'C90' 'C91' 'C92' 'C93' 'C94' 'C95'
 'C96' 'C97' 'C98' 'C99' 'C100' 'C101' 'C102' 'C103' 'C104' 'C105' 'C106'
 'C107' 'C108' 'C109' 'C110' 'C111' 'C112' 'C113' 'C114' 'C115' 'C116'
 'C117' 'C118' 'C119' 'C120' 'C121' 'C122' 'C123' 'C124' 'C125' 'C126'
 'C127' 'C128' 'C129' 'C130' 'C131' 'C132' 'C133' 'C134' 'C135' 'C136'
 'C137' 'C138' 'C139' 'YEAR' 'C140' 'C141' 'C142' 'CT1' 'CT2' 'CT3' 'CT4'
 'CT5' 'CT6' 'CT9' 'CT10' 'CT11' 'CT12' 'CT13' 'CT14' '

# Select good features

In [None]:
n_fet_to_sel = len(attr_lst)//3

print("BEGIN: HistGrad classifier feature selection")
berNB = SequentialFeatureSelector(estimator=HistGradientBoostingClassifier(max_leaf_nodes=60,max_iter=3000,learning_rate=0.06,l2_regularization=0.15, max_depth=8, max_bins=24, early_stopping=True, random_state=0),n_features_to_select=n_fet_to_sel, direction="forward").fit(X, y)
berNB.feature_names_in_= attr_lst
attr_lst = berNB.get_feature_names_out(attr_lst)
print(attr_lst)
print("Done HistGrad classifier feature selection")

In [8]:
# Save feature selection results
with open("histgrad.pkl", 'wb') as file:
    pickle.dump(attr_lst, file)

In [9]:
attr_lst

array(['B1', 'B2', 'C1', 'C2', 'C3', 'C4', 'C6', 'C7', 'C8', 'C10', 'C11',
       'C15', 'C23', 'C26', 'C28', 'C31', 'C41', 'C46', 'C56', 'C71',
       'C76', 'C86', 'C101', 'C116', 'C131', 'C136', 'C137', 'C138',
       'C139', 'YEAR', 'C140', 'C141', 'C142', 'CT1', 'CT2', 'CT3', 'CT4',
       'CT5', 'CT6', 'CT9', 'CT10', 'CT11', 'CT12', 'CT13', 'CT14',
       'CT15', 'CT16', 'CT17', 'CT18', 'CT19', 'CT20', 'CT21', 'CT22',
       'CT23', 'CT24', 'CT25', 'CT26'], dtype='<U4')

# Load good feature

In [None]:
# Load good feature list
with open("histgrad.pkl", 'rb') as file:
    attr_lst = pickle.load(file).astype(str)
print(attr_lst)
print(len(attr_lst))

In [7]:
# Load from clipboard (either load from file or load from clipboard)
attr_lst = ['B1', 'B2', 'C1', 'C2', 'C3', 'C4', 'C6', 'C7', 'C8', 'C10', 'C11',
       'C15', 'C23', 'C26', 'C28', 'C31', 'C41', 'C46', 'C56', 'C71',
       'C76', 'C86', 'C101', 'C116', 'C131', 'C136', 'C137', 'C138',
       'C139', 'YEAR', 'C140', 'C141', 'C142', 'CT1', 'CT2', 'CT3', 'CT4',
       'CT5', 'CT6', 'CT9', 'CT10', 'CT11', 'CT12', 'CT13', 'CT14',
       'CT15', 'CT16', 'CT17', 'CT18', 'CT19', 'CT20', 'CT21', 'CT22',
       'CT23', 'CT24', 'CT25', 'CT26']
attr_lst=np.array(attr_lst)
print(len(attr_lst))

57


In [10]:
# all_C_mask = and(C, not(CT or CH))
all_C_mask = np.logical_and((np.core.defchararray.find(attr_lst,"C")!=-1), np.logical_not(np.logical_or((np.core.defchararray.find(attr_lst,"CT")!=-1),(np.core.defchararray.find(attr_lst,"CH")!=-1))))
# non_C_mask = not(all_C_mask)
non_C_mask = np.logical_not(all_C_mask)
# final_mask = and(non_C_mask, not(CH_mask))
final_mask = np.logical_and(non_C_mask, np.logical_not((np.core.defchararray.find(attr_lst,"CH")!=-1)))

# Data Augmentation

In [11]:
X_new=df[attr_lst].to_numpy()

In [12]:
X_pred = dfX_pred[attr_lst].to_numpy()
y_pred = dfX_pred["Class"].to_numpy()

In [13]:
damage = 1 #5
threshold = (15,35) #(2,12)
rng = np.random.default_rng(12345)
kfold = sklearn.model_selection.StratifiedKFold(n_splits=100, shuffle=True)
y_new = y
i=0

for train_index , test_index in kfold.split(X_new, y_new):
    if i > threshold[1]:
        i+=1
        continue
    if i < threshold[0]:
        i+=1
        continue
    X_new_aug, y_new_aug = X_new[test_index], y_new[test_index]
    aug_mask = rng.choice(a=[0,1], size=X_new_aug.shape, p=[1-(damage*i)/100, (damage*i)/100])
    X_new_aug[aug_mask] = np.nan
    X_new = np.concatenate((X_new, X_new_aug), axis=0)
    y_new = np.concatenate((y_new, y_new_aug))
    i+=1
print((X_new.shape, y_new.shape))

((20570, 57), (20570,))


# Training and validation

In [15]:
# Best model validation
kfold = sklearn.model_selection.StratifiedKFold(n_splits=5, shuffle=True)
arr = []
#clf = BernoulliNB()
clf = HistGradientBoostingClassifier(max_leaf_nodes=60,max_iter=3000,learning_rate=0.06,l2_regularization=0.15, max_depth=8, max_bins=24, early_stopping=True, categorical_features=final_mask, random_state=0)
print("Begin CV")
for train_index , test_index in kfold.split(X_new, y_new):
    X_train, X_test = X_new[train_index], X_new[test_index]
    y_train, y_test = y_new[train_index], y_new[test_index]
    clf.fit(X_train, y_train)
    cv_set_score = clf.score(X_test, y_test)
    print(cv_set_score)
    arr.append(cv_set_score)
print("CV mean score: ", np.mean(arr))
clf.fit(X_new, y_new)
train_score = clf.score(X_new, y_new)
print("Train set score: ", train_score)

Begin CV
0.7792902284880895
0.7751579970831308
0.7812348079727759
0.7736995624696159
0.7817209528439475
CV mean score:  0.7782207097715119
Train set score:  0.876421973748177


In [17]:
# Save models
with open("model.pkl", 'wb') as file:
    pickle.dump(clf, file)

# GridSearch for best model

In [None]:
# Normal Grid
params = {'max_leaf_nodes':np.arange(10,90, 10), 'l2_regularization':np.arange(0.05,0.4,0.01), 'learning_rate':np.arange(0.01,0.1,0.01), 'max_depth':np.arange(6, 12, 1), 'max_bins':np.arange(16, 34, 2)}
#params = {'max_iter':np.arange(50,200,100),'max_leaf_nodes':np.arange(100,500, 10), 'l2_regularization':np.arange(0,1,0.05)}
search = GridSearchCV(estimator=HistGradientBoostingClassifier(max_iter=3000, categorical_features=final_mask, early_stopping=True), 
                                param_grid=params, n_jobs=-1).fit(X_new,y_new)

In [None]:
search.score(X_new, y)

In [None]:
# Save models
with open("hyperparam_search.pkl", 'wb') as file:
    pickle.dump(search, file)

In [None]:
# Load model
with open("hyperparam_search.pkl", 'rb') as file:
    search = pickle.load(file)

In [None]:
search.best_params_

# Evaluation 

In [22]:
attr = pd.read_table('attr.txt', sep=":", usecols=all, names = ['attr', 'range'])
dfX_test = pd.read_table('final-noclass.txt', sep="\s+", usecols=all, names = list(attr['attr'])[:-1])
dfX_test.eval('YEAR = YEAR - 2000', inplace=True)

In [23]:
with open("histgrad.pkl", 'rb') as file:
    attr_lst = pickle.load(file)
list(attr_lst)

['B1',
 'B2',
 'C1',
 'C2',
 'C3',
 'C4',
 'C6',
 'C7',
 'C8',
 'C10',
 'C11',
 'C15',
 'C23',
 'C26',
 'C28',
 'C31',
 'C41',
 'C46',
 'C56',
 'C71',
 'C76',
 'C86',
 'C101',
 'C116',
 'C131',
 'C136',
 'C137',
 'C138',
 'C139',
 'YEAR',
 'C140',
 'C141',
 'C142',
 'CT1',
 'CT2',
 'CT3',
 'CT4',
 'CT5',
 'CT6',
 'CT9',
 'CT10',
 'CT11',
 'CT12',
 'CT13',
 'CT14',
 'CT15',
 'CT16',
 'CT17',
 'CT18',
 'CT19',
 'CT20',
 'CT21',
 'CT22',
 'CT23',
 'CT24',
 'CT25',
 'CT26']

In [24]:
X_test = dfX_test[attr_lst].to_numpy()

In [25]:
# Load model then predict and save prediction
with open("model.pkl", 'rb') as file:
    model = pickle.load(file)
np.savetxt(fname="final.txt", X=model.predict(X_test), fmt='%d')