<a href="https://colab.research.google.com/github/harnalashok/CatEncodersFamily/blob/main/Amazon_Employee_Access_Challenge.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
16th May, 2023


Keep the 'resource' column aside while transforming the other
categorical features. The transformed features still give results
comparable to xgboost trained on the full data that includes 'resource'.

"""

In [None]:
%reset -f

# 1.0 Call libraries
import pandas as pd
import numpy as np



# 1.01
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
#import umap  # Takes long time to import
import seaborn as sns
import xgboost as xgb



# 1.02 Misc
import os,gc , time


# 1.03 Home made modules

os.chdir("C:\\Users\\Ashok\\OneDrive\\Documents\\talkingdata\\26042023_amazon\\")
import utils
from utils import *
import catfamilyenc

# 1.04

import importlib; importlib.reload(utils)
import importlib; importlib.reload(catfamilyenc)



In [None]:
# Google Drive access from Colab through PyDrive.
# Needs to run only once per notebook session.
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Log the Colab user in, then give PyDrive the default application
# credentials so that `drive` can talk to the Drive API.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Print title and id of every file in the Drive root whose
# title contains '.txt'.
#
# Search query reference:
# https://developers.google.com/drive/v2/web/search-parameters
listed = drive.ListFile(
    {'q': "title contains '.txt' and 'root' in parents"}
).GetList()
for file in listed:
    title, file_id = file['title'], file['id']
    print('title {}, id {}'.format(title, file_id))

In [None]:
# 1.05 Folder layout.
# Every folder is derived from the single project folder `dataPath`
# (as `master` already was), so relocating the project needs one edit.
# The resulting strings are identical to the former hard-coded ones.
dataPath = "C:\\Users\\Ashok\\OneDrive\\Documents\\talkingdata\\26042023_amazon\\"
modelsPath = dataPath + "allmodels\\models\\"            # passed to the encoder and to utils save/restore
pathToStoreProgress = dataPath + "allmodels\\progress\\" # passed to the encoder
master = dataPath + "master\\"                           # pickled splits and transformed data
# Relative reads ("train.csv", "test.csv") resolve against this folder.
os.chdir(dataPath)

In [None]:

# 2.0 One RandomState, seeded with 0, shared by the whole program
#     (passed as `random_state` to the split, PCA and several models).
rng = np.random.RandomState(seed=0)


In [None]:
# Read Data
# Both files are read relative to the current working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")   # Does not contain the 'action' column

In [None]:
# Quick inspection. These are bare expressions: in a notebook only the
# last one of a cell is displayed.
test.head()
train.head()
train.shape     # (32769, 10)
train.columns

In [None]:

# pop() removes the 'action' column from train and returns it as the target.
y = train.pop("action")
# Remove the 'id' column from test; the returned Series is discarded.
test.pop("id")

# Class distribution of the target (no mapping is done here)
y.value_counts()
y.value_counts(normalize = True)  # heavily imbalanced (original note: 95%:6%)


# Check nulls. None.
train.isnull().sum()
test.isnull().sum()



In [None]:

# Rename columns with spaces:
# (Left-over template, kept commented out; nothing is renamed here.)
#data = data.rename(columns = { "concave points_mean" : "concave_points_mean",
#                     "concave points_se" : "concave_points_se ",
#                     "concave points_worst" : "concave_points_worst"
#                    }
#                   )

print(train.columns)
len(train.columns)    # 9


## Developing models
# Which are our cat columns
# We will consider few columns
# Ref: https://www.kaggle.com/code/kanncaa1/feature-selection-and-data-visualization

# Columns handed to the encoder. 'resource' is deliberately left out.
cat_cols = ['mgrid', 'rolerollupOne', 'rolerolluptwo', 'roledeptname',
           'roletitle', 'rolefamilydesc', 'rolefamily', 'rolecode']
cat_cols
len(cat_cols)  # 8. 'resource' is not included
# Cardinality of the column that is kept aside
train['resource'].value_counts()
train['resource'].nunique()   # 7518


# Hold out 25% of the rows for evaluation. The split is not stratified;
# `rng` is the shared RandomState, so the result depends on how many
# draws were taken from it before this line.
X_train, X_test, y_train,y_test = train_test_split(train, y, test_size=0.25, random_state=rng)

X_train.shape  #  (24576, 9)
X_test.shape   #  (8193, 9)
y_train.shape  # (24576,)
y_test.shape   # (8193,)

# Save these for future
# (changes the working directory to `master`)
os.chdir(master)
X_train.to_pickle("X_train.pkl")
X_test.to_pickle("X_test.pkl")
y_train.to_pickle("y_train.pkl")
y_test.to_pickle("y_test.pkl")
# Read the data back:
# Lets a later session start from here without redoing the split.
os.chdir(master)
X_train = pd.read_pickle("X_train.pkl")
X_test = pd.read_pickle("X_test.pkl")
y_train= pd.read_pickle("y_train.pkl")
y_test = pd.read_pickle("y_test.pkl")

X_train.shape  #  (24576, 9)
X_test.shape   #  (8193, 9)
y_train.shape  # (24576,)
y_test.shape   # (8193,)


# Interacting columns passed to the encoder.
# The list is empty here (the earlier note said "same as cat columns",
# which does not match the code).
interactingCatCols = []

# Instantiate CustomTransformer class:
# NOTE(review): the meaning of each cMeasures flag and of
# noOfColsToConcat / n_iter / k is defined in catfamilyenc, which is
# not visible here -- confirm against that module.
ct = catfamilyenc.CatFamilyEncoder( cMeasures=[1,1,1,0,None,1,1],
                       noOfColsToConcat = 2,
                       n_iter =1,
                       k = 40,     # Does it matter here?
                       modelsPath = modelsPath,
                       pathToStoreProgress = pathToStoreProgress,
                       saveGraph = True
                       )

# Fit it:
# Only the training split and the 8 columns in cat_cols are used.
ct.fit(X_train,  cat_cols, interactingCatCols)


# Persist the fitted encoder and load it back through the utils helpers,
# so that later sessions can restore it instead of refitting.
utils.savePythonObject(ct, "transformer.pkl", modelsPath)
del ct
ct = utils.restorePythonObject("transformer.pkl", modelsPath)
ct


ct.modelsPath
# Plot one graph file by name.
# NOTE(review): presumably written during fit because saveGraph = True -- confirm.
ct.plotNetworkGraph("mgrid_projected_roledeptname.gml",
                    figsize = (10,10),
                    connected_nodes = True
                    )



# Transform the held-out split with the fitted encoder
out_te = ct.transform(X_test)
out_te.shape      #   (8193, 219)
out_te.columns
# Remove low variance columns
#out_te = utils.removeLowVarCols( out_te , pca = False)
#out_te.shape   #    (8193, 71)
y_test.shape   # (8193,)
out_te.columns

# Save transformed test data; y_test.pkl is written again unchanged.
os.chdir(master)
out_te.to_pickle("X_test_transformed.pkl")
y_test.to_pickle("y_test.pkl")



# Transform the training split (garbage-collect first to free memory)
gc.collect()
out_tr = ct.transform(X_train)


# Save transformed train data; y_train.pkl is written again unchanged.
os.chdir(master)
out_tr.to_pickle("X_train_transformed.pkl")
y_train.to_pickle("y_train.pkl")
out_te.shape   #   (8193, 219)
out_tr.shape   #   (24576, 219)




##############################
## Start reading
## A session can restart here: everything needed is read from `master`.
#############################

os.chdir(master)
train_trans = pd.read_pickle("X_train_transformed.pkl")
test_trans = pd.read_pickle("X_test_transformed.pkl")
X_train = pd.read_pickle("X_train.pkl")
X_test = pd.read_pickle("X_test.pkl")
y_train = pd.read_pickle("y_train.pkl")
y_test=pd.read_pickle("y_test.pkl")

train_trans.shape   #   (24576, 219)
test_trans.shape    #  (8193, 219)
train_trans.columns


# The first 9 columns are treated as the original categorical columns;
# `l` holds the remaining (derived, numeric) column names.
train_trans.columns[:9]
l=train_trans.columns[9:]
l


##############################
## XGBoost on the transformed features only
##############################



# `model = 0` guarantees the name exists, so `del model` cannot fail
# when the cell is re-run or run for the first time.
model= 0
evals_result= {}
del model
# NOTE(review): `evals_result` is passed as a constructor keyword and the
# dict is never read in this file -- confirm the installed xgboost uses it.
model = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.04,      # 0.06
                           max_depth = 13,
                           subsample = 0.9,           # 0.8
                           evals_result = evals_result,
                           random_state = rng,
                           reg_lambda = 1.5,


                           )

## NOTE: THIS IS WITHOUT the 'resource' column
tr_X =  train_trans[l] # Xtrain
test_X = test_trans[l] # Xtest
ytrain = y_train
ytest = y_test


# The held-out split doubles as the early-stopping validation set.
model.fit(tr_X, ytrain.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, ytest.values)],
          eval_metric = ['auc']    # binary classification problem
          )



# auc: 0.81646
model.best_score   # 0.8622657
model.best_iteration # 155
pred = model.predict(test_X)
# Accuracy: pred is an array, so the comparison with ytest is positional.
(pred == ytest).sum()/ytest.size    # 0.948858

print(classification_report(ytest,pred))



"""
              precision    recall  f1-score   support

           0       0.59      0.28      0.38       458
           1       0.96      0.99      0.97      7735

    accuracy                           0.95      8193
   macro avg       0.77      0.63      0.68      8193
weighted avg       0.94      0.95      0.94      8193

            precision    recall  f1-score   support

           0       0.58      0.36      0.45       458
           1       0.96      0.98      0.97      7735

    accuracy                           0.95      8193
   macro avg       0.77      0.67      0.71      8193
weighted avg       0.94      0.95      0.94      8193

"""

##############################
## XGBoost on the original (untransformed) data, for comparison
##############################

# Same define-then-delete idiom: makes `del model_or` safe on any run.
model_or= 0
evals_result= {}
del model_or
model_or = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 15,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = rng
                           )


# Original columns, as read back from X_train.pkl / X_test.pkl
tr_X = X_train # Xtrain
test_X = X_test # Xtest
ytrain = y_train
ytest = y_test


# The held-out split doubles as the early-stopping validation set.
model_or.fit(tr_X, ytrain.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, ytest.values)],
          eval_metric = ['auc']    # binary classification problem
          )



# auc: 0.81646
model_or.best_score   # 0.8593978
pred = model_or.predict(test_X)
(pred == ytest).sum()/ytest.size    # 0.9515440

print(classification_report(ytest.values,pred))

"""
               precision    recall  f1-score   support

           0       0.64      0.27      0.38       458
           1       0.96      0.99      0.97      7735

    accuracy                           0.95      8193
   macro avg       0.80      0.63      0.68      8193
weighted avg       0.94      0.95      0.94      8193


"""





##############################
## PCA
## Project the transformed training features to 2 principal
## components and plot them coloured by the target.
##############################
from sklearn.decomposition import PCA
#from sklearn.decomposition import KernelPCA

## 2D
# kernelpca requires huge RAM. Hangs
# No `del pca` before this line: rebinding the name is enough, and an
# unconditional `del` raises NameError when `pca` does not exist yet.
pca = PCA(n_components= 2, whiten= True, random_state = rng)


train_trans.columns[:9]
l=train_trans.columns[9:]
l

# Check null status and fill it up with median.
# Work on an explicit copy so that filling values neither writes into
# train_trans nor triggers pandas' SettingWithCopyWarning.
da = train_trans[l].copy()
da.isnull().sum().sum()
null_counts = da.isnull().sum()
null_counts[null_counts > 0]
nullcols = list(null_counts[null_counts > 0].index)
nullcols
# Fill up nulls using median
for i in nullcols:
    da[i]= da[i].fillna(da[i].median())

# Check again
da.isnull().sum().sum()
da.columns
da.shape   # (rows of train_trans, number of derived columns)

# Same reasoning as for `pca`: no `del ss` needed before rebinding.
ss = StandardScaler()
# Standardise first, then project.
da = pca.fit_transform(ss.fit_transform(da))
da.shape  #  two columns, since n_components = 2

colnames = ["pc" + str(i) for i in range(da.shape[1])]
colnames
da = pd.DataFrame(da, columns = colnames)
sns.scatterplot(x= da['pc0'], y = da['pc1'], hue = y_train.values)

## How good is PCA
## Train XGBoost on the two principal components alone.

# 80/20 stratified split of the PCA frame. No random_state is given,
# so the split differs from run to run.
Xtrain, Xtest, ytr,yte = train_test_split(da, y_train, test_size = 0.20, stratify=y_train)

# Define-then-delete idiom: makes `del model_pca` safe on any run.
model_pca= 0
evals_result= {}
del model_pca
model_pca = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 15,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = rng
                           )


tr_X = Xtrain # Xtrain
test_X = Xtest # Xtest
ytrain = ytr
ytest = yte


model_pca.fit(tr_X, ytrain.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, ytest.values)],
          eval_metric = ['auc']    # binary classification problem
          )



# auc: 0.81646
model_pca.best_score   # 0.80
pred = model_pca.predict(test_X)
(pred == yte).sum()/yte.size    # 0.9515440

print(classification_report(ytest.values,pred))

"""

                 precision    recall  f1-score   support

           0       0.57      0.16      0.25       288
           1       0.95      0.99      0.97      4628

    accuracy                           0.94      4916
   macro avg       0.76      0.57      0.61      4916
weighted avg       0.93      0.94      0.93      4916


"""


##############################
## tsne
## 2-D t-SNE embedding of the standardised transformed features.
##############################
# Why blobs do not appear together in tsne?
# See StackOverflow:
#    https://stats.stackexchange.com/a/453106/78454


from sklearn.manifold import  TSNE

# Not possible to tsne original data
#  being categorical

train_trans[l].head()
tsne = TSNE(perplexity = 30)  # 30 gives best AUC
                              # Tried 20 and 50
ss = StandardScaler()
# No random_state is set on TSNE, so the embedding varies between runs.
da = tsne.fit_transform(ss.fit_transform(train_trans[l]))
da.shape
da
colnames = ["tsne" + str(i) for i in range(da.shape[1])]
colnames
da = pd.DataFrame(da, columns = colnames)
da.head()

plt.figure(100)
sns.scatterplot(x= da['tsne0'], y = da['tsne1'], hue = y_train.values)


# 80/20 stratified split of the t-SNE frame (no random_state: varies per run)
Xtrain, Xtest, ytr, yte = train_test_split(da, y_train, test_size = 0.20,stratify= y_train )

evals_result= {}
model_tsne = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = 70
                           )


tr_X =  Xtrain
test_X = Xtest



model_tsne.fit(tr_X, ytr,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, yte)],
          eval_metric = ['auc']
          )




model_tsne.best_score   # 0.8215140479448766
pred = model_tsne.predict(test_X)
(pred == yte).sum()/yte.size    # accuracy (recorded report says 0.95; older note said 0.75)

print(classification_report(yte,pred))

"""
auc = 0.82151

                precision    recall  f1-score   support

           0       0.70      0.19      0.30       288
           1       0.95      0.99      0.97      4628

    accuracy                           0.95      4916
   macro avg       0.83      0.59      0.64      4916
weighted avg       0.94      0.95      0.93      4916


"""


############################
# Optuna hyperparameter tuning
###########################
# Ref: https://practicaldatascience.co.uk/machine-learning/how-to-use-optuna-for-xgboost-hyperparameter-tuning
# Maximise f1_score.


import optuna
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import warnings
# Silences every warning from here on.
warnings.filterwarnings('ignore')



# Get our train/test data:
# Transformed data
# Filter out initial 9
#  cat columns. Keep only their
#   numeric transformations
#
# NOTE: the two assignment groups below are alternatives. Run as
# written, the "Original data" group overrides the transformed one;
# skip or comment out the group you do not want.

l=train_trans.columns[9:]
l
tr_X =  train_trans[l] # Xtrain
test_X = test_trans[l] # Xtest
ytrain = y_train
ytest = y_test

# Original data
tr_X =  X_train
test_X = X_test
ytrain = y_train
ytest = y_test



# Optuna, define objective function
def objective(trial):
    """Optuna objective: F1-score of class 0 on the held-out data.

    Draws one set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the module-level ``tr_X`` / ``ytrain`` and
    scores its class predictions on the module-level ``test_X`` /
    ``ytest``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameter values.

    Returns
    -------
    float
        ``f1_score`` computed with ``pos_label = 0``.

    Notes
    -----
    The score is measured on the same ``test_X`` / ``ytest`` that the
    surrounding script later reports on, so the tuned result is
    optimistic.
    """

    # xgboost parameter ranges.
    # suggest_float(..., log=True) replaces the deprecated
    # suggest_loguniform and samples from the same log-uniform range.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'eval_metric': 'auc',
        'use_label_encoder': False
    }

    optuna_model = xgb.XGBClassifier(**params)
    optuna_model.fit(tr_X, ytrain)
    # Make predictions
    y_pred = optuna_model.predict(test_X)

    # Evaluate predictions; class 0 is the label being scored
    f1 = f1_score(ytest, y_pred, pos_label = 0)
    # Maximise f1-score
    return f1



# Create optuna study
study = optuna.create_study(direction='maximize')
# Begin optimization
study.optimize(objective, n_trials=400)
# Can run this function again to optimize further
# (trials accumulate in the same study: 400 + 200)
study.optimize(objective, n_trials=200)


# After study has finished:
print('Number of finished trials: {}'.format(len(study.trials)))

# Best trial
trial = study.best_trial
trial.value   # Best trial result (f1-score)
# Get best parameters:
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


# Use these parameters in our estimator:
# trial.params holds only the sampled values, so the fixed entries of
# the objective ('eval_metric', 'use_label_encoder') are not passed here.
best_params = trial.params

model = xgb.XGBClassifier(**best_params)
model.fit(tr_X, ytrain)

# Make predictions and assessments:
y_pred = model.predict(test_X)
print(classification_report(ytest, y_pred))


"""
Results with transformed features (400 trials):
==================================

               precision    recall  f1-score   support

           0       0.57      0.43      0.49       458
           1       0.97      0.98      0.97      7735

    accuracy                           0.95      8193
   macro avg       0.77      0.70      0.73      8193
weighted avg       0.94      0.95      0.95      8193



              precision    recall  f1-score   support

           0       0.57      0.45      0.50       458
           1       0.97      0.98      0.97      7735

    accuracy                           0.95      8193
   macro avg       0.77      0.71      0.74      8193
weighted avg       0.95      0.95      0.95      8193



Results with original data (400 trials):
===========================

             precision    recall  f1-score   support

           0       0.65      0.40      0.49       458
           1       0.97      0.99      0.98      7735

    accuracy                           0.95      8193
   macro avg       0.81      0.69      0.73      8193
weighted avg       0.95      0.95      0.95      8193
"""


# Also calculate 'auc':
# If f1-score is maximised, there
#  is some slight degradation in 'auc'
y_score = model.predict_proba(test_X)
# Column 1 holds the predicted probability of class 1
roc_auc_score(ytest, y_score[:,1])     # 0.846096/ 0.848417700973570





############################
# Optuna hp tuning
###########################
# Ref: https://practicaldatascience.co.uk/machine-learning/how-to-use-optuna-for-xgboost-hyperparameter-tuning
# Maximise auc

# Call libraries, as usual:
import optuna
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')

# Train/test data
# Filter out initial 9
#  cat columns. Keep only their
#   numeric transformations
#
# NOTE: the two assignment groups below are alternatives. Run as
# written, the "Original data" group overrides the transformed one.
l=train_trans.columns[9:]
l
tr_X =  train_trans[l] # Xtrain
test_X = test_trans[l] # Xtest
ytrain = y_train
ytest = y_test

# Original data
tr_X =  X_train
test_X = X_test
ytrain = y_train
ytest = y_test


def objective(trial):
    """Optuna objective: ROC AUC on the held-out data.

    Draws one set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the module-level ``tr_X`` / ``ytrain`` and
    scores its class-1 probabilities on the module-level ``test_X`` /
    ``ytest``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameter values.

    Returns
    -------
    float
        ``roc_auc_score`` of the predicted class-1 probabilities.

    Notes
    -----
    The score is measured on the same ``test_X`` / ``ytest`` that the
    surrounding script later reports on, so the tuned result is
    optimistic.
    """

    # Parameter ranges.
    # suggest_float(..., log=True) replaces the deprecated
    # suggest_loguniform and samples from the same log-uniform range.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'eval_metric': 'auc',
        'use_label_encoder': False
    }

    optuna_model = xgb.XGBClassifier(**params)
    optuna_model.fit(tr_X, ytrain)
    # Score with probabilities, not hard labels; column 1 is class 1
    y_score = optuna_model.predict_proba(test_X)
    roc_score = roc_auc_score(ytest, y_score[:,1])

    return roc_score




# Create the study and run 200 trials, maximising the objective's AUC
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)



print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


# Refit one model with the best sampled parameters.
# Note: this rebinds the global name `params`.
params = trial.params

model = xgb.XGBClassifier(**params)
model.fit(tr_X, ytrain)


# AUC from the class-1 probabilities, then the label-based report
y_score = model.predict_proba(test_X)
roc_score = roc_auc_score(ytest, y_score[:,1])
roc_score  #  0.8628952219114048

y_pred = model.predict(test_X)
print(classification_report(ytest, y_pred))


"""
                        precision    recall  f1-score   support

           0       0.60      0.28      0.38       458
           1       0.96      0.99      0.97      7735

    accuracy                           0.95      8193
   macro avg       0.78      0.64      0.68      8193
weighted avg       0.94      0.95      0.94      8193


"""
########################################################
# Optuna with SMOTE
# Check if f1-score further improves
########################################################


import optuna
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

# Train/test data
# Filter out initial 9
#  cat columns. Keep only their
#   numeric transformations
l=train_trans.columns[9:]
l
tr_X =  train_trans[l] # Xtrain
test_X = test_trans[l] # Xtest
ytrain = y_train
ytest = y_test


# Oversample the training data only; the test data is left untouched.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(tr_X, ytrain)
X_res.shape  # (46274, 62)


# From here on the objective trains on the resampled data
tr_X = X_res
test_X = test_trans[l] # Xtest
ytrain = y_res
ytest = y_test


# Optuna, define objective function
def objective(trial):
    """Optuna objective: F1-score of class 0 on the held-out data.

    Draws one set of XGBoost hyper-parameters from ``trial``, fits an
    ``XGBClassifier`` on the module-level ``tr_X`` / ``ytrain`` (the
    SMOTE-resampled data when this section is run in order) and scores
    its class predictions on the module-level ``test_X`` / ``ytest``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameter values.

    Returns
    -------
    float
        ``f1_score`` computed with ``pos_label = 0``.

    Notes
    -----
    The score is measured on the same ``test_X`` / ``ytest`` that the
    surrounding script later reports on, so the tuned result is
    optimistic.
    """

    # xgboost parameter ranges.
    # suggest_float(..., log=True) replaces the deprecated
    # suggest_loguniform and samples from the same log-uniform range.
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 14),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.01, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.01, 1.0, log=True),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
        'eval_metric': 'auc',
        'use_label_encoder': False
    }

    optuna_model = xgb.XGBClassifier(**params)
    optuna_model.fit(tr_X, ytrain)
    # Make predictions
    y_pred = optuna_model.predict(test_X)

    # Evaluate predictions; class 0 is the label being scored
    f1 = f1_score(ytest, y_pred, pos_label = 0)
    # Maximise f1-score
    return f1



# Create optuna study
study = optuna.create_study(direction='maximize')
# Begin optimization
study.optimize(objective, n_trials=400)


# After study has finished:
print('Number of finished trials: {}'.format(len(study.trials)))

# Best trial
trial = study.best_trial
trial.value   # Best trial,  0.5168986083499006
# Get best parameters:
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))


# Use these parameters in our estimator:
best_params = trial.params

# Refit on the (resampled) training data and report on the test data
model = xgb.XGBClassifier(**best_params)
model.fit(tr_X, ytrain)

# Make predictions and assessments:
y_pred = model.predict(test_X)
print(classification_report(ytest, y_pred))

"""
After using smote:

                 precision    recall  f1-score   support

           0       0.47      0.57      0.52       458
           1       0.97      0.96      0.97      7735

    accuracy                           0.94      8193
   macro avg       0.72      0.77      0.74      8193
weighted avg       0.95      0.94      0.94      8193


"""
##*****************************
## Embedding Projector
## Incidentally f1-score is highest
##*****************************

# Start again from the pickled data
os.chdir(master)
train_trans = pd.read_pickle("X_train_transformed.pkl")
test_trans = pd.read_pickle("X_test_transformed.pkl")
X_train = pd.read_pickle("X_train.pkl")
X_test = pd.read_pickle("X_test.pkl")
y_train = pd.read_pickle("y_train.pkl")
y_test=pd.read_pickle("y_test.pkl")

train_trans.shape   #   (24576, 219)
test_trans.shape    #  (8193, 219)
train_trans.columns
y_train.shape       # (24576,)
y_test.shape        # (8193,)
X_train.shape   # (24576, 9)
#X_train['target'].head()  # Does it contain 'target'. No.


# Count nulls (nothing is imputed here)
train_trans.isnull().sum().sum()
test_trans.isnull().sum().sum()   # 15796

# Reset index
y_train = y_train.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

# Attach the target as a column.
# NOTE(review): assigning a Series aligns on index labels; this is only
# correct if the transformed frames carry a 0..n-1 index like the reset
# targets -- confirm against ct.transform's output.
test_trans['target'] = y_test
train_trans['target'] =  y_train


# Count nulls again, now including the 'target' column
train_trans.isnull().sum().sum()
test_trans.isnull().sum().sum()   # 15796


# Drop every test row that has any null, then renumber the rows
test_trans = test_trans.dropna()
test_trans = test_trans.reset_index(drop = True)
test_trans.shape  #  (7740, 220)

# Count nulls once more
train_trans.isnull().sum().sum()
test_trans.isnull().sum().sum()   # expected 0 after dropna()

train_trans.head()
test_trans.head()
test_trans.shape

# Impute test_trans
# (alternative to dropping rows; kept disabled)
#si = SimpleImputer(strategy = 'mean')
#si.fit(train_trans)
#test_trans[:] = si.transform(test_trans)
#test_trans.isnull().sum().sum()


# Get embedding projector vectors and metadata
# Needed to color
# 'resource' is removed from both frames; the popped columns are discarded.
train_trans.pop('resource')
test_trans.pop('resource')

# Target of the surviving test rows
yt_test = test_trans['target']
yt_test.shape


# Build per-column vectors with the fitted encoder `ct`.
# Only the training call asks for the vectors to be saved to disk.
# NOTE(review): the return structure of vectorsToTSV is defined in
# catfamilyenc, not visible here -- confirm.
vec_tr = ct.vectorsToTSV(train_trans, take_mean = False, filepath = None, saveVectorsToDisk = True)
vec_te = ct.vectorsToTSV(test_trans, take_mean = False, filepath = None, saveVectorsToDisk = False)



# NOTE(review): presumably PCA-reduces each column's vectors to 3
# components and concatenates them -- verify in utils.pcaAndConcat.
cctr,ccte, vtr,vte = utils.pcaAndConcat(vec_tr, vec_te, n_components = 3)


vtr.keys()
vtr['mgrid'].head()

cctr.shape   #  (24576, 16)
ccte.shape   #  (8193, 16)

cctr.columns
ccte.columns

# Optional 80/20 split of the training vectors. As written, the model
# below does not use it (see the commented alternatives further down).
Xtr_pca, Xte_pca, ytr_pca, yte_pca =  train_test_split(cctr, y_train, test_size = 0.20, stratify=y_train)

# Define-then-delete idiom: makes `del model_vec` safe on any run.
model_vec= 0
evals_result= {}
del model_vec
model_vec = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.04,      # 0.06
                           max_depth = 13,
                           subsample = 0.9,           # 0.8
                           evals_result = evals_result,
                           random_state = rng,
                           reg_lambda = 1.5,


                           )

## NOTE: THIS IS WITHOUT the 'resource' column
# Trailing comments list the alternative inputs tried.
tr_X =    cctr # Xtr_pca # cctr #
test_X =  ccte# Xte_pca # ccte #
ytrain =  y_train # ytr_pca #  #  # y_train
ytest = yt_test # yte_pca # yt_test # y_test #  #


model_vec.fit(tr_X, ytrain.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, ytest.values)],
          eval_metric = ['auc']    # binary classification problem
          )



# auc: 0.81646
model_vec.best_score   # 0.8622657  0.84430
model_vec.best_iteration # 155
pred = model_vec.predict(test_X)
(pred == ytest).sum()/ytest.size    # 0.948858

print(classification_report(ytest,pred))




##############################
## tsne
##############################
# Why blobs do not appear together in tsne?
# See StackOverflow:
#    https://stats.stackexchange.com/a/453106/78454
#
# NOTE(review): `orig_train` and `org_trans_train` are not defined
# anywhere in the visible part of this file -- confirm where they come
# from before running this section.


from sklearn.manifold import  TSNE


## 2D
# Embedding of the original data
tsne = TSNE()
dx = tsne.fit_transform(orig_train)
y_train.values.shape


sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)

# Embedding of the columns from position 20 onwards of org_trans_train
tsne = TSNE()
org_trans_train.columns[20:]
da = tsne.fit_transform(org_trans_train[org_trans_train.columns[20:]])
da.shape
sns.scatterplot(x= da[:,0], y = da[:,1], hue = y_train.values)
sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)


## 3D
tsne = TSNE(n_components = 3, early_exaggeration = 40)
dx3 = tsne.fit_transform(orig_train)
dx3.shape


tsne = TSNE(n_components=3)
org_trans_train.columns[20:]
da3 = tsne.fit_transform(org_trans_train[org_trans_train.columns[20:]])
da3.shape

# Same column names (c0, c1, c2) for both 3-D frames
colnames = ["c" + str(i) for i in range(dx3.shape[1])]
colnames
dx3 = pd.DataFrame(dx3, columns = colnames)
da3 = pd.DataFrame(da3, columns = colnames)

# Assigning a Series aligns on index labels: y_train needs a 0..n-1
# index here to match the freshly built frames.
dx3['target'] = y_train
da3['target'] = y_train
dx3.head()
da3.head()

# Save both embeddings, target included, as CSV
os.chdir(master)
dx3.to_csv("dx3.csv", index = False)
da3.to_csv("da3.csv", index = False)




# Two independent random 75/25 splits of the 3-D t-SNE coordinates:
#   X_train/X_test/ytrain/ytest : embedding of the original data (dx3)
#   Xtrain/Xtest/ytr/yte        : embedding of the transformed data (da3)
# Note that the first line rebinds the global X_train / X_test.
X_train, X_test, ytrain, ytest = train_test_split(dx3.iloc[:,:3], y_train, test_size = 0.25 )
Xtrain, Xtest, ytr, yte = train_test_split(da3.iloc[:,:3], y_train, test_size = 0.25 )

evals_result= {}
model_tsne = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = 70
                           )


# This model is trained and evaluated on the dx3 split.
tr_X =  X_train
test_X = X_test



model_tsne.fit(tr_X, ytrain.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, ytest.values)],
          eval_metric = ['auc']
          )



# auc: 0.81646
model_tsne.best_score
pred = model_tsne.predict(test_X)
# Accuracy must use the labels that belong to test_X, i.e. `ytest`.
# `yte` comes from the other random split, so comparing against it
# paired predictions with labels of unrelated rows.
(pred == ytest).sum()/ytest.size



##############################
## umap
##############################
# NOTE(review): `import umap` is commented out in the import cell at the
# top of the file, so the name `umap` must come from elsewhere (for
# example the star import from utils) -- confirm, or import it here.
# `orig_train` and `org_trans_train` are likewise not defined in the
# visible part of this file.

## 2D

# Embedding of the standardised original data
reducer = umap.UMAP()
ss = StandardScaler()
dx = reducer.fit_transform(ss.fit_transform(orig_train))

sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)

# Embedding of the standardised columns from position 20 onwards
reducer = umap.UMAP()
ss = StandardScaler()
org_trans_train.columns[20:]
da = reducer.fit_transform(ss.fit_transform(org_trans_train[org_trans_train.columns[20:]]))
da.shape
sns.scatterplot(x= da[:,0], y = da[:,1], hue = y_train.values)
sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)


# Same column names for both frames
colnames = ["c" + str(i) for i in range(dx.shape[1])]
colnames
dx = pd.DataFrame(dx, columns = colnames)
da = pd.DataFrame(da, columns = colnames)




# Two independent random 75/25 splits of the UMAP coordinates:
#   X_train/X_test/ytrain/ytest : embedding of the original data (dx)
#   Xtrain/Xtest/ytr/yte        : embedding of the transformed data (da)
# Note that the first line rebinds the global X_train / X_test.
X_train, X_test, ytrain, ytest = train_test_split(dx, y_train, test_size = 0.25 )
Xtrain, Xtest, ytr, yte = train_test_split(da, y_train, test_size = 0.25 )

evals_result= {}
model_umap = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = 70
                           )


# This model is trained and evaluated on the `da` split.
tr_X =  Xtrain
test_X = Xtest



model_umap.fit(tr_X, ytr.values,
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, yte.values)],
          eval_metric = ['auc']
          )




model_umap.best_score
# Predict with the model fitted just above. The earlier code called
# model_pca here, a model trained on different features.
pred = model_umap.predict(test_X)
(pred == yte).sum()/yte.size




#########################################
## Predictive analytics
########################################
# Call it only once
# See https://scikit-learn.org/stable/common_pitfalls.html#general-recommendations
#
# NOTE(review): `seed`, `org_trans_train` and `org_trans_test` are not
# defined in the visible part of this file -- confirm where they come
# from (other sections use `rng` as random_state).


# Define-then-delete idiom: makes `del model0` safe on any run.
model0 = 0
gc.collect()
del model0
evals_result= {}
model0 = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = seed
                           )


tr_X =  org_trans_train
test_X =  org_trans_test



model0.fit(tr_X, y_train.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, y_test.values)],
          eval_metric = ['auc']
          )



# auc: 0.81646
# The trailing numbers are values noted from earlier runs.
model0.best_score   # 0.81761; 820858; 0.816837; 0.892089; 0.876738; 0.884359; 0.885373
                    # 0.84595; 0.851114
pred = model0.predict(test_X)
(pred == y_test).sum()/y_test.size    # 0.7324 0.8022; 0.78395; 0.7954
                                      # 0.7664;0.7716
#plot_importance(model, importance_type = 'gain')



# Split the feature names of model0 into two lists.
# NOTE(review): xg_impt_features is not defined in this file (presumably
# it arrives through `from utils import *`); the exact meaning of the two
# returned lists should be confirmed there.
fe_1, fe_0 = xg_impt_features(model0,org_trans_train.columns  )

len(fe_1)   # 335  86  55 76   77  88
len(fe_0)   # 743  11  11 14   16  16



# Write fe_1 to disk, one name per line. The context manager closes the
# file even if a write fails (the earlier open()/close() pair did not).
os.chdir(master)
with open('fe_1.txt','w') as file:
    for item in fe_1:
        file.write(item+"\n")

# Read fe_1
# (only the trailing newline is stripped from each line)
os.chdir(master)
with open("fe_1.txt", 'r') as f:
    fe_1 = [line.rstrip('\n') for line in f]

len(fe_1)  # 77  88




##---------------
# With reduced best features
# Uses only the first 15 names of fe_1.
# NOTE(review): `seed`, `org_trans_train` and `org_trans_test` are not
# defined in the visible part of this file -- confirm.
model1 = 0
gc.collect()
del model1
evals_result= {}
model1 = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = seed
                           )


tr_X =  org_trans_train[fe_1[:15]]     # Try from 7 to 30
test_X =  org_trans_test[fe_1[:15]]



model1.fit(tr_X, y_train.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, y_test.values)],
          eval_metric = ['auc']
          )


# auc: 0.81646
model1.best_score   # 0.7228

pred = model1.predict(test_X)
(pred == y_test).sum()/y_test.size    # 0.5244


# Peek at the leading feature names
fe_1[:6]


fe_1[:7]

##--------------------
# orig + binned
##--------------------
# NOTE(review): `seed`, `org_binned_train` and `org_binned_test` are not
# defined in the visible part of this file -- confirm.
gc.collect()
#del model
evals_result= {}
model2 = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           evals_result = evals_result,
                           random_state = seed
                           )


tr_X =  org_binned_train
test_X =  org_binned_test



model2.fit(tr_X, y_train.values,                   # Xtr, ytr
          early_stopping_rounds = 50,   # 5% of n_estimators
          eval_set=[ (test_X, y_test.values)],
          eval_metric = ['auc']
          )



# auc: 0.81646
model2.best_score   # 0.821435 ; 827361 ; 0.897
pred = model2.predict(test_X)
(pred == y_test).sum()/y_test.size    # 0.7324 ; 0.81

# Feature lists for model2; fe_11 feeds the next model
fe_11, fe_00 = xg_impt_features(model2,org_binned_train.columns  )
len(fe_11)
fe_00

##-------------------
# orig + binned best features
##-------------------


gc.collect()
#del model

# `evals_result` is not an XGBClassifier constructor parameter: a dict passed
# there is never filled in. The per-round history is therefore read from the
# fitted model after fit().
# NOTE(review): this block seeds with `rng` while the sibling model blocks use
# `seed`; `rng` is defined outside this section -- confirm this is intended.
model3 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    eta=0.06,
    max_depth=6,
    subsample=0.8,
    random_state=rng,
)

# Restrict both frames to the columns listed in fe_11.
tr_X = org_binned_train[fe_11]
test_X = org_binned_test[fe_11]

# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the constructor -- confirm against the installed version.
model3.fit(
    tr_X, y_train.values,                # Xtr, ytr
    early_stopping_rounds=50,            # 5% of n_estimators
    eval_set=[(test_X, y_test.values)],
    eval_metric=['auc'],
)

# Per-round eval_set metric history recorded during fit.
evals_result = model3.evals_result()

# auc: 0.81646
model3.best_score   # 826236; 826423
pred = model3.predict(test_X)
# Accuracy: fraction of test rows whose predicted label equals y_test.
(pred == y_test).sum()/y_test.size    # 0.7324



##--------------------
##-------------------
# orig  features
##-------------------


# Rebinding to 0 drops any model4 left over from an earlier run of this
# section so gc.collect() can reclaim it before a new model is built.
model4 = 0

gc.collect()
del model4

# `evals_result` is not an XGBClassifier constructor parameter: a dict passed
# there is never filled in. The per-round history is therefore read from the
# fitted model after fit().
model4 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    eta=0.06,
    max_depth=6,
    subsample=0.8,
    random_state=seed,
)

# NOTE(review): fe_4_1 is only assigned further down in this section, so on a
# fresh top-to-bottom run these two lines need it to exist already. The
# section title suggests all of orig_train / orig_test may have been
# intended -- confirm.
tr_X = orig_train[fe_4_1[:5]]
test_X = orig_test[fe_4_1[:5]]

# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the constructor -- confirm against the installed version.
model4.fit(
    tr_X, y_train.values,                # Xtr, ytr
    early_stopping_rounds=100,           # 10% of n_estimators
    eval_set=[(test_X, y_test.values)],
    eval_metric=['auc'],
)

# Per-round eval_set metric history recorded during fit.
evals_result = model4.evals_result()

# auc: 0.81646
model4.best_score   # 0.7335065739582236
pred = model4.predict(test_X)
# Accuracy: fraction of test rows whose predicted label equals y_test.
(pred == y_test).sum()/y_test.size    # 0.544

# NOTE(review): the model was fitted on five columns but the helper receives
# every orig_train column name -- verify xg_impt_features copes with that.
fe_4_1, fe_4_0 = xg_impt_features(model4, orig_train.columns)

fe_4_1[:5]

##--------------------

fe_4_1[:5]

# Rebinding to 0 drops any model4_1 left over from an earlier run of this
# section so gc.collect() can reclaim it before a new model is built.
model4_1 = 0

gc.collect()
del model4_1

# `evals_result` is not an XGBClassifier constructor parameter: a dict passed
# there is never filled in. The per-round history is therefore read from the
# fitted model after fit().
model4_1 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    eta=0.06,
    max_depth=6,
    subsample=0.8,
    random_state=70,
)

# Train on the first five names in fe_4_1 only.
tr_X = orig_train[fe_4_1[:5]]
test_X = orig_test[fe_4_1[:5]]

# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the constructor -- confirm against the installed version.
model4_1.fit(
    tr_X, y_train.values,                # Xtr, ytr
    early_stopping_rounds=100,           # 10% of n_estimators
    eval_set=[(test_X, y_test.values)],
    eval_metric=['auc'],
)

# Per-round eval_set metric history recorded during fit.
evals_result = model4_1.evals_result()

# auc: 0.81646
model4_1.best_score   # 831523 ; 824436 ; 0.8288 ; 0.897301 ; 0.880147; (0.891444, 0.892768, 0.893049)
                    # (0.858484,0.862771, 0.874083 )
pred = model4_1.predict(test_X)
# Accuracy: fraction of test rows whose predicted label equals y_test.
(pred == y_test).sum()/y_test.size    # 0.7376 ; 0.81; 0.7881; 0.8014, 0.8044
                                      # 0.7788; 0.7918

###################################




###################################
####################################


# Split off the label, one-hot encode the remaining columns, project the
# encoded matrix onto three principal components and write both to CSV.
y = train_train.pop('target')
train_train.head()

# OneHotEncoder is not imported in the visible part of the file (presumably
# it arrives via `from utils import *`). Dense output is requested so the
# array can be wrapped in a DataFrame directly.
ohe = OneHotEncoder(sparse=False)
train_ohe = ohe.fit_transform(train_train)
train_ohe.shape  # (7500, 89)
cl = [f"c{k}" for k in range(train_ohe.shape[1])]
train_ohe = pd.DataFrame(train_ohe, columns=cl)
train_ohe.head()
train_ohe.shape  # (7500,75)


pca = PCA(n_components=3)
train_pca = pca.fit_transform(train_ohe)
train_ohe.head()
cx = [f"c{k}" for k in range(train_pca.shape[1])]
train_pca = pd.DataFrame(train_pca, columns=cx)
train_pca.head()


# The relative file names below resolve against dataPath.
os.chdir(dataPath)

for frame, csv_name in ((train_pca, "train_pca.csv"), (y, "y_train_pca.csv")):
    frame.to_csv(csv_name, index=False)
y.head()


##################Model with orig data #####################


# X is the same object as orig_train, so pop() below removes 'target' from
# both names in place.
X = orig_train
y = orig_train.pop('target')
X.columns
X.head()
y

# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=384,
)

gc.collect()
#del model

# `evals_result` is not an XGBClassifier constructor parameter: a dict passed
# there is never filled in. The per-round history is therefore read from the
# fitted model after fit().
model = xgb.XGBClassifier(
    n_estimators=700,
    verbosity=3,
    eta=0.06,
    max_depth=6,
    subsample=0.8,
    random_state=800,
)

tr_X = X_train
test_X = X_test

# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the constructor -- confirm against the installed version.
model.fit(
    tr_X, y_train,                       # Xtr, ytr
    early_stopping_rounds=50,            # about 7% of n_estimators
    eval_set=[(test_X, y_test)],
    eval_metric=['merror'],
)

# Per-round eval_set metric history recorded during fit.
evals_result = model.evals_result()

pred = model.predict(test_X)
# Accuracy: fraction of test rows whose predicted label equals y_test.
(pred == y_test).sum()/y_test.size    # 94.93%   91.8%  94.73  98.2(class_Sep = 2.0)
# plot_importance is not imported in the visible part of the file
# (presumably provided by `from utils import *`) -- TODO confirm.
plot_importance(model, importance_type = 'gain')

################## Model with discrete features #####################


# X is the same object as train_train, so pop() below removes 'target' from
# both names in place.
# NOTE(review): 'target' is also popped from train_train in the one-hot/PCA
# section earlier in this file; running both on the same frame raises
# KeyError here -- confirm train_train is reloaded in between.
X = train_train
y = train_train.pop('target')
X.columns
X.head()
y

# Cast every column to int, column by column, so X (and train_train) is
# updated in place. The enumerate() index was never used.
for col in X.columns:
    X[col] = X[col].astype('int')


# Stratified 75/25 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=384,
)

gc.collect()
del model

# `evals_result` is not an XGBClassifier constructor parameter: a dict passed
# there is never filled in. The per-round history is therefore read from the
# fitted model after fit().
model = xgb.XGBClassifier(
    n_estimators=700,
    verbosity=3,
    eta=0.06,
    max_depth=6,
    subsample=0.8,
    random_state=800,
)

tr_X = X_train
test_X = X_test

# NOTE(review): newer xgboost releases expect early_stopping_rounds and
# eval_metric on the constructor -- confirm against the installed version.
model.fit(
    tr_X, y_train,                       # Xtr, ytr
    early_stopping_rounds=50,            # about 7% of n_estimators
    eval_set=[(test_X, y_test)],
    eval_metric=['merror'],
)

# Per-round eval_set metric history recorded during fit.
evals_result = model.evals_result()

pred = model.predict(test_X)
# Accuracy: fraction of test rows whose predicted label equals y_test.
(pred == y_test).sum()/y_test.size    # 94.6% ; 95%  90.8%  94.86  98.86(class sep = 2.0)
# plot_importance is not imported in the visible part of the file
# (presumably provided by `from utils import *`) -- TODO confirm.
plot_importance(model, importance_type = 'gain')

##############################################################



import matplotlib.pyplot as plt
import seaborn as sns

# 'fe' against 'fd' on the current training split, coloured by y_train.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x='fe', y='fd', hue=y_train, data=tr_X, alpha=0.4, ax=ax)

# 'fe' against 'fb' on orig_train, coloured by y.
fig, ax = plt.subplots(figsize=(10, 10))
sns.scatterplot(x='fe', y='fb', hue=y, data=orig_train, palette="Set2", ax=ax)





#################################################################



plt.figure(1)
plt.clf()
# Not used by the plots in this section; kept for code further down.
colors = ["#dede00", "#377eb8", "#f781bf"]
markers = ["x", "o", "^"]

# Three clusters can be seen
# One 8x8 figure per column pair, each coloured by y.
for x_col, y_col in (("x1", "x2"), ("x2", "x3"), ("x1", "x3")):
    fig = plt.figure(figsize=(8, 8))
    _ = sns.scatterplot(data=X, x=x_col, y=y_col, hue=y)