<a href="https://colab.research.google.com/github/harnalashok/CatEncodersFamily/blob/main/avazu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""
17th May, 2023
17th May, 2023
Objective: Avazu:
    Classification
    And
    SMOTE
    with less data

"""

In [None]:
%reset -f

# 1.0 Call libraries
import pandas as pd
import numpy as np
import sys    # For pointing to the folder where our module is




# 1.01
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
#import umap  # Takes long time to import

# 1.02 Misc
import gc , os,time

In [None]:




# 1.05
#dataPath =                 "C:\\Users\\Ashok\\OneDrive\\Documents\\talkingdata\\29042023_avazu\\"
#modulePath = dataPath
#modelsPath =               "C:\\Users\\Ashok\\OneDrive\\Documents\\talkingdata\\29042023_avazu\\allmodels\\models\\"
#pathToStoreProgress =      "C:\\Users\\Ashok\\OneDrive\\Documents\\talkingdata\\29042023_avazu\\allmodels\\progress\\"
#master =  dataPath + "master\\"

dataPath =                 "D:\\avazu\\"
modulePath = dataPath
modelsPath =               "D:\\avazu\\allmodels\\models\\"
pathToStoreProgress =      "D:\\avazu\\allmodels\\progress\\"
master =  dataPath + "master\\"




# 1.03 Home made modules
sys.path.append(modulePath)
import utils
# DO NOT CALL, as:
#  from scikitlearnclass import CatEncoders
#   Subsequently saving in pickle gives error.
import scikitlearnclass

# 1.04
#import importlib; importlib.reload(network_features)
#import importlib; importlib.reload(utils)
#import importlib; importlib.reload(scikitlearnclass)


# 2.0 Decide program-wide
rng = np.random.RandomState(0)



## Read Data
os.chdir(dataPath)

# Our field datatypes:
dtypes = {
        'C1'              : 'uint16',
        'banner_pos'      : 'uint8',
        'device_type'     : 'uint8',
        'device_conn_type': 'uint8',
        'C14'             : 'uint16',
        'C15'             : 'uint16',
        'C16'             : 'uint16',
        'C17'             : 'uint16',
        'C18'             : 'uint8',
        'C19'             : 'uint16',
        'C20'             : 'int32',
        'click'           : 'uint8'
        }

print('load train...')


# 3.1 Read a fraction of data
total_lines = 40428967   #  (40428967, 24)
read_lines =   3000000    # 7.4%  Reduce it if less RAM

# 3.2 Fraction of rows to keep
#     Ref: https://stackoverflow.com/a/48589768
p = read_lines/total_lines  # ~0.074

# 3.2.1 How to pick random rows from disk without first loading the
#       complete file in RAM:
#           For every data row draw a uniform number in [0, 1).
#           The row is SKIPPED when the draw is > p, so roughly
#           (1 - p) ~ 92.6% of rows are skipped and ~7.4% are kept.
#           Row 0 (the header) is never skipped.
#       A dedicated, seeded generator is used so that the same rows are
#       picked on every run without consuming the state of `rng`.
# From https://www.kaggle.com/code/gauravduttakiit/data-sampling
sample_rng = np.random.RandomState(0)


def parse_date(val):
    """Convert Avazu 'hour' value(s) formatted as YYMMDDHH to datetime.

    Accepts a scalar or a Series; `pd.datetime` (used earlier) has been
    removed from pandas, so `pd.to_datetime` is used instead.
    """
    return pd.to_datetime(val, format='%y%m%d%H')


train = pd.read_csv(
    dataPath + "train.gz",      # Not reading test.csv.zip
    header=0,                   # First row is header-row
    # Keep the header (i == 0); skip a data row when the draw exceeds p
    skiprows=lambda i: (i > 0) and (sample_rng.random_sample() > p),
    # 'hour' is read as text and converted below in one vectorised call;
    # this avoids the deprecated `date_parser` argument of read_csv.
    dtype={**dtypes, 'hour': str},
    # We read all columns.
)

train['hour'] = parse_date(train['hour'])


# Explore read data:
train.shape     # (3000756, 24)
train.head(3)
train.dtypes
"""
id                          uint64
click                        uint8
hour                datetime64[ns]
C1                          uint16
banner_pos                   uint8
site_id                     object
site_domain                 object
site_category               object
app_id                      object
app_domain                  object
app_category                object
device_id                   object
device_ip                   object
device_model                object
device_type                  uint8
device_conn_type             uint8
C14                         uint16
C15                         uint16
C16                         uint16
C17                         uint16
C18                          uint8
C19                         uint16
C20                          int32
C21                          int64

"""

# Pull the target out of the frame and discard the row identifier
y = train.pop("click")
del train["id"]

# Columns that remain:
train.columns


# Every column except 'hour'
cols = [
    "C1", "banner_pos", "site_id", "site_domain", "site_category",
    "app_id", "app_domain", "app_category", "device_id", "device_ip",
    "device_model", "device_type", "device_conn_type", "C14",
    "C15", "C16", "C17", "C18", "C19", "C20", "C21",
]


# Report the number of distinct levels per column.
# device_id and device_ip have a very large number of levels.
for col_name, n_levels in train[cols].nunique().items():
    print(col_name, "\t", n_levels)



"""
Data: 3000000 rows
C1 	 7
banner_pos 	 7
site_id 	 3256
site_domain 	 3924
site_category 	 22
app_id 	 4434
app_domain 	 277
app_category 	 27
device_id 	 394043    <==v large
device_ip 	 1313203   <==v large
device_model 	 6122
device_type 	 5
device_conn_type 	 4
C14 	 2428
C15 	 8
C16 	 9
C17 	 426
C18 	 4
C19 	 67
C20 	 166
C21 	 60
-----------

"""




# The transformer class requires column names without digits,
# so the affected columns are renamed here.
cols = {
    "C1": "Cone",
    "banner_pos": "bannerpos",
    "site_id": "siteid",
    "site_domain": "sitedomain",
    "site_category": "sitecategory",
    "app_id": "appid",
    "app_domain": "appdomain",
    "app_category": "appcategory",
    "device_model": "devicemodel",
    "device_type": "devicetype",
    "device_conn_type": "deviceconntype",
    "C14": "Cfourteen",
    "C15": "Cfifteen",
    "C16": "Csixteen",
    "C17": "Cseventeen",
    "C18": "Ceighteen",
    "C19": "Cnineteen",
    "C20": "Ctwenty",
    "C21": "Ctwentyone",
}

train = train.rename(columns=cols)
train.shape
train.columns
train.head(3)


# Take 'hour' out of the frame and append its calendar components
hour_stamp = train.pop('hour')
date_parts = (
    ('month', 'month'),
    ('dayofweek', 'dayofweek'),
    ('day', 'day'),
    ('hour_time', 'hour'),
)
for new_col, dt_attr in date_parts:
    train[new_col] = getattr(hour_stamp.dt, dt_attr)
train.head(3)

# Save train data
os.chdir(master)
train.to_pickle("train.pkl")
y.to_pickle("y.pkl")

####
# Read saved data
os.chdir(master)
train = pd.read_pickle("train.pkl")
y = pd.read_pickle("y.pkl")
####

# Some columns need label encoding for feeding
#  into xgboost classifier. Let us check.
train.iloc[:, :5].head()   # siteid, sitedomain,sitecategory
train.iloc[:, 5:10].head()   # appid,appdomain,appcategory,device_id,device_ip
train.iloc[:, 10:15].head()  # devicemodel

ColumnsToEncode = ["siteid", "sitedomain", "appid", "appdomain",
                   "appcategory", "device_id", "device_ip",
                   "devicemodel", 'sitecategory']


# Check our list once again:
train[ColumnsToEncode[:4]].head()
train[ColumnsToEncode[4:]].head()



# Integer-encode the string columns of train data (slow on large data).
# One fitted encoder per column is kept so that the mapping can be
# reused or inverted later.
dict_ = {}
for col_name in ColumnsToEncode:
    encoder = LabelEncoder()
    train[col_name] = encoder.fit_transform(train[col_name])
    dict_[col_name] = encoder
    gc.collect()




# Check again if encoding done?
train[ColumnsToEncode[:4]].head()
train[ColumnsToEncode[4:]].head()

# Save dict of LabelEncoder objects for future use
utils.savePythonObject(dict_, "labelEnoders.pkl", master)
# Can restore dict of LabelEncoder objects, as:
le_objs = utils.restorePythonObject("labelEnoders.pkl", master)
le_objs


# Save label encoded train data
os.chdir(master)
train.to_pickle("train_encoded.pkl")
y.to_pickle("y.pkl")

####
# Read label encoded saved data
os.chdir(master)
train = pd.read_pickle("train_encoded.pkl")
y = pd.read_pickle("y.pkl")
####
# Check, if read
train.head()

######################
######################

# Stratified 75:25 split of features and target
split_parts = train_test_split(
    train,
    y,
    test_size=0.25,
    stratify=y,
    random_state=rng,
)
X_train, X_test, y_train, y_test = split_parts


# Check shapes:
X_train.shape    # (2250567, 25)
X_test.shape     # (750189, 25)
y_train.shape    # (2250567,)
y_test.shape     # (750189,)


# As data is large, save split data:
os.chdir(master)
X_train.to_pickle("X_train.pkl")
X_test.to_pickle("X_test.pkl")
y_train.to_pickle("y_train.pkl")
y_test.to_pickle("y_test.pkl")


####
# Read split data
os.chdir(master)
X_train = pd.read_pickle("X_train.pkl")
X_test = pd.read_pickle("X_test.pkl")
y_train = pd.read_pickle("y_train.pkl")
y_test = pd.read_pickle("y_test.pkl")
#####


# Recheck shapes:
X_train.shape    # (2250567, 25)
X_test.shape     # (750189, 25)
y_train.shape    # (2250567,)
y_test.shape     # (750189,)



# Check distribution of levels in split data
y_test.value_counts(normalize = True)  # 83%:17%
y_train.value_counts(normalize = True) # 83%:17%


# Check nulls. None.
X_train.isnull().sum()
X_test.isnull().sum()


##*********************************
## Developing models for transformation:
##*********************************

#  Which are our cat columns


# We consider:
cat_cols = ['Cone', 'bannerpos', 'siteid', 'sitedomain', 'sitecategory',
            'appid', 'appdomain', 'appcategory', 'devicemodel', 'devicetype',
            'deviceconntype', 'Cfourteen', 'Cfifteen','Csixteen', 'Cseventeen',
            'Ceighteen', 'Cnineteen', 'Ctwenty',
            'Ctwentyone']

len(cat_cols)  # 19

# Remaining columns are a mix of numeric and cat:
# device_id and device_ip have very large number of levels
# So we have ignored them in our above list.
rem_cols =  set(train.columns).difference(set(cat_cols))
rem_cols
"""
{'day', 'dayofweek', 'device_id', 'device_ip', 'hour_time', 'month'}
"""


# As the number of cat_cols is quite large,
#  no interacting column pairs are requested (empty list)
interactingCatCols = []

# Instantiate the encoder class (scikitlearnclass.CatEncoder):
# WARNING: Refer to the class through its module, i.e. as
#          scikitlearnclass.CatEncoder (not via `from ... import`),
#          otherwise pickling the fitted object gives an error.
# NOTE(review): meaning of each position in cMeasures is defined in
#          scikitlearnclass, not in this file -- confirm there.
ct = scikitlearnclass.CatEncoder(pathToStoreProgress, # Progress file is stored here
                                 modelsPath,          # Graph files will be saved here
                                 cMeasures=  [ 1,1,1,0,None,0,0],
                                 subseqlength = 2, # It is the default
                                 n_iter =1,  # It is the default
                                 k = 40,  # Irrelevant here
                                          #  as we are not calculating betweenness centrality
                                 saveGraph = True
                       )

# Fit it on X_train and report elapsed minutes:
gc.collect()
start = time.time()
ct.fit(X_train, cat_cols, interactingCatCols)
end = time.time()
print((end-start)/60)    # 88 minutes(6000000),


# Save fitted class object for later use:
utils.savePythonObject(ct, "transformer.pkl", modelsPath)

# Delete the in-memory object so that the restore below is a real
# round-trip test of the saved file
del ct

# Read back saved class object:
ct = utils.restorePythonObject("transformer.pkl", modelsPath)
ct

# Transform X_train now (only the categorical columns are passed):
gc.collect()
start = time.time()
out_tr = ct.transform(X_train[cat_cols])
end = time.time()     # 38 minutes (6000000)
print((end -start)/60)


# Transform test data with the encoder fitted on train data
start = time.time()
out_te = ct.transform(X_test[cat_cols])
end = time.time()
print((end -start)/60)    # 14 min

gc.collect()


# Check shapes:
out_te.shape      #    (750189, 599)
out_te.columns    #   Includes original columns also
out_tr.shape      #  (2250567, 599)

# Remove low variance columns
# out_te = utils.removeLowVarCols( out_te , pca = False)

# Save transformed data:
os.chdir(master)
out_te.to_pickle("X_test_transformed.pkl")
y_test.to_pickle("y_test.pkl")


os.chdir(master)
out_tr.to_pickle("X_train_transformed.pkl")
y_train.to_pickle("y_train.pkl")



##############################
## Start reading
#############################

cat_cols = ['Cone', 'bannerpos', 'siteid', 'sitedomain', 'sitecategory',
            'appid', 'appdomain', 'appcategory', 'devicemodel', 'devicetype',
            'deviceconntype', 'Cfourteen', 'Cfifteen','Csixteen', 'Cseventeen',
            'Ceighteen', 'Cnineteen', 'Ctwenty', 'Ctwentyone']  # 19 cols

rem_cols = ['day', 'dayofweek', 'device_id',
            'device_ip', 'hour_time', 'month']  # 6 cols

os.chdir(master)
X_train_trans = pd.read_pickle("X_train_transformed.pkl")
X_test_trans = pd.read_pickle("X_test_transformed.pkl")
X_train = pd.read_pickle("X_train.pkl")
X_test = pd.read_pickle("X_test.pkl")
y_train = pd.read_pickle("y_train.pkl")
y_test = pd.read_pickle("y_test.pkl")

X_test_trans.columns[19:]
X_test_trans.columns[:19]


##************************
## Predictive analytics
##************************

# Modeling with computed data
# Build the design matrices: remaining (non-encoded) columns of the
# original split + engineered columns of the transformed data.

# Column-wise concat aligns on the index, so every piece is given a
# fresh 0..n-1 index first; otherwise mismatched indexes would produce
# NaN-padded, misaligned rows.
X_train = X_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)

# Engineered columns = everything in the transformed data that is not
# one of the original cat_cols (selected by name, not by position).
original_cats = set(cat_cols)
l = [c for c in X_train_trans.columns if c not in original_cats]

# Concat with rem_cols:
X_train_trans = pd.concat(
    [X_train[rem_cols], X_train_trans[l].reset_index(drop=True)],
    axis=1,
)
X_test_trans = pd.concat(
    [X_test[rem_cols], X_test_trans[l].reset_index(drop=True)],
    axis=1,
)

# Check:
X_train_trans.shape   # (2250567, 586)
X_test_trans.shape   # (750189, 586)
X_train_trans.head()

# Prepare xgboost model on the transformed data.
# eval_metric / early_stopping_rounds are estimator parameters in the
# current xgboost scikit-learn API (passing them to fit() was removed),
# and `evals_result` is not a constructor argument: the evaluation
# history is read back from the fitted model instead.
gc.collect()
model = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=15,
    subsample=0.8,
    random_state=rng,
    eval_metric=['auc'],         # binary classification problem
    early_stopping_rounds=100,   # 10% of n_estimators
)

# First pass uses ALL columns: the importance ranking `fe_1` only exists
# after this model has been fitted (see below). For a reduced re-run,
# select X_train_trans[fe_1[:-15]] / X_test_trans[fe_1[:-15]] here.
tr_X = X_train_trans
test_X = X_test_trans
ytrain = y_train        # Just renaming
ytest = y_test          # Just renaming


model.fit(tr_X, ytrain.values,
          eval_set=[(test_X, ytest.values)])
evals_result = model.evals_result()


model.best_score   #  0.756914(30l);  0.76074(60l)
pred = model.predict(test_X)
(pred == ytest.values).mean()   # 0.835550774 (30l)

# Get impt features:
# fe_1: Ordered features with impt > 0
# fe_0: Feature with zero importance
# Column names passed are those the model was actually fitted on.
fe_1, fe_0 = utils.xg_impt_features(model, tr_X.columns)
len(fe_1)   # 408
len(fe_0)   # 178

print(classification_report(ytest, pred))
"""
                  precision  recall  f1-score   support

           0       0.84      0.99      0.91    622933
           1       0.60      0.09      0.16    127256

    accuracy                           0.84    750189
   macro avg       0.72      0.54      0.54    750189
weighted avg       0.80      0.84      0.78    750189



"""



# Modeling with original untransformed data
#  (except hour etc)
#  With all the features.
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); the evaluation history is fetched after fitting because
# `evals_result` is not a constructor argument.
model_or = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=15,
    subsample=0.8,
    random_state=rng,
    eval_metric=['auc'],         # binary classification problem
    early_stopping_rounds=100,   # 10% of n_estimators
    #enable_categorical = True
)


tr_X = X_train
test_X = X_test
ytrain = y_train
ytest = y_test


model_or.fit(tr_X, ytrain.values,
             eval_set=[(test_X, ytest.values)])
evals_result = model_or.evals_result()


model_or.best_score   #  0.7540460416
pred = model_or.predict(test_X)
(pred == ytest.values).mean()    #  0.835640; 0.8352290


print(classification_report(ytest, pred))
"""

# -- End pasted text --
                 precision    recall  f1-score   support

           0       0.84      0.99      0.91    622933
           1       0.59      0.09      0.16    127256

    accuracy                           0.84    750189
   macro avg       0.72      0.54      0.53    750189
weighted avg       0.80      0.84      0.78    750189


"""


#####################################################
# SMOTE
#####################################################

from imblearn.over_sampling import SMOTE,ADASYN
from imblearn.combine import SMOTEENN

# Exactly one resampler is active. Alternatives that can be swapped in:
#   sm = SMOTE(random_state=rng)
#   sm = ADASYN(random_state=rng)
# (Earlier all three were assigned in turn, so only the last one,
#  SMOTEENN, was ever used.)
sm = SMOTEENN(random_state=rng)

# NOTE(review): the recorded shapes below were noted during an earlier
# run; which resampler produced them is not recorded here.
X_res, y_res = sm.fit_resample(X_train_trans, y_train)
X_res.shape  #  (3768921, 586)
y_res.shape  # ( (3768921, )

# Save SMOTE data:
os.chdir(master)
X_res.to_pickle("X_res_transformed.pkl")
y_res.to_pickle("y_res.pkl")

# Read SMOTE data
os.chdir(master)
X_res = pd.read_pickle("X_res_transformed.pkl")
y_res = pd.read_pickle("y_res.pkl")
gc.collect()


# Modeling with smote data: train on resampled data, evaluate on the
# untouched (not resampled) test split.
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
model_sm = xgb.XGBClassifier(
    n_estimators=300,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=15,
    subsample=0.8,
    random_state=rng,
    eval_metric=['auc'],         # binary classification problem
    early_stopping_rounds=100,   # one third of n_estimators
    #enable_categorical = True
)


tr_X = X_res
test_X = X_test_trans
ytrain = y_res
ytest = y_test


model_sm.fit(tr_X, ytrain.values,
             eval_set=[(test_X, ytest.values)])
evals_result = model_sm.evals_result()


model_sm.best_score   #  0.75517761 (30l)
model_sm.best_iteration # 232
pred = model_sm.predict(test_X)
(pred == ytest.values).mean()    #  0.835498

print(classification_report(ytest, pred))

"""
             precision    recall  f1-score   support

           0       0.84      0.99      0.91    622933
           1       0.59      0.10      0.17    127256

    accuracy                           0.84    750189
   macro avg       0.72      0.54      0.54    750189
weighted avg       0.80      0.84      0.78    750189

"""


from sklearn.metrics import roc_auc_score
score = model_sm.predict_proba(test_X)
roc_auc_score(ytest, score[:,1])    #  0.7552722506975745









######################3


# Integer seed used by the splits below and by several later models
seed = 678

# NOTE(review): the statements in this section look like scratch cells.
#   `train_trans`, `test_trans_imputed` and `ss` (before `del ss`) are
#   not defined anywhere in the visible part of this file, and `da` is
#   used in the "PCA data" split before it is created further down.
#   Run selectively and confirm each input exists first.

# Transformed data
X_train, X_test, ytrain, ytest = train_test_split(
                                                  train_trans[l],
                                                  y_train,
                                                  test_size = 0.25,
                                                  random_state= seed)

# original data
# NOTE(review): this overwrites the split made just above, and pairs the
#   full `train` frame with `y_train` -- verify the row counts match.
X_train, X_test, ytrain, ytest = train_test_split(
                                                  train,
                                                  y_train,
                                                  test_size = 0.25,
                                                  random_state= seed)


# PCA data (no random_state given, so this split differs on every run)
Xtrain, Xtest, ytr, yte = train_test_split(da, y_train, test_size = 0.25 )




# Two whitened principal components of the standardised data;
# scaler and PCA are fitted on train data and only applied to test data.
from sklearn.decomposition import PCA
del ss
ss = StandardScaler()
pca = PCA(n_components = 2, whiten = True, random_state=rng)
da = pca.fit_transform(ss.fit_transform(train_trans))
da.shape  # (32769, 2)
db = pca.transform(ss.transform(test_trans_imputed))
db.shape

# Name the components c0, c1, ... and wrap them as DataFrames
colnames = ["c" + str(i) for i in range(da.shape[1])]
colnames
da = pd.DataFrame(da, columns = colnames)
db = pd.DataFrame(db, columns = colnames)

# Append the PCA components as extra columns.
# NOTE(review): concat(axis=1) aligns on index; `da`/`db` carry a fresh
#   0..n-1 index -- confirm the left-hand frames do too.
n_train_trans = pd.concat([train_trans, da], axis = 1)
n_train_trans.shape
n_test_trans_imputed = pd.concat([test_trans_imputed, db], axis = 1)


# Model on transformed data augmented with PCA components.
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model because it
# is not a constructor argument.
model = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=13,
    subsample=0.8,
    random_state=rng,
    eval_metric=['auc'],         # binary classification problem
    early_stopping_rounds=100,   # 10% of n_estimators
    #enable_categorical = True
)


tr_X = n_train_trans  #[fe_1[:-15]] # X_train # Xtrain
test_X = n_test_trans_imputed   #[fe_1[:-15]] # X_test # Xtest
ytrain = y_train
ytest = y_test


model.fit(tr_X, ytrain.values,
          eval_set=[(test_X, ytest.values)])
evals_result = model.evals_result()















# Oversample the minority class with SMOTE, then fit a model on the
# resampled data and evaluate on the untouched test data.
gc.collect()
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state=rng)
X_res, y_res = sm.fit_resample(train_trans, y_train)
X_res.shape  # (746856, 131)


# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API). With several metrics, early stopping (and best_score)
# follows the LAST one listed, i.e. 'logloss'.
model1 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=11,
    subsample=0.8,
    random_state=seed,
    eval_metric=['auc', 'logloss'],
    early_stopping_rounds=100,   # 10% of n_estimators
    #enable_categorical = True
)


tr_X = X_res # Xtrain
test_X = test_trans # Xtest
ytrain = y_res
ytest = y_test


model1.fit(tr_X, ytrain.values,
           eval_set=[(test_X, ytest.values)])
evals_result = model1.evals_result()


model1.best_score   # 0.74345
pred = model1.predict(test_X)
(pred == ytest.values).mean()    # 0.83244



########### Resampling
# Random oversampling: duplicate minority-class rows (with replacement)
# until both classes have the same number of rows.

from sklearn.utils import resample

# Assign labels by POSITION (.values): a plain Series assignment aligns
# on index labels, which mislabels rows whenever the frame and y_train
# do not share the same index.
# NOTE(review): assumes rows of train_trans are in the same order as
#   y_train -- confirm where train_trans is built.
train_trans['click'] = y_train.values

train_trans.shape

train_trans.columns

# Separate majority and minority classes
df_majority = train_trans[train_trans.click==0]
df_majority.shape
df_minority = train_trans[train_trans.click==1]
df_minority.shape

df_minority_upsampled = resample(df_minority,
                                 replace=True,                 # sample with replacement
                                 n_samples=len(df_majority),   # match majority class size
                                 random_state=123)             # reproducible results


df_upsampled = pd.concat([df_majority, df_minority_upsampled])


df_upsampled.click.value_counts()

# Save it
os.chdir(master)
df_upsampled.to_pickle("df_upsampled.pkl")
# Read it back
os.chdir(master)
df_upsampled = pd.read_pickle("df_upsampled.pkl")

# Split target off the upsampled frame
y_tr = df_upsampled.pop('click')

# Model on randomly-upsampled data, evaluated on the untouched test data.
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
seed = 789
model = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=11,
    subsample=0.8,
    random_state=seed,
    eval_metric=['auc'],         # binary classification problem
    early_stopping_rounds=100,   # 10% of n_estimators
    #enable_categorical = True
)


tr_X = df_upsampled
test_X = test_trans # Xtest
ytrain = y_tr
ytest = y_test


model.fit(tr_X, ytrain.values,
          eval_set=[(test_X, ytest.values)])
evals_result = model.evals_result()


model.best_score   # 0.74345
pred = model.predict(test_X)
(pred == ytest.values).mean()    # 0.83244




### NOT DONE BELOW
##############################
## tsne
##############################
# Why blobs do not appear together in tsne?
# See StackOverflow:
#    https://stats.stackexchange.com/a/453106/78454


from sklearn.manifold import  TSNE


## 2D
tsne = TSNE()
dx = tsne.fit_transform(orig_train)
y_train.values.shape


sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)

tsne = TSNE()
org_trans_train.columns[20:]
da = tsne.fit_transform(org_trans_train[org_trans_train.columns[20:]])
da.shape
sns.scatterplot(x= da[:,0], y = da[:,1], hue = y_train.values)
sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)


## 3D
tsne = TSNE(n_components = 3, early_exaggeration = 40)
dx3 = tsne.fit_transform(orig_train)
dx3.shape


tsne = TSNE(n_components=3)
org_trans_train.columns[20:]
da3 = tsne.fit_transform(org_trans_train[org_trans_train.columns[20:]])
da3.shape

# Wrap the 3-D embeddings as DataFrames with columns c0, c1, c2
colnames = [f"c{i}" for i in range(dx3.shape[1])]
colnames
dx3 = pd.DataFrame(dx3, columns=colnames)
da3 = pd.DataFrame(da3, columns=colnames)

# Attach labels by POSITION: the new frames carry a fresh 0..n-1 index
# while y_train keeps the row labels of the split, so a plain Series
# assignment would align on index and scatter / drop labels.
dx3['target'] = y_train.values
da3['target'] = y_train.values
dx3.head()
da3.head()

# Save both embeddings
os.chdir(master)
dx3.to_csv("dx3.csv", index=False)
da3.to_csv("da3.csv", index=False)




# Two independent random splits: (X_train, X_test, ytrain, ytest) from
# the embedding of original data, (Xtrain, Xtest, ytr, yte) from the
# embedding of transformed data. The labels of one split must never be
# compared against predictions made for the other.
X_train, X_test, ytrain, ytest = train_test_split(dx3.iloc[:,:3], y_train, test_size = 0.25 )
Xtrain, Xtest, ytr, yte = train_test_split(da3.iloc[:,:3], y_train, test_size = 0.25 )

# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
model_tsne = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=6,
    subsample=0.8,
    random_state=70,
    eval_metric=['auc'],
    early_stopping_rounds=100,   # 10% of n_estimators
)


tr_X = X_train
test_X = X_test


model_tsne.fit(tr_X, ytrain.values,
               eval_set=[(test_X, ytest.values)])
evals_result = model_tsne.evals_result()


# auc: 0.81646
model_tsne.best_score   # 1.096898
pred = model_tsne.predict(test_X)
# Score against `ytest`, the labels belonging to test_X (= X_test);
# `yte` comes from the other, differently shuffled split.
(pred == ytest.values).mean()    # 0.75



##############################
## umap
##############################

## 2D

# Imported here rather than at the top of the file because the import
# is slow and only this section needs it.
import umap

# Embedding of the standardised original data
reducer = umap.UMAP()
ss = StandardScaler()
dx = reducer.fit_transform(ss.fit_transform(orig_train))

sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)

# Embedding of the standardised engineered columns (from column 20 on)
reducer = umap.UMAP()
ss = StandardScaler()
org_trans_train.columns[20:]
da = reducer.fit_transform(ss.fit_transform(org_trans_train[org_trans_train.columns[20:]]))
da.shape
sns.scatterplot(x= da[:,0], y = da[:,1], hue = y_train.values)
sns.scatterplot(x= dx[:,0], y = dx[:,1], hue = y_train.values)


# Wrap both embeddings as DataFrames with columns c0, c1, ...
colnames = ["c" + str(i) for i in range(dx.shape[1])]
colnames
dx = pd.DataFrame(dx, columns = colnames)
da = pd.DataFrame(da, columns = colnames)




# Two independent random splits: one of the embedding of original data
# (dx), one of the embedding of transformed data (da).
X_train, X_test, ytrain, ytest = train_test_split(dx, y_train, test_size = 0.25 )
Xtrain, Xtest, ytr, yte = train_test_split(da, y_train, test_size = 0.25 )

# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
model_umap = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=6,
    subsample=0.8,
    random_state=70,
    eval_metric=['auc'],
    early_stopping_rounds=100,   # 10% of n_estimators
)


# Train and evaluate on the split of `da`
tr_X = Xtrain
test_X = Xtest


model_umap.fit(tr_X, ytr.values,
               eval_set=[(test_X, yte.values)])
evals_result = model_umap.evals_result()


model_umap.best_score
# Predict with the model fitted above (earlier an undefined
# `model_pca` was referenced here).
pred = model_umap.predict(test_X)
(pred == yte.values).mean()




#########################################
## Predictive analytics
########################################
# Call it only once
# See https://scikit-learn.org/stable/common_pitfalls.html#general-recommendations


# Baseline on original + transformed columns, all features.
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
gc.collect()
model0 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=6,
    subsample=0.8,
    random_state=seed,
    eval_metric=['auc'],
    early_stopping_rounds=100,   # 10% of n_estimators
)


tr_X = org_trans_train
test_X = org_trans_test


model0.fit(tr_X, y_train.values,
           eval_set=[(test_X, y_test.values)])
evals_result = model0.evals_result()


# auc: 0.81646
model0.best_score   # 0.81761; 820858; 0.816837; 0.892089; 0.876738; 0.884359; 0.885373
                    # 0.84595; 0.851114
pred = model0.predict(test_X)
(pred == y_test.values).mean()        # 0.7324 0.8022; 0.78395; 0.7954
                                      # 0.7664;0.7716
#plot_importance(model, importance_type = 'gain')


# The helper lives in the `utils` module (only `import utils` is done
# in this file), so it must be called through the module.
fe_1, fe_0 = utils.xg_impt_features(model0, org_trans_train.columns)

len(fe_1)   # 335  86  55 76   77  88
len(fe_0)   # 743  11  11 14   16  16



# Persist the ordered list of important features, one name per line.
# The context manager guarantees the file is closed (and flushed) even
# if writing fails part-way.
os.chdir(master)
with open('fe_1.txt', 'w') as file:
    file.writelines(item + "\n" for item in fe_1)

# Read fe_1 back, stripping the line terminators
os.chdir(master)
with open("fe_1.txt", 'r') as f:
    fe_1 = [line.rstrip('\n') for line in f]

len(fe_1)  # 77  88




##---------------
# With reduced best features
# Model restricted to the 15 highest-ranked features of fe_1.
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
gc.collect()
model1 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,          # same parameter as 'eta'
    max_depth=6,
    subsample=0.8,
    random_state=seed,
    eval_metric=['auc'],
    early_stopping_rounds=100,   # 10% of n_estimators
)


tr_X = org_trans_train[fe_1[:15]]     # Try from 7 to 30
test_X = org_trans_test[fe_1[:15]]


model1.fit(tr_X, y_train.values,
           eval_set=[(test_X, y_test.values)])
evals_result = model1.evals_result()


# auc: 0.81646
model1.best_score   # 0.7228

pred = model1.predict(test_X)
(pred == y_test.values).mean()    # 0.5244


fe_1[:6]


fe_1[:7]

##--------------------
# orig + binned
##--------------------
# Model on original + binned columns, all features.
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
gc.collect()
#del model
model2 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,         # same parameter as 'eta'
    max_depth=6,
    subsample=0.8,
    random_state=seed,
    eval_metric=['auc'],
    early_stopping_rounds=50,   # 5% of n_estimators
)


tr_X = org_binned_train
test_X = org_binned_test


model2.fit(tr_X, y_train.values,
           eval_set=[(test_X, y_test.values)])
evals_result = model2.evals_result()


# auc: 0.81646
model2.best_score   # 0.821435 ; 827361 ; 0.897
pred = model2.predict(test_X)
(pred == y_test.values).mean()    # 0.7324 ; 0.81

# The helper lives in the `utils` module (only `import utils` is done
# in this file), so it must be called through the module.
fe_11, fe_00 = utils.xg_impt_features(model2, org_binned_train.columns)
len(fe_11)
fe_00

##-------------------
# orig + binned best features
##-------------------


# Model on original + binned columns, restricted to the features in
# fe_11 (those with non-zero importance in model2).
# eval_metric / early_stopping_rounds are set on the estimator (current
# xgboost API); `evals_result` is read from the fitted model.
gc.collect()
#del model
model3 = xgb.XGBClassifier(
    n_estimators=1000,
    verbosity=3,
    learning_rate=0.06,         # same parameter as 'eta'
    max_depth=6,
    subsample=0.8,
    random_state=rng,
    eval_metric=['auc'],
    early_stopping_rounds=50,   # 5% of n_estimators
)


tr_X = org_binned_train[fe_11]
test_X = org_binned_test[fe_11]


model3.fit(tr_X, y_train.values,
           eval_set=[(test_X, y_test.values)])
evals_result = model3.evals_result()


# auc: 0.81646
model3.best_score   # 826236; 826423
pred = model3.predict(test_X)
(pred == y_test.values).mean()    # 0.7324



##--------------------
##-------------------
# Model 4: XGBoost on the first 5 entries of fe_4_1, taken from the orig data
##-------------------


# Rebinding to 0 before `del` guarantees the name exists, so the cell can be
# re-run without a NameError while still releasing any earlier model.
model4 = 0

gc.collect()
del model4

# NOTE: `evals_result` is not an XGBClassifier constructor argument; passing
# it there only forwards an unused parameter to the booster and the dict is
# never filled. The history is read back after fitting instead (see below).
model4 = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           random_state = seed
                           )


# NOTE(review): fe_4_1 is only assigned further down in this section, so on a
# fresh top-to-bottom run it must already exist from an earlier cell.
tr_X =  orig_train[fe_4_1[:5]]
test_X =  orig_test[fe_4_1[:5]]



model4.fit(tr_X, y_train.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, y_test.values)],
          eval_metric = ['auc']
          )

# Per-iteration metric history recorded on eval_set during fit.
evals_result = model4.evals_result()



# Values recorded by the author from earlier runs:
# auc: 0.81646
model4.best_score   # 0.7335065739582236
# Plain accuracy on the evaluation split.
pred = model4.predict(test_X)
(pred == y_test).sum()/y_test.size    # 0.544

# NOTE(review): the model above was fitted on 5 columns, but the full
# orig_train column list is passed here -- confirm xg_impt_features copes.
fe_4_1, fe_4_0 = xg_impt_features(model4,orig_train.columns  )

fe_4_1[:5]

##--------------------
# Model 4_1: same 5 orig features as model4, fixed integer seed (70)
##--------------------

fe_4_1[:5]

# Rebinding to 0 before `del` guarantees the name exists, so the cell can be
# re-run without a NameError while still releasing any earlier model.
model4_1 = 0

gc.collect()
del model4_1

# NOTE: `evals_result` is not an XGBClassifier constructor argument; passing
# it there only forwards an unused parameter to the booster and the dict is
# never filled. The history is read back after fitting instead (see below).
model4_1 = xgb.XGBClassifier( n_estimators= 1000,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           random_state = 70
                           )


tr_X =  orig_train[fe_4_1[:5]]
test_X =  orig_test[fe_4_1[:5]]



model4_1.fit(tr_X, y_train.values,                   # Xtr, ytr
          early_stopping_rounds = 100,   # 10% of n_estimators
          eval_set=[ (test_X, y_test.values)],
          eval_metric = ['auc']
          )

# Per-iteration metric history recorded on eval_set during fit.
evals_result = model4_1.evals_result()



# Values recorded by the author from earlier runs:
# auc: 0.81646
model4_1.best_score   # 831523 ; 824436 ; 0.8288 ; 0.897301 ; 0.880147; (0.891444, 0.892768, 0.893049)
                    # (0.858484,0.862771, 0.874083 )
# Plain accuracy on the evaluation split.
pred = model4_1.predict(test_X)
(pred == y_test).sum()/y_test.size    # 0.7376 ; 0.81; 0.7881; 0.8014, 0.8044
                                      # 0.7788; 0.7918

###################################




###################################
####################################


# One-hot encode train_train, project the result onto 3 principal components
# and write both the projection and the target to CSV files under dataPath.

# Detach the target; pop() removes the column from train_train in place.
y = train_train.pop('target')
train_train.head()

# Dense one-hot encoding of every remaining column.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` -- check against the installed version.
ohe = OneHotEncoder(sparse=False)
ohe.fit(train_train)
train_ohe = ohe.transform(train_train)
train_ohe.shape  # (7500, 89)

# Wrap the encoded array in a DataFrame with generated names c0, c1, ...
cl = [f"c{k}" for k in range(train_ohe.shape[1])]
train_ohe = pd.DataFrame(data=train_ohe, columns=cl)
train_ohe.head()
train_ohe.shape  # (7500,75)




# Reduce the one-hot matrix to 3 components.
pca = PCA(n_components=3)
train_pca = pca.fit_transform(train_ohe)
train_ohe.head()
cx = [f"c{k}" for k in range(train_pca.shape[1])]
train_pca = pd.DataFrame(data=train_pca, columns=cx)
train_pca.head()



# The relative file names below resolve against dataPath.
os.chdir(dataPath)

train_pca.to_csv("train_pca.csv", index=False)
y.to_csv("y_train_pca.csv", index=False)
y.head()


##################Model with orig data #####################


# X is the same object as orig_train; pop() strips the target from it in place.
X = orig_train
y = orig_train.pop('target')
X.columns
X.head()
y

# Stratified 75/25 split with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split( X,y,
                                                 test_size = 0.25,
                                                 stratify = y,
                                                 random_state = 384)

gc.collect()

# NOTE: `evals_result` is not an XGBClassifier constructor argument; passing
# it there only forwards an unused parameter to the booster and the dict is
# never filled. The history is read back after fitting instead (see below).
model = xgb.XGBClassifier( n_estimators= 700,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           random_state = 800
                           )

tr_X =  X_train
test_X =  X_test


model.fit(tr_X, y_train,                   # Xtr, ytr
          early_stopping_rounds = 50,   # about 7% of n_estimators
          eval_set=[ (test_X, y_test)],
          eval_metric = ['merror']
          )

# Per-iteration metric history recorded on eval_set during fit.
evals_result = model.evals_result()



# Plain accuracy on the held-out split (author's recorded values on the right).
pred = model.predict(test_X)
(pred == y_test).sum()/y_test.size    # 94.93%   91.8%  94.73  98.2(class_Sep = 2.0)
# Qualified through the xgb module, which this file imports at the top.
xgb.plot_importance(model, importance_type = 'gain')

################## Model with discrete features #####################


# X is the same object as train_train; pop() strips the target from it in place.
# NOTE(review): an earlier cell also pops 'target' from train_train; if that
# cell ran first without reloading the frame, this pop raises KeyError.
X = train_train
y = train_train.pop('target')
X.columns
X.head()
y

# Cast every feature column to int, assigning column by column so the shared
# train_train frame is updated as well.
for col in X.columns:
    X[col] = X[col].astype('int')


# Stratified 75/25 split with a fixed seed.
X_train,X_test,y_train,y_test = train_test_split( X,y,
                                                 test_size = 0.25,
                                                 stratify = y,
                                                 random_state = 384)

gc.collect()
del model

# NOTE: `evals_result` is not an XGBClassifier constructor argument; passing
# it there only forwards an unused parameter to the booster and the dict is
# never filled. The history is read back after fitting instead (see below).
model = xgb.XGBClassifier( n_estimators= 700,
                           verbosity = 3,
                           eta = 0.06,      # 0.06
                           max_depth = 6,
                           subsample = 0.8,           # 0.8
                           random_state = 800
                           )

tr_X =  X_train
test_X =  X_test


model.fit(tr_X, y_train,                   # Xtr, ytr
          early_stopping_rounds = 50,   # about 7% of n_estimators
          eval_set=[ (test_X, y_test)],
          eval_metric = ['merror']
          )

# Per-iteration metric history recorded on eval_set during fit.
evals_result = model.evals_result()



# Plain accuracy on the held-out split (author's recorded values on the right).
pred = model.predict(test_X)
(pred == y_test).sum()/y_test.size    # 94.6% ; 95%  90.8%  94.86  98.86(class sep = 2.0)
# Qualified through the xgb module, which this file imports at the top.
xgb.plot_importance(model, importance_type = 'gain')

##############################################################



# Two 10x10 scatter plots: 'fe' vs 'fd' on the current training split, and
# 'fe' vs 'fb' on orig_train, each coloured by a target series.
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.scatterplot(
    data=tr_X,
    x='fe',
    y='fd',
    hue=y_train,
    ax=ax,
    alpha=0.4,
)

# NOTE(review): `y` here is whichever target series was assigned last;
# confirm it lines up with orig_train.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
sns.scatterplot(
    data=orig_train,
    x='fe',
    y='fb',
    hue=y,
    ax=ax,
    palette="Set2",
)





#################################################################



# Pairwise scatter plots of x1/x2/x3 from X, coloured by y.
plt.figure(1)
plt.clf()
colors = ["#dede00", "#377eb8", "#f781bf"]
markers = ["x", "o", "^"]

# Three clusters can be seen
# One new 8x8 figure per axis pair, drawn in the same order as before.
for x_name, y_name in (("x1", "x2"), ("x2", "x3"), ("x1", "x3")):
    fig = plt.figure(figsize=(8, 8))
    _ = sns.scatterplot(data=X, x=x_name, y=y_name, hue=y)