In [1]:
# imprescindible
import pandas as pd
import numpy as np

# to avoid some warnings messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# set seaborn and matplotlib default theme
sns.set_theme()
_sns_plotting_contex_ = sns.plotting_context()
sns.plotting_context('poster')

# set seaborn and matplotlib style to ...
# plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to use HTML codes within IPpython.display function
from IPython.display import HTML

import os


In [2]:
def set_figure(row, col, suptitle=None) :
    u''' Activate matplot figure setting size and super title
    '''
    fig = plt.figure(figsize=(row, col));
    if suptitle != None :
        fig.suptitle(suptitle, 
                     verticalalignment='center', fontsize='xx-large', fontweight='extra bold');
    return fig

In [3]:

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# to tune hiperparameters
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score

In [4]:
# to keep only 4 digits
ROUND = lambda v : round(v, 4)


### Data

In [5]:
data_raw = pd.read_csv("./data/blogData_train.csv", header=None)
data_raw.drop_duplicates(inplace=True)

In [6]:
data_raw.shape

(49203, 281)

In [7]:
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else (2 if v < 150 else (3 if v < 210 else 4)))
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else 2)
to_classes = lambda v : 0 if v < 30 else 1

In [8]:
X_train = data_raw.iloc[:,0:280]
y_train = data_raw.iloc[:,-1]

y_train = y_train.apply(to_classes)

In [9]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

In [10]:

filepath = './data/test/'
filelist = [os.path.join(filepath, filename) for filename in os.listdir(filepath) if os.path.isfile(os.path.join(filepath, filename))]

test_raw = pd.DataFrame()

for filename in filelist :
    temp_raw = pd.read_csv(filename, header=None)
    temp_raw.drop_duplicates(inplace=True)
    test_raw = test_raw.append(temp_raw)

X_test = test_raw.iloc[:,0:280]
y_test = test_raw.iloc[:,-1]

y_test = y_test.apply(to_classes)

# using train scaler
X_test = scaler.transform(X_test)


---

---


In [11]:

# def classification_gridsearch_evaluate(X_train, y_train) :
#     u'''
#     '''
if True :
    class GS_Estimator :
        u'''
        '''

        def __init__(self, name, estimator, gs_param_grid=None) :
            self.name = name
            self.estimator = estimator
            self.gs_param_grid = gs_param_grid
            self.gs_estimator = None

            return        
# ---
    models = []

    models.append(
        GS_Estimator(
            name='XGBoost Classifier 1',
            estimator=xgb.XGBClassifier(),
            gs_param_grid={
                'eval_metric' : ['auc'], # 
                'gamma' : [0, 1], # (min_split_loss) minimum loss reduction
                'learning_rate' : [0.1], # (eta) step size shrinkage
                'max_depth' : [6], # maximum depth of tree
                'n_estimators' : [100], 
                'n_jobs' : [-1], # use all processors
                'objective' : ['binary:logistic'], # for binary classification 
                'random_state' : [127], 
                # 'subsample' : [0.1, 0.5, 1], # prevents overfitting
            }
        )
    )
    models.append(
        GS_Estimator(
            name='XGBoost Classifier 2',
            estimator=xgb.XGBClassifier(),
            gs_param_grid={
                'eval_metric' : ['auc'], # 
                'gamma' : [0], # (min_split_loss) minimum loss reduction
                'learning_rate' : [0.1], # (eta) step size shrinkage
                'max_depth' : [8, 10], # maximum depth of tree
                'n_estimators' : [500], 
                'n_jobs' : [-1], # use all processors
                'objective' : ['binary:logistic'], # for binary classification 
                'random_state' : [127], 
                # 'subsample' : [0.1, 0.5, 1], # prevents overfitting
            }
        )
    )
    
# ---

    gs_results = pd.DataFrame(columns=['model', 'best params', 'best score', 'train ROC AUC'])

    for m in models :
        scoring = 'roc_auc'
        cv = StratifiedKFold(n_splits=2, random_state=11, shuffle=True)
        gs = GridSearchCV(
            estimator=m.estimator, # scikit-learn estimator interface
            param_grid=m.gs_param_grid, # dictionart key=parametrer, value=list of paraameter posible values
            scoring=scoring, # strategy to evaluate performance of cross-validated
            n_jobs=-2, # jobs in parallel -2 : all processors minus one
            refit=True, # refit estimator using best parameters
            cv=cv, # cross-validated splitting strategy
            return_train_score=False, # include training scores
            verbose=3 # display fold parameters, score, time, ...
        )
        
        print('Gridsearch para', m.name, '...')

        gs.fit(X_train, y_train)
        m.gs_estimator = gs.best_estimator_
        
        y_train_pred = gs.predict(X_train)
        gs_train_roc_auc = ROUND(roc_auc_score(y_train, y_train_pred))

        gs_results = gs_results.append(
            pd.Series(
                data=[m.name, 
                      gs.best_params_, 
                      gs.best_score_, 
                      gs_train_roc_auc
                     ], 
                index=gs_results.columns
                ),
            ignore_index=True
        )

    pd.options.display.max_colwidth = 500 
    display(gs_results.sort_values(by=['train ROC AUC'], axis='index'))
    
    # return

# CAUTION !!!, it take a lot of time to run grid search
# comment to hide grid search evaluate
# classification_gridsearch_evaluate(X_train, y_train)


Gridsearch para XGBoost Classifier 1 ...
Fitting 2 folds for each of 2 candidates, totalling 4 fits


  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index
  from pandas import MultiIndex, Int64Index


Gridsearch para XGBoost Classifier 2 ...
Fitting 2 folds for each of 2 candidates, totalling 4 fits


  from pandas import MultiIndex, Int64Index


[CV 2/2] END eval_metric=auc, gamma=0, learning_rate=0.1, max_depth=6, n_estimators=100, n_jobs=-1, objective=binary:logistic, random_state=127;, score=0.963 total time= 5.8min


Unnamed: 0,model,best params,best score,train ROC AUC
0,XGBoost Classifier 1,"{'eval_metric': 'auc', 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 100, 'n_jobs': -1, 'objective': 'binary:logistic', 'random_state': 127}",0.966265,0.8147
1,XGBoost Classifier 2,"{'eval_metric': 'auc', 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 500, 'n_jobs': -1, 'objective': 'binary:logistic', 'random_state': 127}",0.963551,0.9977
