In [1]:
# essential libraries
import pandas as pd
import numpy as np

# suppress all warning messages (note: this also hides deprecation warnings from the libraries used below)
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# set seaborn and matplotlib default theme
sns.set_theme()
# keep the plotting-context parameters that are active right after set_theme()
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): as a bare call, plotting_context('poster') only returns a parameter
# set and the result is discarded here, so this line presumably does not switch the
# notebook to the 'poster' context. If that was the intent, sns.set_context('poster')
# is the call that applies it -- confirm before changing, since it rescales every figure.
sns.plotting_context('poster')

# set seaborn and matplotlib style to ...
# plt.style.use('classic')
# override title / axis-label font sizes in matplotlib's global rcParams
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to render raw HTML through IPython.display
from IPython.display import HTML

import os


In [2]:
def blogData_train_read(data_path="./data/blogData_train.csv",
                        label_path="./data/blogData_label.csv") :
    u''' Read the blog-feedback training set and attach its column names.

    Parameters
    ----------
    data_path : str or path-like
        Header-less CSV with the observations. Defaults to the location used
        by this notebook.
    label_path : str or path-like
        Header-less CSV whose first column holds one column name per line.
        Defaults to the location used by this notebook.

    Returns
    -------
    pandas.DataFrame
        The observations with exact duplicate rows removed, a fresh 0..n-1
        index, and columns named after the entries of `label_path`.

    Raises
    ------
    ValueError
        If the number of names does not match the number of data columns.
    '''

    # chain instead of inplace=True: the frame read from disk is never mutated
    data = (
        pd.read_csv(data_path, header=None)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    header = list(pd.read_csv(label_path, header=None)[0])

    # ValueError is still caught by callers that use `except Exception`
    if len(header) != data.shape[1] :
        raise ValueError('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header

    return data


In [3]:
# load the de-duplicated training frame with named columns
data = blogData_train_read()
# (rows, columns) of the loaded frame
data.shape

(49203, 281)

In [5]:
def blogData_labels(data) :
    u''' Map group names to lists of column names, selected purely by position.

    `data` only needs a `.columns` attribute. Every entry of the table below
    is (key, start, stop) and yields `columns[start:stop]`; a frame with fewer
    columns than a slice expects simply gets a shorter (or empty) list.

    Some keys deliberately overlap: 'nc', 'nl' and 'tl_post' are unions of the
    single-column keys listed just before them.
    NOTE(review): the meaning of each group is only implied by the key names
    (e.g. 'sd_*' presumably are summary statistics) -- confirm against the
    dataset description.
    '''
    names = list(data.columns)

    layout = (
        # five-column groups, comment-count based
        ('sd_nc_total_before_BT', 0, 5),
        ('sd_nc_24_before_BT', 5, 10),
        ('sd_nc_between_24_48', 10, 15),
        ('sd_nc_first_24_BT', 15, 20),
        ('sd_nc_diff_24_48', 20, 25),
        # five-column groups, link-count based
        ('sd_nl_total_before_BT', 25, 30),
        ('sd_nl_24_before_BT', 30, 35),
        ('sd_nl_between_24_48', 35, 40),
        ('sd_nl_first_24_BT', 40, 45),
        ('sd_nl_diff_24_48', 45, 50),
        # single-column 'nc' features
        ('nc_total_before_BT', 50, 51),
        ('nc_24_before_BT', 51, 52),
        ('nc_between_24_48', 52, 53),
        ('nc_first_24_BT', 53, 54),
        ('nc_diff_24_48', 54, 55),
        # single-column 'nl' features
        ('nl_total_before_BT', 55, 56),
        ('nl_24_before_BT', 56, 57),
        ('nl_between_24_48', 57, 58),
        ('nl_first_24_BT', 58, 59),
        ('nl_diff_24_48', 59, 60),
        # unions of the two single-column families above
        ('nc', 50, 55),
        ('nl', 55, 60),
        # post descriptors and their union
        ('timelength_post_BT', 60, 61),
        ('length_post', 61, 62),
        ('tl_post', 60, 62),
        # 200 word-indicator columns
        ('frequent_word', 62, 262),
        # two seven-column weekday groups
        ('weekday_BT', 262, 269),
        ('weekday_post', 269, 276),
        # remaining features and the last column
        ('parents', 276, 280),
        ('comments', 280, 281),
    )

    return {key: names[start:stop] for key, start, stop in layout}


In [6]:

# positional groups of column names for the loaded frame
labels = blogData_labels(data)
# column name used further down to split `data` into features and target
target = 'comments'


---

In [3]:

from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# to tune hyperparameters
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import roc_auc_score

def ROUND(v, ndigits=4) :
    u'''Round `v` to `ndigits` decimal places (4 by default, the notebook's reporting precision).'''
    return round(v, ndigits)

---


In [9]:

class RM_Estimator :
    u'''Bundle a regression model with its grid-search configuration.

    Attributes
    ----------
    name : display label for the model.
    estimator : the untuned estimator instance handed to the grid search.
    gs_param_grid : parameter grid for the search (None when not supplied).
    gs_estimator : starts as None; rm_evaluate stores the refit best
        estimator here after the search.
    '''

    def __init__(self, name, estimator, gs_param_grid=None) :
        self.name, self.estimator = name, estimator
        self.gs_param_grid = gs_param_grid
        # filled in later, once a grid search has been run for this model
        self.gs_estimator = None

In [10]:

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import time

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb

from sklearn.metrics import mean_squared_error


In [11]:
def rm_evaluate(rm_result, rm_models, X_train, y_train) :
    u'''Grid-search every model in `rm_models` and add one summary row per model.

    Parameters
    ----------
    rm_result : pandas.DataFrame
        Accumulator whose columns receive, in order: model name, best
        parameters, best cross-validation score, training RMSE.
    rm_models : iterable of RM_Estimator
        Each item provides `name`, `estimator` and `gs_param_grid`; its
        `gs_estimator` attribute is overwritten with the refit best estimator.
        A `gs_param_grid` of None is searched as an empty grid, i.e. the
        estimator's default parameters.
    X_train, y_train
        Training features and target passed to `GridSearchCV.fit`.

    Returns
    -------
    pandas.DataFrame
        `rm_result` followed by the new rows, re-indexed from 0. When there
        are no models, `rm_result` is returned unchanged.

    Notes
    -----
    * The cross-validation score is the negated RMSE
      ('neg_root_mean_squared_error'), so higher is better.
    * The RMSE column is computed on `X_train` itself (in-sample).
    * NOTE(review): StratifiedKFold stratifies on `y_train` as if it held
      class labels; for a regression target a plain KFold is the usual choice.
      Kept as-is so the folds do not change -- confirm.
    '''

    scoring = 'neg_root_mean_squared_error'
    rows = []

    for rm in rm_models :

        # fresh splitter per model with a fixed seed, so every model gets the same folds
        cv = StratifiedKFold(n_splits=2, random_state=11, shuffle=True)

        gs = GridSearchCV(
            estimator=rm.estimator, # scikit-learn estimator interface
            param_grid=rm.gs_param_grid if rm.gs_param_grid is not None else {}, # key=parameter, value=list of candidate values
            scoring=scoring, # strategy to evaluate each cross-validated fit
            n_jobs=-2, # jobs in parallel -2 : all processors minus one
            refit=True, # refit estimator using best parameters
            cv=cv, # cross-validation splitting strategy
            return_train_score=False, # do not compute training scores
            verbose=1 # display fold parameters, score, time, ...
        )

        print('Gridsearch para', rm.name, '...')

        gs.fit(X_train, y_train)
        rm.gs_estimator = gs.best_estimator_

        # in-sample error of the refit best estimator
        y_pred = gs.predict(X_train)
        gs_rmse = ROUND(np.sqrt(mean_squared_error(y_train, y_pred)))

        rows.append([rm.name, gs.best_params_, gs.best_score_, gs_rmse])

    if not rows :
        return rm_result

    # DataFrame.append was removed in pandas 2.0: build the new rows once and concatenate
    new_rows = pd.DataFrame(rows, columns=rm_result.columns)
    if rm_result.empty :
        return new_rows

    return pd.concat([rm_result, new_rows], ignore_index=True)
    

In [12]:

# Models to tune. Every grid lists a single value per parameter, so the grid
# search evaluates exactly one candidate per model.
# NOTE(review): n_jobs=-2 is set on the estimators while rm_evaluate also runs
# GridSearchCV with n_jobs=-2 -- confirm the nested parallelism is intended.
rm_models = [
    RM_Estimator(
        name='Random Forest Regressor',
        estimator=RandomForestRegressor(),
        gs_param_grid=dict(
            max_depth=[3],
            n_estimators=[500],
            n_jobs=[-2],
            random_state=[127],
        ),
    ),
    RM_Estimator(
        name='XGBoost L1 y L2',
        estimator=xgb.XGBRegressor(),
        gs_param_grid=dict(
            gamma=[1],
            learning_rate=[0.2],
            max_depth=[12],
            n_estimators=[1000],
            n_jobs=[-2],         # jobs in parallel -2 : all processors minus one
            random_state=[127],
            reg_alpha=[1000],    # L1 regularization
            reg_lambda=[1000],   # L2 regularization
        ),
    ),
]



---

In [13]:

# separate the target column from the features
y_train = data[target].copy()
X_train = data.drop(columns=[target])

# standardise the features with a scaler fitted on this same frame, then
# rebuild a DataFrame so the column names survive the transform
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(
    scaler.transform(X_train),
    columns=list(X_train.columns),
)



---

In [14]:
# ignore code
# one summary row per tuned model
rm_columns = ['model', 'params', 'cv_score', 'RMSE']
rm_result = pd.DataFrame(columns=rm_columns)

rm_result = rm_evaluate(rm_result, rm_models, X_train, y_train)

# wide enough to show the whole best-parameters dict in one cell
pd.options.display.max_colwidth = 500

# Kept as the last top-level expression so Jupyter renders the table; inside the
# former `if True :` block the sorted frame was computed and silently discarded.
rm_result.sort_values(by=['RMSE'], axis='index')

Gridsearch para Random Forest Regressor ...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Gridsearch para XGBoost L1 y L2 ...
Fitting 2 folds for each of 1 candidates, totalling 2 fits


---

In [16]:


def get_feature_importance(rm_models, data_columns) :
    u'''Average the fitted models' feature importances and rank the features.

    Parameters
    ----------
    rm_models : iterable
        Items whose `gs_estimator` attribute exposes `feature_importances_`
        (one value per entry of `data_columns`).
    data_columns : iterable of str
        Feature names, in the same order as the importances.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'rate' (mean importance over the models) and 'rank'
        (0 = most important), sorted by 'rate' in descending order.

    Raises
    ------
    ValueError
        If a model's importances do not have one value per feature name.
    '''

    # One row per model, one column per feature. Building the frame in a single
    # step avoids DataFrame.append (removed in pandas 2.0) and keeps the frame
    # purely numeric, so mean() never meets the model-name strings.
    importances = pd.DataFrame(
        [rm.gs_estimator.feature_importances_ for rm in rm_models],
        columns=list(data_columns),
        dtype=float
    )

    result = (
        importances.mean()
        .rename('rate')
        .rename_axis('feature')
        .reset_index()
        .sort_values(by='rate', ascending=False)
        .reset_index(drop=True)
    )
    result['rank'] = result.index

    return result


In [17]:
def show_feature_importance(rm_models, data_columns) :
    u'''Build one table with the overall feature ranking plus each model's own.

    Starts from get_feature_importance over all models (columns 'feature',
    'rate', 'rank') and, for every model, inner-joins that model's individual
    ranking on 'feature'. The per-model columns are named 'rate <name>' and
    'rank <name>', where <name> is the model's `name`.
    '''
    per_model_columns = ['feature', 'rate', 'rank']

    # ranking averaged over every model
    table = get_feature_importance(rm_models, data_columns)

    for model in rm_models :
        # ranking computed from this model alone
        single = get_feature_importance([model], data_columns)[per_model_columns]
        table = table.merge(
            right=single,
            on='feature',
            how='inner',
            suffixes=(None, ' ' + model.name)
        )

    return table

# overall ranking plus one rate/rank column pair per tuned model
show_feature_importance(rm_models=rm_models, data_columns=X_train.columns)

#


Unnamed: 0,feature,rate,rank,rate Random Forest Regressor,rank Random Forest Regressor,rate XGBoost L1 y L2,rank XGBoost L1 y L2
0,timelength_post_BT,0.177859,0,0.345726,0,0.009992,11
1,std_nc_diff_24_48,0.096395,1,0.039033,7,0.153757,1
2,nc_24_before_BT,0.084905,2,0.159542,1,0.010268,9
3,std_nc_24_before_BT,0.083842,3,0.044640,6,0.123043,2
4,median_nc_24_before_BT,0.083087,4,0.008712,16,0.157462,0
...,...,...,...,...,...,...,...
275,fw_47,0.000000,275,0.000000,258,0.000000,249
276,min_nl_between_24_48,0.000000,276,0.000000,198,0.000000,252
277,fw_87,0.000000,277,0.000000,239,0.000000,235
278,median_nl_between_24_48,0.000000,278,0.000000,200,0.000000,271


In [20]:
# first 20 rows of the ranking for rm_models[1] alone (the second model registered, 'XGBoost L1 y L2')
get_feature_importance(rm_models=[rm_models[1]], data_columns=X_train.columns)[:20]

Unnamed: 0,feature,rate,rank
0,median_nc_24_before_BT,0.157462,0
1,std_nc_diff_24_48,0.153757,1
2,std_nc_24_before_BT,0.123043,2
3,media_nc_diff_24_48,0.070029,3
4,std_nc_total_before_BT,0.042128,4
5,media_nc_total_before_BT,0.025601,5
6,nl_first_24_BT,0.024752,6
7,nc_diff_24_48,0.012668,7
8,media_nc_between_24_48,0.010701,8
9,nc_24_before_BT,0.010268,9
