In [1]:
# essential imports
import pandas as pd
import numpy as np

# silence warning messages (note: this also hides deprecation notices)
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# set seaborn and matplotlib default theme
sns.set_theme()
# keep a snapshot of the plotting-context parameters that are active at this point
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): called outside a `with` block and with its return value discarded,
# so this line presumably changes nothing; `sns.set_context('poster')` looks like
# the intent — confirm before changing, since it would resize every figure.
sns.plotting_context('poster')

# override the title and axis-label font sizes in matplotlib's rcParams
# plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to render HTML through IPython.display
from IPython.display import HTML



### Data

In [2]:
def blogData_train_read(train_path="./data/blogData_train.csv", label_path="./data/blogData_label.csv") :
    u''' Read the blog feedback training set and attach its column names.

    The training file is read without a header row, exact duplicate rows are
    dropped and the index is renumbered from zero. Column names come from the
    first column of the label file, one name per row.

    Parameters
    ----------
    train_path : str or path-like
        CSV file with the training rows (no header line).
    label_path : str or path-like
        CSV file whose first column lists the column names (no header line).

    Returns
    -------
    pandas.DataFrame
        De-duplicated training data with named columns.

    Raises
    ------
    ValueError
        If the number of names differs from the number of data columns.
    '''

    data = pd.read_csv(train_path, header=None)
    # chained, non-mutating form of drop_duplicates + reset_index
    data = data.drop_duplicates().reset_index(drop=True)

    header = pd.read_csv(label_path, header=None)
    header = list(header[0])

    if len(header) != data.shape[1] :
        # ValueError is a subclass of Exception, so existing handlers still catch it
        raise ValueError('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header

    return data


In [3]:
# load the training set; the trailing expression displays (rows, columns)
data = blogData_train_read()
data.shape

(49203, 281)

In [4]:
# data

In [5]:
def blogData_labels(data) :
    u''' Group the columns of ``data`` by position under descriptive keys.

    Every key maps to the list of column names found in a fixed positional
    slice of ``data.columns``. Some keys overlap on purpose: 'nc', 'nl' and
    'tl_post' gather columns that are also exposed one by one under their own
    keys. Slices that reach past the last column simply yield shorter (or
    empty) lists.

    Returns a dict whose keys keep the order of the table below.
    '''
    names = list(data.columns)

    # (key, first position, position after the last one)
    spans = (
        ('sd_nc_total_before_BT', 0, 5),
        ('sd_nc_24_before_BT', 5, 10),
        ('sd_nc_between_24_48', 10, 15),
        ('sd_nc_first_24_BT', 15, 20),
        ('sd_nc_diff_24_48', 20, 25),

        ('sd_nl_total_before_BT', 25, 30),
        ('sd_nl_24_before_BT', 30, 35),
        ('sd_nl_between_24_48', 35, 40),
        ('sd_nl_first_24_BT', 40, 45),
        ('sd_nl_diff_24_48', 45, 50),

        ('nc_total_before_BT', 50, 51),
        ('nc_24_before_BT', 51, 52),
        ('nc_between_24_48', 52, 53),
        ('nc_first_24_BT', 53, 54),
        ('nc_diff_24_48', 54, 55),

        ('nl_total_before_BT', 55, 56),
        ('nl_24_before_BT', 56, 57),
        ('nl_between_24_48', 57, 58),
        ('nl_first_24_BT', 58, 59),
        ('nl_diff_24_48', 59, 60),

        # aggregated views over the single-column groups above
        ('nc', 50, 55),
        ('nl', 55, 60),

        ('timelength_post_BT', 60, 61),
        ('length_post', 61, 62),

        ('tl_post', 60, 62),

        ('frequent_word', 62, 262),

        ('weekday_BT', 262, 269),
        ('weekday_post', 269, 276),

        ('parents', 276, 280),
        ('comments', 280, 281),
    )

    return {key: names[first:stop] for key, first, stop in spans}


In [6]:

labels = blogData_labels(data)
# name of the column to predict; later cells drop it from the features and use it as y
target = 'comments'


---

In [7]:
def ROUND(v) :
    u''' Round ``v`` to four decimal places. '''
    return round(v, 4)

---

In [8]:

from sklearn.preprocessing import StandardScaler


---
<a name="Regression_Models"></a>
### Regression Models

In [9]:

class RM_Estimator :
    u''' Bundle a regression estimator with its display name and grid-search settings.

    Attributes
    ----------
    name : label used when reporting results
    estimator : the estimator object handed to the grid search
    gs_param_grid : parameter grid for the grid search (None when not given)
    gs_estimator : starts as None; rm_evaluate stores the refitted best estimator here
    '''

    def __init__(self, name, estimator, gs_param_grid=None) :
        # self.alias = alias
        self.name, self.estimator = name, estimator
        self.gs_param_grid = gs_param_grid
        # no grid search has been run yet
        self.gs_estimator = None

In [10]:

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import time

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb

from sklearn.metrics import mean_squared_error


In [11]:
def rm_evaluate(rm_result, rm_models, X_train, y_train) :
    u''' Grid-search every model in ``rm_models`` and add one summary row per model.

    Parameters
    ----------
    rm_result : pandas.DataFrame
        Table the new rows are added to. Its columns must be, in order:
        model name, best parameters, best cross-validation score, RMSE.
    rm_models : iterable of RM_Estimator
        Models to tune. The ``gs_estimator`` attribute of each one is
        overwritten with the best estimator refitted on the full training data.
    X_train, y_train
        Training features and target handed to ``GridSearchCV.fit``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``rm_result`` followed by one row per model.
        ``rm_result`` itself is not modified.

    Notes
    -----
    The RMSE column is computed on the training data itself, so it is not a
    held-out score; the cross-validated score is the ``best_score_`` column
    (negative RMSE, because of the scoring name used).
    '''

    scoring = 'neg_root_mean_squared_error'
    rows = []

    for rm in rm_models :

        # NOTE(review): StratifiedKFold stratifies on y as if it were class labels;
        # for a regression target KFold is the usual choice — confirm this is intended.
        cv = StratifiedKFold(n_splits=2, random_state=11, shuffle=True)

        gs = GridSearchCV(
            estimator=rm.estimator, # scikit-learn estimator interface
            param_grid=rm.gs_param_grid, # dictionary: key=parameter, value=list of candidate values
            scoring=scoring, # strategy used to score each cross-validation fold
            n_jobs=-2, # jobs in parallel -2 : all processors minus one
            refit=True, # refit estimator using best parameters
            cv=cv, # cross-validation splitting strategy
            return_train_score=False, # do not compute training scores
            verbose=1 # display fold parameters, score, time, ...
        )

        print('Gridsearch para', rm.name, '...')

        gs.fit(X_train, y_train)
        rm.gs_estimator = gs.best_estimator_

        # error of the refitted best estimator on the data it was trained on
        y_pred = gs.predict(X_train)
        gs_rmse = ROUND(np.sqrt(mean_squared_error(y_train, y_pred)))

        rows.append([rm.name, gs.best_params_, gs.best_score_, gs_rmse])

    if not rows :
        return rm_result

    # DataFrame.append was removed in pandas 2.0: build all new rows at once
    new_rows = pd.DataFrame(rows, columns=rm_result.columns)

    if rm_result.empty :
        return new_rows

    return pd.concat([rm_result, new_rows], ignore_index=True)
    

In [12]:

# list of RM_Estimator wrappers to tune; only the uncommented entries in this cell are added
rm_models = []

# rm_models.append(
#     RM_Estimator(
#         name='Linear Regression',
#         estimator=LinearRegression(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Ridge',
#         estimator=Ridge(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='Lasso',
#         estimator=Lasso(),
#         gs_param_grid={
#             'fit_intercept' : [True]
#         }
#     )
# )


# rm_models.append(
#     RM_Estimator(
#         name='Elastic Net',
#         estimator=ElasticNet(),
#         gs_param_grid={
#             'alpha' : [1.0], 
#             'l1_ratio' : [0, 0.5, 1] # 0 : no L2 penalty (Ridge);  1 : no L1 penalty (Lasso)
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='K-Nearest Neighbors',
#         estimator=KNeighborsRegressor(),
#         gs_param_grid={
#             'n_jobs' : [-2], 
#             'n_neighbors' : [5, 10], 
#             'p' : [2], # euclidian_distance
#             'weights' : ['uniform'] # equally weighted
#         }
#     )
# )

# Random forest entry: every grid value is a single-element list, so the grid
# holds exactly one parameter combination.
# NOTE(review): 'n_jobs' is set here and again on GridSearchCV in rm_evaluate —
# confirm the nested parallelism is intended.
rm_models.append(
    RM_Estimator(
        name='Random Forest Regressor',
        estimator=RandomForestRegressor(),
        gs_param_grid={
            'max_depth' : [3], 
            'n_estimators' : [500], 
            'n_jobs' : [-2], 
            'random_state' : [127]
        }
    )
)

# rm_models.append(
#     RM_Estimator(
#         name='Gradient Boosting Regressor',
#         estimator=GradientBoostingRegressor(),
#         gs_param_grid={
#             'learning_rate' : [0.1, 0.2], 
#             'max_depth' : [3], 
#             'n_estimators' : [500], 
#             'random_state' : [127], 
#             'verbose' : [0]
#         }
#     )
# )

# rm_models.append(
#     RM_Estimator(
#         name='XGBoost (default)',
#         estimator=xgb.XGBRegressor(),
#         gs_param_grid={
#             'gamma' : [0], # (min_split_loss) minimum loss reduction
#             'learning_rate' : [0.3], # (eta) step size shrinkage
#             'max_depth' : [6], # maximum depth of tree
#             'n_estimators' : [500], 
#             'n_jobs' : [-2], # jobs in parallel -2 : all processors minus one
#             'random_state' : [127], 
#             'reg_alpha' : [0], # (alpha) L1 regularization
#             'reg_lambda' : [1] # (lambda) L2 regularization
#         }
#     )
# )

# XGBoost entry with both L1 and L2 penalties set to 1000; as with the other
# entry, each grid value is a single-element list (one parameter combination).
rm_models.append(
    RM_Estimator(
        name='XGBoost L1 y L2',
        estimator=xgb.XGBRegressor(),
        gs_param_grid={
            'gamma' : [1], 
            'learning_rate' : [0.2], 
            'max_depth' : [12], 
            'n_estimators' : [1000], 
            'n_jobs' : [-2], # jobs in parallel -2 : all processors minus one
            'random_state' : [127], 
            'reg_alpha' : [1000], # L1 regularization
            'reg_lambda' : [1000] # L2 regularization
        }
    )
)



---

In [13]:

# split the frame into the feature matrix and the target column
X_train = data.drop(columns=[target])
y_train = data[target].copy()

# standardize the features and wrap the result in a DataFrame so the column names are kept
# NOTE(review): the scaler is fitted on every training row before cross-validation, so each
# CV fold is scaled with statistics that include its own validation part — confirm acceptable.
scaler = StandardScaler().fit(X_train)
X_train = pd.DataFrame(
    data=scaler.transform(X_train), 
    columns=list(X_train.columns)
)



---

In [14]:
# ignore code
# (`if True` presumably serves as a manual on/off switch for this expensive cell)
if True :
    rm_columns = ['model', 'params', 'cv_score', 'RMSE']
    rm_result = pd.DataFrame(columns=rm_columns)

    rm_result = rm_evaluate(rm_result, rm_models, X_train, y_train)

    # widen column display so the parameter dictionaries are not truncated
    pd.options.display.max_colwidth = 500 
    # NOTE(review): the sorted frame is not assigned, and an expression inside the
    # `if` block is not displayed either, so this line has no visible effect.
    rm_result.sort_values(by=['RMSE'], axis='index')

Gridsearch para Random Forest Regressor ...
Fitting 2 folds for each of 1 candidates, totalling 2 fits
Gridsearch para XGBoost L1 y L2 ...
Fitting 2 folds for each of 1 candidates, totalling 2 fits


---

In [15]:
rm_result

Unnamed: 0,model,params,cv_score,RMSE
0,Random Forest Regressor,"{'max_depth': 3, 'n_estimators': 500, 'n_jobs': -2, 'random_state': 127}",-26.519411,25.5614
1,XGBoost L1 y L2,"{'gamma': 1, 'learning_rate': 0.2, 'max_depth': 12, 'n_estimators': 1000, 'n_jobs': -2, 'random_state': 127, 'reg_alpha': 1000, 'reg_lambda': 1000}",-25.481842,14.9075


In [19]:
# print the full parameter set of each best estimator (gs_estimator is set by rm_evaluate)
for rm in rm_models :
    print(rm.name, rm.gs_estimator.get_params())

Random Forest Regressor {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': 3, 'max_features': 'auto', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 500, 'n_jobs': -2, 'oob_score': False, 'random_state': 127, 'verbose': 0, 'warm_start': False}
XGBoost L1 y L2 {'objective': 'reg:squarederror', 'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'enable_categorical': False, 'gamma': 1, 'gpu_id': -1, 'importance_type': None, 'interaction_constraints': '', 'learning_rate': 0.2, 'max_delta_step': 0, 'max_depth': 12, 'min_child_weight': 1, 'missing': nan, 'monotone_constraints': '()', 'n_estimators': 1000, 'n_jobs': -2, 'num_parallel_tree': 1, 'predictor': 'auto', 'random_state': 127, 'reg_alpha': 1000, 'reg_lambda': 1000, 'scale_pos_weight': 1, 'subsample': 1, 'tre

In [None]:
# model = xgb.XGBRegressor()

# model_params = {
#     'gamma': 1, 
#     'learning_rate': 0.2, 
#     'max_depth': 12, 
#     'n_estimators': 1000, 
#     'n_jobs': -2, 
#     'random_state': 127, 
#     'reg_alpha': 1000, 
#     'reg_lambda': 1000
# }

# model.set_params(**model_params)

# model.fit(X_train, y_train)

In [128]:
# positional feature names ('0', '1', ...), one per feature reported by the first tuned model
data_columns = [str(c) for c in range(rm_models[0].gs_estimator.n_features_in_) ]

print(data_columns)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '15

In [138]:
# list the display names of the registered models
for rm in rm_models :
    print(rm.name)


Random Forest Regressor
XGBoost L1 y L2


In [272]:


def get_feature_importance(rm_models, data_columns) :
    u''' Average the feature importances of the given models and rank the features.

    Parameters
    ----------
    rm_models : iterable of RM_Estimator
        Each item must expose ``gs_estimator.feature_importances_`` with one
        value per entry of ``data_columns``.
    data_columns : iterable
        Feature names, in the order used to train the models.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns 'feature', 'rate' (mean importance
        across the models) and 'rank' (0 = highest rate), sorted by 'rate' in
        descending order. With no models, every rate is NaN.

    Raises
    ------
    ValueError
        If a model reports a different number of importances than there are
        feature names.
    '''

    features = list(data_columns)

    # One row per model, one column per feature. Built in a single step:
    # DataFrame.append was removed in pandas 2.0, and keeping the frame purely
    # numeric avoids averaging over a string 'model' column.
    per_model = pd.DataFrame(
        [list(rm.gs_estimator.feature_importances_) for rm in rm_models],
        columns=features,
        dtype=float
    )

    mean_rate = per_model.mean(axis=0)

    result = pd.DataFrame({'feature' : features, 'rate' : mean_rate.to_numpy()})
    result = result.sort_values(by='rate', ascending=False)

    # after sorting, the renumbered index is the rank of each feature
    result = result.reset_index(drop=True)
    result['rank'] = result.index

    return result


In [271]:
# short aliases for the model display names
# NOTE(review): the 'Z' entry looks like a placeholder and no visible cell reads
# this dictionary — confirm it is still needed.
rm_alias = {
    'Random Forest Regressor' : 'RFR', 
    'XGBoost L1 y L2' : 'XGB', 
    'Z' : 'Z'
}

In [278]:
def show_feature_importance(rm_models, data_columns) :
    u''' Combine the overall feature ranking with the ranking of every single model.

    The table starts from the ranking computed over all models together. For
    each model, that model's own 'rate' and 'rank' columns are then joined on
    'feature', with the model name appended to the column names
    (e.g. 'rate <name>', 'rank <name>').
    '''
    wanted = ['feature', 'rate', 'rank']

    combined = get_feature_importance(rm_models, data_columns)

    for model in rm_models :
        own_ranking = get_feature_importance([model], data_columns)[wanted]
        combined = combined.merge(
            right=own_ranking,
            on='feature',
            how='inner',
            suffixes=(None, ' ' + model.name)
        )

    return combined

# display the combined ranking, using the real feature names of the training matrix
show_feature_importance(rm_models=rm_models, data_columns=X_train.columns)

#


Unnamed: 0,feature,rate,rank,rate Random Forest Regressor,rank Random Forest Regressor,rate XGBoost L1 y L2,rank XGBoost L1 y L2
0,timelength_post_BT,0.177859,0,0.345726,0,0.009992,11
1,std_nc_diff_24_48,0.096395,1,0.039033,7,0.153757,1
2,nc_24_before_BT,0.084905,2,0.159542,1,0.010268,9
3,std_nc_24_before_BT,0.083842,3,0.044640,6,0.123043,2
4,median_nc_24_before_BT,0.083087,4,0.008712,16,0.157462,0
...,...,...,...,...,...,...,...
275,fw_47,0.000000,275,0.000000,258,0.000000,249
276,min_nl_between_24_48,0.000000,276,0.000000,198,0.000000,252
277,fw_87,0.000000,277,0.000000,239,0.000000,235
278,median_nl_between_24_48,0.000000,278,0.000000,200,0.000000,271
