In [1]:
# essential imports
import pandas as pd
import numpy as np

# suppress warning messages (note: this also hides deprecation warnings)
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# apply seaborn's default theme
sns.set_theme()
# remember the plotting-context parameters that are active after set_theme()
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): outside a `with` block this call only *returns* the 'poster'
# parameters and changes nothing. Use sns.set_context('poster') if the poster
# scaling is really wanted; kept as-is so existing figures keep their look.
sns.plotting_context('poster')

# font sizes for axes titles and labels. plt.rcParams is the same object that
# was previously reached through seaborn's internal `sns.mpl` alias.
# plt.style.use('classic')
plt.rcParams['axes.titlesize'] = 18
plt.rcParams['axes.labelsize'] = 14

# to render raw HTML through IPython.display
from IPython.display import HTML

import os


In [2]:
def read_blogData_train() :
    u''' Read and prepare the blog feedback training set.

    Loads ``./data/blogData_train.csv`` (no header row), drops duplicated
    rows, renumbers the index from zero and names the columns with the values
    found in the first column of ``./data/blogData_label.csv``.

    Returns
    -------
    pandas.DataFrame
        The de-duplicated training data with labelled columns.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of data columns.
    '''
    data = (
        pd.read_csv("./data/blogData_train.csv", header=None)
        .drop_duplicates()
        .reset_index(drop=True)
    )

    labels = pd.read_csv("./data/blogData_label.csv", header=None)
    header = list(labels[0])

    # ValueError is still an Exception subclass, so callers that catch
    # Exception keep working while getting a more specific error type.
    if len(header) != data.shape[1] :
        raise ValueError('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header

    return data


In [3]:
def blogData_labels(data) :
    u''' Build a dictionary mapping group names to lists of column names.

    Every group is a positional slice of ``data.columns``; some groups
    overlap (for instance ``'nc'`` spans the five single-column ``nc_*``
    groups). The keys are inserted in the order listed below.

    Parameters
    ----------
    data : object with a ``columns`` attribute (e.g. pandas.DataFrame)
        Source of the column names; 281 columns are needed for every group
        to be fully populated.

    Returns
    -------
    dict
        ``{group name: list of column names}``.
    '''
    names = list(data.columns)

    # (group key, slice start, slice stop), in insertion order
    groups = (
        ('sd_nc_total_before_BT', 0, 5),
        ('sd_nc_24_before_BT', 5, 10),
        ('sd_nc_between_24_48', 10, 15),
        ('sd_nc_first_24_BT', 15, 20),
        ('sd_nc_diff_24_48', 20, 25),

        ('sd_nl_total_before_BT', 25, 30),
        ('sd_nl_24_before_BT', 30, 35),
        ('sd_nl_between_24_48', 35, 40),
        ('sd_nl_first_24_BT', 40, 45),
        ('sd_nl_diff_24_48', 45, 50),

        ('nc_total_before_BT', 50, 51),
        ('nc_24_before_BT', 51, 52),
        ('nc_between_24_48', 52, 53),
        ('nc_first_24_BT', 53, 54),
        ('nc_diff_24_48', 54, 55),

        ('nl_total_before_BT', 55, 56),
        ('nl_24_before_BT', 56, 57),
        ('nl_between_24_48', 57, 58),
        ('nl_first_24_BT', 58, 59),
        ('nl_diff_24_48', 59, 60),

        ('nc', 50, 55),
        ('nl', 55, 60),

        ('timelength_post_BT', 60, 61),
        ('length_post', 61, 62),

        ('tl_post', 60, 62),

        ('frequent_word', 62, 262),

        ('weekday_BT', 262, 269),
        ('weekday_post', 269, 276),

        ('parents', 276, 280),
        ('comments', 280, 281),
    )

    return {key: names[start:stop] for key, start, stop in groups}


---

In [4]:
def ROUND(v, ndigits=4) :
    u''' Round ``v`` with the built-in ``round`` (4 decimal places by default). '''
    return round(v, ndigits)

---


In [5]:
# to scale data
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

import time

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb

from sklearn.metrics import mean_squared_error


---

In [6]:

# Alternative multi-class binnings, kept for reference:
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else (2 if v < 150 else (3 if v < 210 else 4)))
# to_classes = lambda v : 0 if v < 30 else (1 if v < 90 else 2)
def to_classes(v, threshold=30) :
    u''' Map a numeric value to a binary class.

    Returns 0 when ``v`` is strictly below ``threshold`` and 1 otherwise.
    The default threshold of 30 reproduces the previous lambda.
    '''
    return 0 if v < threshold else 1


In [7]:
    # load the de-duplicated, labelled training data
    data_raw = read_blogData_train()
    # test_raw = blogData_test_read()

    # target: the last column, mapped to classes by to_classes
    y_train = data_raw.iloc[:, -1].copy().apply(to_classes)

    # predictors: the first 280 columns
    X_train = data_raw.iloc[:, :280]

    # (disabled) test split and reduced feature set
    # X_test = test_raw.iloc[:,0:280]
    # y_test = test_raw.iloc[:,-1].copy()

    # X_train = X_train.iloc[:, 0:62].copy()
    # X_test = X_test.iloc[:, 0:62].copy()

    # standardise the predictors and put the column labels back on the result
    scaler = StandardScaler()
    scaler.fit(X_train)
    # X_train = scaler.transform(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=list(X_train.columns))

    # X_test = scaler.transform(X_test)
# ---


In [8]:
# display the standardised feature matrix (the rendering below is truncated)
X_train

Unnamed: 0,media_nc_total_before_BT,std_nc_total_before_BT,min_nc_total_before_BT,max_nc_total_before_BT,median_nc_total_before_BT,media_nc_24_before_BT,std_nc_24_before_BT,min_nc_24_before_BT,max_nc_24_before_BT,median_nc_24_before_BT,...,tuesday_post,wednesday_post,thursday_post,friday_post,saturday_post,sunday_post,parents,min_parents,max_parents,media_parents
0,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
1,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,2.210482,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
2,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
3,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,2.210482,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
4,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49198,-0.063236,-0.717392,4.623013,-0.661320,0.126202,-0.115468,-0.297055,-0.021364,-0.666928,-0.234212,...,-0.445822,-0.452390,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
49199,-0.063236,-0.717392,4.623013,-0.661320,0.126202,-0.115468,-0.297055,-0.021364,-0.666928,-0.234212,...,-0.445822,-0.452390,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
49200,-0.473073,-0.717392,-0.052689,-0.736197,-0.337857,-0.449217,-0.693280,-0.021364,-0.767788,-0.234212,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
49201,-0.473073,-0.717392,-0.052689,-0.736197,-0.337857,-0.449217,-0.693280,-0.021364,-0.767788,-0.234212,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047


---

In [9]:
if True :
# ---    
    class Eval_Estimator :
        u''' Bundle an estimator with a display name and a parameter dictionary.

        The constructor only stores its arguments as attributes of the same
        name; nothing is validated and the estimator is not touched.
        '''

        def __init__(self, name, estimator, params=None) :
            # name      : label used when reporting on the model
            # estimator : the estimator object itself
            # params    : optional dict of parameters (None when not given)
            self.name, self.estimator, self.params = name, estimator, params
# ---
    model = Eval_Estimator(
        name='XGBoost Classifier', 
        estimator=xgb.XGBClassifier(), 
        params={
            'eval_metric' : 'auc', # 
            'gamma' : 0, # (min_split_loss) minimum loss reduction
            'learning_rate' : 0.0001, # (eta) step size shrinkage
            'max_delta_step' : 1e6, # extremely imbalanced
            'max_depth' : 20, # maximum depth of tree
            'n_estimators' : 500, 
            'n_jobs' : -1, # use all processors
            'objective' : 'binary:logistic', # for binary classification 
            'random_state' : 127, 
            'verbosity' : 0, 
        }
    )
# ---
    model.estimator.set_params(**model.params)
    print('Entrenando modelo', model.name, '...')
    print(model.estimator.get_params())
    model.estimator.fit(X_train, y_train)
# ---

Entrenando modelo XGBoost Classifier ...
{'objective': 'binary:logistic', 'use_label_encoder': True, 'base_score': None, 'booster': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': None, 'enable_categorical': False, 'gamma': 0, 'gpu_id': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.0001, 'max_delta_step': 1000000.0, 'max_depth': 20, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'n_estimators': 500, 'n_jobs': -1, 'num_parallel_tree': None, 'predictor': None, 'random_state': 127, 'reg_alpha': None, 'reg_lambda': None, 'scale_pos_weight': None, 'subsample': None, 'tree_method': None, 'validate_parameters': None, 'verbosity': 0, 'eval_metric': 'auc'}


In [10]:
# NOTE(review): same display as the earlier X_train cell; presumably a check
# that training left the features unchanged — consider removing or using .head()
X_train

Unnamed: 0,media_nc_total_before_BT,std_nc_total_before_BT,min_nc_total_before_BT,max_nc_total_before_BT,median_nc_total_before_BT,media_nc_24_before_BT,std_nc_24_before_BT,min_nc_24_before_BT,max_nc_24_before_BT,median_nc_24_before_BT,...,tuesday_post,wednesday_post,thursday_post,friday_post,saturday_post,sunday_post,parents,min_parents,max_parents,media_parents
0,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
1,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,2.210482,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
2,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
3,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,2.210482,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
4,0.027483,0.139747,-0.052689,0.173680,-0.126921,0.021799,0.133024,-0.021364,0.384456,-0.111373,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49198,-0.063236,-0.717392,4.623013,-0.661320,0.126202,-0.115468,-0.297055,-0.021364,-0.666928,-0.234212,...,-0.445822,-0.452390,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
49199,-0.063236,-0.717392,4.623013,-0.661320,0.126202,-0.115468,-0.297055,-0.021364,-0.666928,-0.234212,...,-0.445822,-0.452390,-0.440406,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
49200,-0.473073,-0.717392,-0.052689,-0.736197,-0.337857,-0.449217,-0.693280,-0.021364,-0.767788,-0.234212,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047
49201,-0.473073,-0.717392,-0.052689,-0.736197,-0.337857,-0.449217,-0.693280,-0.021364,-0.767788,-0.234212,...,-0.445822,-0.452390,2.270630,-0.428341,-0.326467,-0.317238,-0.085526,0.0,-0.046617,-0.039047


---

In [11]:


def get_feature_importance(models, data_columns) :
    u''' Average the feature importances of several fitted models and rank them.

    Parameters
    ----------
    models : iterable
        Objects exposing ``estimator.feature_importances_`` with one value
        per entry of ``data_columns`` (e.g. ``Eval_Estimator`` instances).
    data_columns : iterable of str
        Feature names, in the same order as the importances.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``rate`` (mean importance over ``models``) and
        ``rank`` (0 = highest rate), sorted by decreasing ``rate``.

    Raises
    ------
    ValueError
        If a model provides a number of importances that does not match
        ``data_columns``.
    '''
    feature_names = list(data_columns)

    # Build the numeric table in one step. The former row-by-row
    # DataFrame.append is gone in pandas 2.0, and averaging a frame that also
    # holds the model-name strings no longer works there either.
    importances = pd.DataFrame(
        [list(m.estimator.feature_importances_) for m in models],
        columns=feature_names,
        dtype=float,
    )

    result = (
        importances.mean()
        .rename('rate')
        .rename_axis('feature')
        .reset_index()
        .sort_values(by='rate', ascending=False)
        .reset_index(drop=True)
    )
    # after the final reset_index the position in the frame is the rank
    result['rank'] = result.index

    return result


In [12]:
def show_feature_importance(models, data_columns) :
    u''' Tabulate the global feature importance and, for several models, each model's own.

    Parameters
    ----------
    models : sequence
        Objects with ``name`` and ``estimator.feature_importances_``.
    data_columns : iterable of str
        Feature names, in the same order as the importances.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``rate global`` and ``rank global``; when more
        than one model is given, also ``rate <name>`` and ``rank <name>`` for
        every model.
    '''
    result = get_feature_importance(models, data_columns).rename(
        columns={'rate' : 'rate global', 'rank' : 'rank global'}
    )

    if len(models) > 1 :
        for m in models :
            rate_col = 'rate ' + m.name
            rank_col = 'rank ' + m.name
            # Rename before merging: merge suffixes only apply to overlapping
            # column names, which left the first model's columns as bare
            # 'rate' / 'rank' without its name.
            per_model = get_feature_importance([m], data_columns).rename(
                columns={'rate' : rate_col, 'rank' : rank_col}
            )
            result = result.merge(
                right=per_model[['feature', rate_col, rank_col]],
                on='feature',
                how='inner'
            )

    return result

# first 20 rows of the importance table (sorted by decreasing rate) for the trained model
show_feature_importance(models=[model], data_columns=X_train.columns)[:20]


Unnamed: 0,feature,rate global,rank global
0,nc_24_before_BT,0.122732,0
1,median_nc_total_before_BT,0.073042,1
2,media_nc_diff_24_48,0.066276,2
3,nc_diff_24_48,0.045481,3
4,media_nl_first_24_BT,0.04066,4
5,timelength_post_BT,0.035805,5
6,median_nc_first_24_BT,0.034947,6
7,media_nl_between_24_48,0.032491,7
8,std_nc_total_before_BT,0.025741,8
9,max_nc_total_before_BT,0.025512,9


In [14]:
import pickle

In [17]:
# file the model wrapper is pickled to and read back from (relative to the working directory)
pkl_file = 'Model.pkl'

In [18]:
# serialise the Eval_Estimator wrapper (and the estimator it holds) to disk
# NOTE(review): pickle stores the class by reference, so Eval_Estimator must be
# defined again before this file can be loaded in a fresh kernel.
with open(pkl_file, 'wb') as pkl_hand :
    pickle.dump(model, pkl_hand)

In [21]:
# read the wrapper back to check the round trip
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open(pkl_file, 'rb') as pkl_hand :
    other_model = pickle.load(pkl_hand)

In [22]:
# same importance table computed from the reloaded model, for comparison with the one above
show_feature_importance(models=[other_model], data_columns=X_train.columns)[:20]

Unnamed: 0,feature,rate global,rank global
0,nc_24_before_BT,0.122732,0
1,median_nc_total_before_BT,0.073042,1
2,media_nc_diff_24_48,0.066276,2
3,nc_diff_24_48,0.045481,3
4,media_nl_first_24_BT,0.04066,4
5,timelength_post_BT,0.035805,5
6,median_nc_first_24_BT,0.034947,6
7,media_nl_between_24_48,0.032491,7
8,std_nc_total_before_BT,0.025741,8
9,max_nc_total_before_BT,0.025512,9


In [59]:
# features_names = list(data_raw.columns)

In [69]:
# print('xxx = [')
# i = 0
# s = 5
# for j in range(5, 55, 5) :
#     for f in features_names[i:j] :
#         print("'" + f + "', ", end='')
#     i = j
#     print()
# #
# for j in range(50, 55) :
#     print("'" + features_names[j] + "', ", end='')
# print()
# #
# for j in range(55, 60) :
#     print("'" + features_names[j] + "', ", end='')
# print()
# #
# for j in range(60, 62) :
#     print("'" + features_names[j] + "', ", end='')
# print()
# #
# c = 0
# for j in range(62, 262) :
#     print("'" + features_names[j] + "', ", end='')
#     c += 1
#     if c % 10 == 0 :
#         print()
# #
# for j in range(262, 269) :
#     print("'" + features_names[j] + "', ", end='')
# print()
# for j in range(269, 276) :
#     print("'" + features_names[j] + "', ", end='')
# print()
# #
# for j in range(276, 280) :
#     print("'" + features_names[j] + "', ", end='')
# #
# print("'" + features_names[280] + "'")
# print(']')



In [67]:
def get_blogData_features_names() :
    u''' Return the 281 hard-wired column names, as a new list on every call.

    Layout, in order:
      * 50 names ``<stat>_<kind>_<window>`` for the kinds ``nc`` then ``nl``,
        five windows each, five statistics per window;
      * 10 names ``<kind>_<window>``;
      * ``timelength_post_BT`` and ``length_post``;
      * ``fw_1`` ... ``fw_200``;
      * seven ``<day>_BT`` names followed by seven ``<day>_post`` names;
      * ``parents``, ``min_parents``, ``max_parents``, ``media_parents``
        and ``comments``.
    '''
    stats = ('media', 'std', 'min', 'max', 'median')
    kinds = ('nc', 'nl')
    windows = ('total_before_BT', '24_before_BT', 'between_24_48', 'first_24_BT', 'diff_24_48')
    days = ('monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday')

    names = [
        stat + '_' + kind + '_' + window
        for kind in kinds
        for window in windows
        for stat in stats
    ]
    names += [kind + '_' + window for kind in kinds for window in windows]
    names += ['timelength_post_BT', 'length_post']
    names += ['fw_' + str(number) for number in range(1, 201)]
    names += [day + suffix for suffix in ('_BT', '_post') for day in days]
    names += ['parents', 'min_parents', 'max_parents', 'media_parents', 'comments']

    return names

In [68]:
# get_blogData_features_names()

['media_nc_total_before_BT',
 'std_nc_total_before_BT',
 'min_nc_total_before_BT',
 'max_nc_total_before_BT',
 'median_nc_total_before_BT',
 'media_nc_24_before_BT',
 'std_nc_24_before_BT',
 'min_nc_24_before_BT',
 'max_nc_24_before_BT',
 'median_nc_24_before_BT',
 'media_nc_between_24_48',
 'std_nc_between_24_48',
 'min_nc_between_24_48',
 'max_nc_between_24_48',
 'median_nc_between_24_48',
 'media_nc_first_24_BT',
 'std_nc_first_24_BT',
 'min_nc_first_24_BT',
 'max_nc_first_24_BT',
 'median_nc_first_24_BT',
 'media_nc_diff_24_48',
 'std_nc_diff_24_48',
 'min_nc_diff_24_48',
 'max_nc_diff_24_48',
 'median_nc_diff_24_48',
 'media_nl_total_before_BT',
 'std_nl_total_before_BT',
 'min_nl_total_before_BT',
 'max_nl_total_before_BT',
 'median_nl_total_before_BT',
 'media_nl_24_before_BT',
 'std_nl_24_before_BT',
 'min_nl_24_before_BT',
 'max_nl_24_before_BT',
 'median_nl_24_before_BT',
 'media_nl_between_24_48',
 'std_nl_between_24_48',
 'min_nl_between_24_48',
 'max_nl_between_24_48',
 'm