In [1]:
# essential imports
import pandas as pd
import numpy as np

# suppress warning messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# apply seaborn's default theme to seaborn and matplotlib
sns.set_theme()
# snapshot of the plotting-context parameters in effect right after set_theme()
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): per the seaborn docs, plotting_context() only RETURNS the
# parameter set (or acts as a context manager inside `with`); called as a bare
# statement it should leave the global context unchanged. If the 'poster'
# scaling was meant to be applied, sns.set_context('poster') is the call to
# use — confirm the intent before changing, since it rescales every figure.
sns.plotting_context('poster')

# override the axes title / axis label font sizes on matplotlib's rcParams
# (reached through seaborn's `mpl` alias)
# (disabled alternative) plt.style.use('classic')
sns.mpl.rcParams['axes.titlesize'] = 18
sns.mpl.rcParams['axes.labelsize'] = 14

# to render HTML markup through IPython.display
from IPython.display import HTML



### Data

In [2]:
def blogData_train_read(data_path="./data/blogData_train.csv",
                        label_path="./data/blogData_label.csv") :
    u''' Read and prepare the blog feedback training set.

    The data file is read without a header row, exact duplicate rows are
    dropped and the index is renumbered from 0. Column names are taken from
    the first column of the label file (also read without a header row).

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file holding the observations. Defaults to the location the
        notebook has always used.
    label_path : str or path-like, optional
        CSV file whose first column holds one column name per feature.

    Returns
    -------
    pandas.DataFrame
        De-duplicated observations with the names from the label file as columns.

    Raises
    ------
    ValueError
        If the number of names in the label file differs from the number of
        columns in the data file.
    '''

    # chained (non in-place) calls: the frame is built in a single expression
    data = (pd.read_csv(data_path, header=None)
              .drop_duplicates()
              .reset_index(drop=True))

    header = pd.read_csv(label_path, header=None)
    header = list(header[0])

    # refuse to label the frame when names and columns do not line up
    if len(header) != data.shape[1] :
        raise ValueError('Los encabezados y la cantidad de características NO COINCIDE !!!')

    data.columns = header

    return data


In [3]:
# load the de-duplicated training frame and show its (rows, columns) shape
data = blogData_train_read()
data.shape

(49203, 281)

In [4]:
# display the frame; the rendered output below is a truncated head/tail preview
data

Unnamed: 0,media_nc_total_before_BT,std_nc_total_before_BT,min_nc_total_before_BT,max_nc_total_before_BT,median_nc_total_before_BT,media_nc_24_before_BT,std_nc_24_before_BT,min_nc_24_before_BT,max_nc_24_before_BT,median_nc_24_before_BT,...,wednesday_post,thursday_post,friday_post,saturday_post,sunday_post,parents,min_parents,max_parents,media_parents,comments
0,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,27.0
3,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40.30467,53.845657,0.0,401.0,15.0,15.52416,32.441880,0.0,377.0,3.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49198,33.00000,0.000000,33.0,33.0,33.0,11.00000,15.556349,0.0,33.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49199,33.00000,0.000000,33.0,33.0,33.0,11.00000,15.556349,0.0,33.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49200,0.00000,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49201,0.00000,0.000000,0.0,0.0,0.0,0.00000,0.000000,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
def blogData_labels(data) :
    u''' Map group names to the column names found at fixed positions of ``data``.

    Every entry of the returned dictionary is the list of column names taken
    from a half-open positional range ``[start, stop)`` of ``data.columns``.
    The ``'nc'`` and ``'nl'`` groups overlap the five single-column ``nc_*``
    and ``nl_*`` groups (positions 50-54 and 55-59 respectively).

    Parameters
    ----------
    data : object exposing a ``columns`` iterable (e.g. a pandas DataFrame)

    Returns
    -------
    dict
        ``{group name: list of column names}``, in the order of the layout
        table below.
    '''
    names = list(data.columns)

    # (group key, start, stop): positional slices into ``names``; the order of
    # the rows is the insertion order of the resulting dictionary
    layout = (
        ('sd_nc_total_before_BT',   0,   5),
        ('sd_nc_24_before_BT',      5,  10),
        ('sd_nc_between_24_48',    10,  15),
        ('sd_nc_first_24_BT',      15,  20),
        ('sd_nc_diff_24_48',       20,  25),

        ('sd_nl_total_before_BT',  25,  30),
        ('sd_nl_24_before_BT',     30,  35),
        ('sd_nl_between_24_48',    35,  40),
        ('sd_nl_first_24_BT',      40,  45),
        ('sd_nl_diff_24_48',       45,  50),

        ('nc_total_before_BT',     50,  51),
        ('nc_24_before_BT',        51,  52),
        ('nc_between_24_48',       52,  53),
        ('nc_first_24_BT',         53,  54),
        ('nc_diff_24_48',          54,  55),

        ('nl_total_before_BT',     55,  56),
        ('nl_24_before_BT',        56,  57),
        ('nl_between_24_48',       57,  58),
        ('nl_first_24_BT',         58,  59),
        ('nl_diff_24_48',          59,  60),

        ('nc',                     50,  55),
        ('nl',                     55,  60),

        ('timelength_post_BT',     60,  61),
        ('length_post',            61,  62),

        ('frequent_word',          62, 262),

        ('weekday_BT',            262, 269),
        ('weekday_post',          269, 276),
        ('parents',               276, 280),
        ('comments',              280, 281),
    )

    return {key: names[start:stop] for key, start, stop in layout}


In [6]:

# column-name groups of the dataset, keyed by group name
labels = blogData_labels(data)
# name of the column used as the prediction target
target = 'comments'


---

In [7]:
def set_figure(row, col, suptitle=None) :
    u''' Create a matplotlib figure of the given size, optionally with a super title.

    Parameters
    ----------
    row : float
        First component of ``figsize``. Note that matplotlib reads ``figsize``
        as (width, height) in inches, so this is the figure WIDTH.
    col : float
        Second component of ``figsize``, i.e. the figure HEIGHT.
    suptitle : str, optional
        Text of the figure-level title; no title is set when it is None.

    Returns
    -------
    matplotlib.figure.Figure
        The newly created figure.
    '''
    fig = plt.figure(figsize=(row, col))
    # identity test against None: an empty-string title is still applied
    if suptitle is not None :
        fig.suptitle(suptitle,
                     verticalalignment='center', fontsize='xx-large', fontweight='extra bold')
    return fig

In [8]:
def show_corr(data, target) :
    u''' Draw an annotated heatmap of the pairwise correlations of ``data``.

    Rows and columns of the correlation matrix are ordered by decreasing
    correlation with ``target``. The upper triangle, diagonal included, is
    passed to seaborn as the heatmap ``mask``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``corr()`` matrix is plotted.
    target : label
        Column of ``data`` that drives the ordering.

    Returns
    -------
    None
    '''
    corr_matrix = data.corr()

    # strongest positive correlation with the target comes first
    order = corr_matrix[target].sort_values(ascending=False).keys()
    ordered = corr_matrix.loc[order, order]

    # flag the upper triangle (k=0, so the diagonal too) for masking
    hidden = np.zeros_like(ordered)
    hidden[np.triu_indices_from(hidden)] = True

    palette = sns.diverging_palette(h_neg=350, h_pos=150, center='light', as_cmap=True)
    sns.heatmap(data=ordered,
                cmap=palette,
                center=0, linewidths=1, annot=True, fmt=".3f", cbar=False, mask=hidden)

---

In [9]:

from sklearn.preprocessing import StandardScaler


In [10]:

# features: the column names grouped under 'nc' by blogData_labels
basic_features = labels['nc']

# design matrix and target vector taken from the training frame
X_train = data[basic_features]
y_train = data[target]

In [11]:

# standardise the training features; the fitted scaler stays available in
# `scaler`, and X_train is rebound to the transformed values
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)


In [12]:
def RMSE(v) :
    u''' Return the square root of ``v`` rounded to 4 decimal places. '''
    root = np.sqrt(v)
    return round(root, 4)

class Model :
    u''' Pair an estimator object with a display name. '''

    def __init__(self, name, model) :
        # plain attribute storage: nothing is validated or copied
        self.name, self.model = name, model


In [23]:

from sklearn.linear_model import LinearRegression

from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet

from sklearn.model_selection import StratifiedKFold

from sklearn.model_selection import GridSearchCV



In [44]:
# candidate values of the `alpha` hyper-parameter, used as a grid-search param_grid
param1 = {
    'alpha' : [1.0, 2.0, 2.5]
}

In [57]:

#   estimator_cross_val (model,estimator,pipe,matriz,rs,X,y):
def pepe(estimator, param_grid, X, y) :
    u''' Grid-search ``estimator`` over ``param_grid`` with 5-fold CV scored by RMSE.

    The search is refit on the whole of ``X``/``y`` with the best parameters,
    and the best estimator and its parameters are printed.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Model to tune.
    param_grid : dict or list of dict
        Grid handed to ``GridSearchCV``.
    X, y : array-like
        Training features and target.

    Returns
    -------
    tuple
        ``(rmse_mean, rmse_std, cv_results)``: the mean and the standard
        deviation of the cross-validated RMSE of the BEST parameter
        combination (the one that is printed), plus the complete
        ``cv_results_`` dictionary of the search.
    '''
    scoring = 'neg_root_mean_squared_error'

    # NOTE(review): StratifiedKFold stratifies on the values of `y`; for a
    # regression target a plain KFold is the usual splitter. Left unchanged
    # here because swapping it would alter the reported scores. TODO confirm.
    cv = StratifiedKFold(n_splits=5, random_state=11, shuffle=True)

    gs = GridSearchCV(estimator=estimator,
                      param_grid=param_grid,
                      scoring=scoring,
                      n_jobs=-2,
                      refit=True,
                      cv=cv,
                      return_train_score=False)
    gs.fit(X, y)

    # the scorer is the NEGATED RMSE (greater is better): flip the sign back
    RMSE_mean = -gs.cv_results_['mean_test_score']
    RMSE_std  =  gs.cv_results_['std_test_score']

    print(gs.best_estimator_)
    print(gs.best_params_)

    # report the scores of the winning candidate, not of whichever candidate
    # happens to come first in the grid
    best = gs.best_index_
    return RMSE_mean[best], RMSE_std[best], gs.cv_results_


In [58]:
# NOTE(review): the commented-out call below uses names that are not defined
# in the visible part of this notebook.
# matriz2 = estimator_cross_val('Linear Regression',LinearRegression(),estimator_scaler,matriz2,rs,blog_X_train,blog_y_train)

# grid-search a Lasso model over the `alpha` grid on the scaled features
pepe(Lasso(), param1, X_train, y_train)

Lasso()
{'alpha': 1.0}


(33.8832067793465,
 2.38097383660485,
 {'mean_fit_time': array([0.01753006, 0.01057119, 0.01017232]),
  'std_fit_time': array([0.00336199, 0.00325317, 0.00146608]),
  'mean_score_time': array([0.00099783, 0.00059819, 0.00099745]),
  'std_score_time': array([4.86280395e-07, 4.88422185e-04, 1.90734863e-07]),
  'param_alpha': masked_array(data=[1.0, 2.0, 2.5],
               mask=[False, False, False],
         fill_value='?',
              dtype=object),
  'params': [{'alpha': 1.0}, {'alpha': 2.0}, {'alpha': 2.5}],
  'split0_test_score': array([-36.1640048 , -36.26373983, -36.3241679 ]),
  'split1_test_score': array([-29.96398958, -30.07624851, -30.15113785]),
  'split2_test_score': array([-35.72797998, -35.80838915, -35.85883887]),
  'split3_test_score': array([-32.30381844, -32.23464304, -32.21265862]),
  'split4_test_score': array([-35.2562411 , -35.24021645, -35.24190924]),
  'mean_test_score': array([-33.88320678, -33.92464739, -33.9577425 ]),
  'std_test_score': array([2.38097384, 