In [1]:
# essentials
import pandas as pd
import numpy as np

# suppress warning messages
import warnings
warnings.filterwarnings('ignore')

# to draw some graphs
import seaborn as sns
import matplotlib.pyplot as plt

# Apply seaborn's default theme, then keep a snapshot of the plotting-context
# parameters that are active at this point.
sns.set_theme()
_sns_plotting_contex_ = sns.plotting_context()
# NOTE(review): called bare (not as a context manager and not via
# sns.set_context), this call appears to only return the 'poster' parameters
# without applying them -- confirm whether sns.set_context('poster') was meant.
sns.plotting_context('poster')

# Global font sizes for axes titles and axis labels.
# plt.style.use('classic')
plt.rcParams.update({
    'axes.titlesize': 18,
    'axes.labelsize': 14,
})

# to render HTML snippets through IPython.display
from IPython.display import HTML



In [2]:

import os
import time

from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

import xgboost as xgb

from sklearn.metrics import mean_squared_error


In [3]:
def ROUND(v):
    """Round ``v`` to 4 decimal places with the built-in ``round``.

    Defined with ``def`` rather than as a lambda bound to a name, so the
    function carries a real ``__name__`` in tracebacks and reprs.
    """
    return round(v, 4)

### Data

In [4]:
# Read the header-less training CSV and discard fully duplicated rows.
data_raw = (
    pd.read_csv("./data/blogData_train.csv", header=None)
    .drop_duplicates()
)

data_raw.shape

(49203, 281)

In [5]:

filepath = './data/test/'
# Regular files only; sorted because os.listdir() returns entries in an
# unspecified order and the row order of test_raw should be reproducible.
filelist = sorted(
    path
    for path in (os.path.join(filepath, filename) for filename in os.listdir(filepath))
    if os.path.isfile(path)
)

# Read each header-less CSV, dropping duplicate rows within that file, and
# concatenate once at the end. DataFrame.append was removed in pandas 2.0 and
# copied the whole accumulated frame on every iteration.
test_frames = [
    pd.read_csv(filename, header=None).drop_duplicates()
    for filename in filelist
]
# pd.concat raises on an empty list, so keep the empty-frame result for an
# empty directory.
test_raw = pd.concat(test_frames) if test_frames else pd.DataFrame()

test_raw.shape

(7036, 281)

In [6]:
# Predictors: the first 280 columns. Target: the last column.
X_train, y_train = data_raw.iloc[:, :280], data_raw.iloc[:, -1]

In [7]:
# Same column layout as the training split: first 280 columns vs. last column.
X_test, y_test = test_raw.iloc[:, :280], test_raw.iloc[:, -1]


In [8]:
# Fit the standardisation on the training features only.
scaler = StandardScaler().fit(X_train)

# Both splits are transformed with the scaler fitted on the training data.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [11]:

def proof_of_concept():
    """Fit several baseline regressors and display their fit time and RMSE.

    Every estimator is built with library defaults (plus ``n_jobs=-1`` where
    it is passed below), fitted on the notebook-level ``X_train`` /
    ``y_train`` and scored on both ``X_train`` / ``y_train`` and
    ``X_test`` / ``y_test``. A progress line is printed per model and a
    summary table with the columns ``model``, ``fit time``, ``train RMSE``
    and ``test RMSE`` is displayed at the end. All figures are rounded with
    ``ROUND``.

    Returns:
        None. The summary table is displayed, not returned.
    """
    # Imported explicitly so the function does not depend on the ``display``
    # name being injected into the namespace by the notebook front end.
    from IPython.display import display

    # (label, estimator) pairs, evaluated in this order.
    models = [
        ('Linear Regression', LinearRegression(n_jobs=-1)),  # use all processors
        ('Random Forest Regressor', RandomForestRegressor(n_jobs=-1)),  # use all processors
        ('Gradient Boosting Regressor', GradientBoostingRegressor()),
        ('XGBoost Regressor', xgb.XGBRegressor(n_jobs=-1)),  # use all processors
    ]

    columns = ['model', 'fit time', 'train RMSE', 'test RMSE']

    # Collect one row per model and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0.
    records = []

    for name, estimator in models:

        print('Prueba de concepto para', name, '... ', end='')
        # perf_counter is a monotonic clock intended for measuring intervals.
        started = time.perf_counter()
        estimator.fit(X_train, y_train)
        fit_time = ROUND(time.perf_counter() - started)
        print(fit_time, 'segundos')

        y_train_pred = estimator.predict(X_train)
        train_rmse = ROUND(np.sqrt(mean_squared_error(y_train, y_train_pred)))
        y_test_pred = estimator.predict(X_test)
        test_rmse = ROUND(np.sqrt(mean_squared_error(y_test, y_test_pred)))

        records.append({
            'model': name,
            'fit time': fit_time,
            'train RMSE': train_rmse,
            'test RMSE': test_rmse,
        })

    poc_results = pd.DataFrame(records, columns=columns)

    # pd.options.display.max_colwidth = 500
    display(poc_results)
    return

# uncomment to see proof of concept
# proof_of_concept()

POC for Linear Regression ... 0.4779 segundos
POC for Random Forest Regressor ... 40.4341 segundos
POC for Gradient Boosting Regressor ... 29.9809 segundos
POC for XGBoost Regressor ... 6.4282 segundos


Unnamed: 0,model,fit time,train RMSE,test RMSE
0,Linear Regression,0.4779,30.6107,26.3417
1,Random Forest Regressor,40.4341,10.1301,23.6
2,Gradient Boosting Regressor,29.9809,21.5453,23.0177
3,XGBoost Regressor,6.4282,11.1912,24.0768


---