In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.display import display

from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

warnings.filterwarnings('ignore')#to filter all the warnings
pd.set_option('float_format', '{:.4f}'.format)# to keep the float values short

## Carga del dataset

In [2]:
df = pd.read_csv('../data/model_data/youtube_num.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   category_id          40949 non-null  int64
 1   views                40949 non-null  int64
 2   likes                40949 non-null  int64
 3   dislikes             40949 non-null  int64
 4   comment_count        40949 non-null  int64
 5   total_trending_days  40949 non-null  int64
 6   published_year       40949 non-null  int64
 7   published_month      40949 non-null  int64
 8   published_week       40949 non-null  int64
 9   published_day        40949 non-null  int64
 10  published_hour       40949 non-null  int64
 11  published_minute     40949 non-null  int64
dtypes: int64(12)
memory usage: 3.7 MB


In [3]:
df.head(3)

Unnamed: 0,category_id,views,likes,dislikes,comment_count,total_trending_days,published_year,published_month,published_week,published_day,published_hour,published_minute
0,24,475965,6531,172,271,6,2018,6,23,4,13,0
1,24,605506,7848,232,354,6,2018,6,23,4,13,0
2,24,705986,8930,277,371,6,2018,6,23,4,13,0


### ¿Es posible predecir cuantos likes o visitas tendrá un video? Si es así, crea un modelo que lo compruebe.

* Columna objetivo `(target)` --> Views

In [36]:
ct = make_column_transformer(
    (StandardScaler(),
    make_column_selector(dtype_include=np.number))
)

In [38]:
X = df.copy()
y = X.pop('views')

In [39]:
X = ct.fit_transform(X)

In [33]:
y.describe()

count       40949.0000
mean      2360784.6383
std       7394113.7597
min           549.0000
25%        242329.0000
50%        681861.0000
75%       1823157.0000
max     225211923.0000
Name: views, dtype: float64

In [7]:
y.dtypes

dtype('int64')

## Realizamos una primera prueba

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [42]:
model_linear = LinearRegression()

In [43]:
model_linear.fit(X_train, y_train)
model_linear.score(X_test, y_test)

0.7970677134378407

#### Metrics

Coefficient Determination

In [51]:
y_predict = model_linear.predict(X_test)
r2_score = r2_score(y_test, y_predict)
print("Coefficient of determination: %.2f" % r2_score)

Coefficient of determination: 0.80


In [46]:
df_result = pd.DataFrame()
df_result['y_true'] = y_test
df_result['y_predict'] = y_predict

In [47]:
df_result.sample(10)

Unnamed: 0,y_true,y_predict
252,14107720,10804167.6783
25111,142139,476389.5403
29791,219160,208687.6845
17639,392117,753919.8294
9937,2232546,2194022.4502
6225,24249,-600752.7399
353,2422729,7085996.8733
3758,376062,258153.9444
17690,8534145,4531383.1386
16345,24624,-518812.4777


## Probemos distintos modelos y seleccionemos el mejor

In [55]:
minMAD = 10000000
nfolds = 3
bestREG = ''

regs = [Lasso(), ElasticNet(), DecisionTreeRegressor(), GradientBoostingRegressor(), LinearRegression(),
        RandomForestRegressor()]

for reg in regs:
    kf = KFold(n_splits=nfolds,random_state=0,shuffle=True)
    mad = cross_val_score(reg, X, y,\
             cv=kf,scoring='neg_mean_absolute_error').mean()
    # need the lowest scoring for mad
    print (str(reg)[:25] + ' with mad= ' + str(mad) )
    if mad < minMAD:
        minMAD = mad
        bestREG = reg
        
print('***********************************************')
print ('Best Regressor is... ' + str(bestREG)[:25] )
print('**********************')
print ('With MAD Score ' + str(minMAD))

Lasso() with mad= -1251181.3493892644
ElasticNet() with mad= -1436958.5454328603
DecisionTreeRegressor() with mad= -361916.9140745744
GradientBoostingRegressor with mad= -802967.3440349827
LinearRegression() with mad= -1252355.3621205122
RandomForestRegressor() with mad= -293140.72074019857
***********************************************
Best Regressor is... ElasticNet()
**********************
With MAD Score -1436958.5454328603


El mejor modelo para nuestro set de datos es `ElasticNet` así que podremos 'tunearlo' a continuacion

In [56]:
parametersGrid = {"max_iter": [1, 5, 10],
                    "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                    "l1_ratio": np.arange(0.0, 1.0, 0.1)}

model_eNet = ElasticNet()
grid = GridSearchCV(model_eNet, parametersGrid, scoring='r2', cv=10)
grid.fit(X_train, y_train)
Y_pred = grid.predict(X_test)

In [60]:
grid.score(X_test, y_test)

0.7967743294716628

In [57]:
df_result_tun = pd.DataFrame()
df_result_tun['y_true'] = y_test
df_result_tun['y_predict'] = Y_pred

In [58]:
df_result_tun.sample(5)

Unnamed: 0,y_true,y_predict
16398,1420018,2965866.0893
19384,328056,609972.8619
5528,3174,-40527.9112
40126,608431,857850.1088
7901,344358,186994.3869


In [59]:
r2_score = r2_score(y_test, Y_pred)
print("Coefficient of determination: %.2f" % r2_score)

Coefficient of determination: 0.80
