In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.display import display

from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

warnings.filterwarnings('ignore')#to filter all the warnings
pd.set_option('float_format', '{:.4f}'.format)# to keep the float values short

## Carga del dataset

In [3]:
df = pd.read_csv('../data/model_data/youtube_num.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 13 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   category_id          40949 non-null  int64 
 1   views                40949 non-null  int64 
 2   likes                40949 non-null  int64 
 3   dislikes             40949 non-null  int64 
 4   comment_count        40949 non-null  int64 
 5   category_name        40949 non-null  object
 6   total_trending_days  40949 non-null  int64 
 7   published_year       40949 non-null  int64 
 8   published_month      40949 non-null  int64 
 9   published_week       40949 non-null  int64 
 10  published_day        40949 non-null  int64 
 11  published_hour       40949 non-null  int64 
 12  published_minute     40949 non-null  int64 
dtypes: int64(12), object(1)
memory usage: 4.1+ MB


In [4]:
dummies_category = pd.get_dummies(df['category_name'])
df.drop(['category_id', 'category_name'], axis=1, inplace=True)

In [6]:
df = pd.concat([df, dummies_category], axis=1)

In [7]:
df.head(3)

Unnamed: 0,views,likes,dislikes,comment_count,total_trending_days,published_year,published_month,published_week,published_day,published_hour,...,Howto & Style,Music,News & Politics,Nonprofits & Activism,People & Blogs,Pets & Animals,Science & Technology,Shows,Sports,Travel & Events
0,81377,655,25,177,1,2017,11,46,13,2,...,0,0,0,0,0,0,0,0,1,0
1,288922,7515,792,2111,1,2017,11,45,12,18,...,0,0,0,0,0,0,0,0,0,0
2,34785,308,26,413,1,2017,11,45,12,21,...,0,0,1,0,0,0,0,0,0,0


### ¿Es posible predecir cuantos likes o visitas tendrá un video? Si es así, crea un modelo que lo compruebe.

* Columna objetivo `(target)` --> Views

In [8]:
ct = make_column_transformer(
    (StandardScaler(),
    make_column_selector(dtype_include=np.number))
)

In [9]:
X = df.copy()
y = X.pop('views')

In [10]:
X = ct.fit_transform(X)

In [11]:
y.describe()

count       40949.0000
mean      2360784.6383
std       7394113.7597
min           549.0000
25%        242329.0000
50%        681861.0000
75%       1823157.0000
max     225211923.0000
Name: views, dtype: float64

In [12]:
y.dtypes

dtype('int64')

## Realizamos una primera prueba

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
model_linear = LinearRegression()

In [15]:
model_linear.fit(X_train, y_train)
model_linear.score(X_test, y_test)

0.7779087277386881

#### Metrics

Coefficient Determination

In [16]:
y_predict = model_linear.predict(X_test)
r2_score = r2_score(y_test, y_predict)
print("Coefficient of determination: %.2f" % r2_score)

Coefficient of determination: 0.78


In [17]:
df_result = pd.DataFrame()
df_result['y_true'] = y_test
df_result['y_predict'] = y_predict

In [18]:
df_result.sample(10)

Unnamed: 0,y_true,y_predict
40122,5103644,5350950.7405
6428,393757,421622.7405
36359,4480775,2007334.7405
19638,71841,-331269.2595
17851,723523,880010.7405
5822,451967,263254.7405
24812,488438,1449702.7405
7090,36934,826694.7405
24598,478968,-230073.2595
17736,214015,-316809.2595


## Probemos distintos modelos y seleccionemos el mejor

In [19]:
minMAD = 10000000
nfolds = 3
bestREG = ''

regs = [Lasso(), ElasticNet(), DecisionTreeRegressor(), GradientBoostingRegressor(), LinearRegression(),
        RandomForestRegressor()]

for reg in regs:
    kf = KFold(n_splits=nfolds,random_state=0,shuffle=True)
    mad = cross_val_score(reg, X, y,\
             cv=kf,scoring='neg_mean_absolute_error').mean()
    # need the lowest scoring for mad
    print (str(reg)[:25] + ' with mad= ' + str(mad) )
    if mad < minMAD:
        minMAD = mad
        bestREG = reg
        
print('***********************************************')
print ('Best Regressor is... ' + str(bestREG)[:25] )
print('**********************')
print ('With MAD Score ' + str(minMAD))

Lasso() with mad= -1267701.4493909143
ElasticNet() with mad= -1444239.7623721883
DecisionTreeRegressor() with mad= -354131.8606972846
GradientBoostingRegressor with mad= -788349.6727779023
LinearRegression() with mad= -1268967.0010748308
RandomForestRegressor() with mad= -285551.64098683925
***********************************************
Best Regressor is... ElasticNet()
**********************
With MAD Score -1444239.7623721883


El mejor modelo para nuestro set de datos es `ElasticNet` así que podremos 'tunearlo' a continuacion

In [20]:
parametersGrid = {"max_iter": [1, 5, 10],
                    "alpha": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                    "l1_ratio": np.arange(0.0, 1.0, 0.1)}

model_eNet = ElasticNet()
grid = GridSearchCV(model_eNet, parametersGrid, scoring='r2', cv=10)
grid.fit(X_train, y_train)
Y_pred = grid.predict(X_test)

In [21]:
grid.score(X_test, y_test)

0.778973882807052

In [22]:
df_result_tun = pd.DataFrame()
df_result_tun['y_true'] = y_test
df_result_tun['y_predict'] = Y_pred

In [23]:
df_result_tun.sample(5)

Unnamed: 0,y_true,y_predict
8322,70041,-719183.9221
23630,1413179,2205657.3572
19719,2684509,1851043.2705
39528,2476738,4961335.7309
24597,471621,-162914.7994
