# Práctica 2: Variaciones de la regresión

In [1]:
import numpy as np 
import pandas as pd
import re 

#Visualización
import cufflinks as cf
cf.go_offline()

#Import selectKBest

from sklearn.feature_selection import SelectKBest, f_classif, f_regression

#Escalamiento

from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

#Modeling

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge, Lars, ElasticNet, BayesianRidge, SGDRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, accuracy_score, roc_auc_score

## Data Load

In [2]:
#aarchivo txt

data = pd.read_csv('./_chat.txt', sep='\t', header=None)

In [3]:
#Organización del dataset

patron = r'\[\d{2}/\d{2}/\d{2}, \d{1,2}:\d{2}:\d{2}\u202f(a\.m\.|p\.m\.)\]'
def date(x):
    f = re.search(patron, x)
    if f:
        f = f.group()[1:-1]
        f = f.replace('\u202f', ' ')
        f = f.replace('a.m.', 'am').replace('p.m.', 'pm')
        return f
    else:
        return None
    
data['date-hour'] = data[0].apply(date)
data['date'] = data['date-hour'].apply(lambda x: x.split(',')[0])
data['hour'] = data['date-hour'].apply(lambda x: x.split(',')[1])
data['date'] = pd.to_datetime(data['date'], format='%d/%m/%y')
data['hour'] = data['hour'].apply(lambda x: x[1:])
data['hour'] = pd.to_datetime(data['hour'], format='%I:%M:%S %p').dt.time
data['person'] = data[0].apply(lambda x: x.split(']')[1].split(':')[0][1:])
data['mensaje'] = data[0].apply(lambda x: x.split(']')[1].split(':')[1][1:])

In [4]:
data.head(10)

Unnamed: 0,0,date-hour,date,hour,person,mensaje
0,"[20/08/24, 11:44:48 p.m.] ex: ‎Messages and ca...","20/08/24, 11:44:48 pm",2024-08-20,23:44:48,ex,‎Messages and calls are end-to-end encrypted. ...
1,"[20/08/24, 11:44:48 p.m.] ex: Hola Jos ¿cómo h...","20/08/24, 11:44:48 pm",2024-08-20,23:44:48,ex,Hola Jos ¿cómo has estado?
2,"[20/08/24, 11:44:54 p.m.] ex: perdón si te mol...","20/08/24, 11:44:54 pm",2024-08-20,23:44:54,ex,perdón si te molesto.
3,"[21/08/24, 8:08:56 a.m.] chema: holi, buenos días","21/08/24, 8:08:56 am",2024-08-21,08:08:56,chema,"holi, buenos días"
4,"[21/08/24, 8:09:27 a.m.] chema: disculpa q no ...","21/08/24, 8:09:27 am",2024-08-21,08:09:27,chema,"disculpa q no te contesté ayer, no vi la noti"
5,"[21/08/24, 8:10:09 a.m.] chema: pero nono, no ...","21/08/24, 8:10:09 am",2024-08-21,08:10:09,chema,"pero nono, no es molestia"
6,"[21/08/24, 8:10:39 a.m.] chema: y he medio ocu...","21/08/24, 8:10:39 am",2024-08-21,08:10:39,chema,y he medio ocupadoo jaja
7,"[21/08/24, 8:10:53 a.m.] chema: que tal tu","21/08/24, 8:10:53 am",2024-08-21,08:10:53,chema,que tal tu
8,"[21/08/24, 10:57:43 a.m.] ex: no te preocupes,...","21/08/24, 10:57:43 am",2024-08-21,10:57:43,ex,"no te preocupes, ya era algo tarde"
9,"[21/08/24, 10:58:00 a.m.] ex: mucho trabajo?","21/08/24, 10:58:00 am",2024-08-21,10:58:00,ex,mucho trabajo?


In [5]:
data = data.drop([0, 'date-hour'], axis=1)

In [6]:
#Generación de variables

#Sticker
data['sticker'] = data['mensaje'].apply(lambda x: 1 if 'sticker omitted' in x else 0)

#Mensaje Editado
data['m_edited'] = data['mensaje'].apply(lambda x: 1 if 'This message was edited' in x else 0)

#Separación date y hour
data['date-hour'] = data['date'].astype(str) + ' ' + data['hour'].astype(str)
data['date-hour'] = pd.to_datetime(data['date-hour'], format='%Y-%m-%d %H:%M:%S')

#Ordenación del dataset
data = data[['date', 'hour', 'date-hour', 'person', 'mensaje', 'sticker', 'm_edited']]

#Tiempos entre mensajes
data['tiempo_entre_msj'] = data['date-hour'].diff().shift(-1)
data['tiempo_entre_msj'] = data['tiempo_entre_msj'].shift(1)
data['tiempo_entre_msj'] = data['tiempo_entre_msj'].dt.total_seconds()
data['tiempo_entre_msj'] = data['tiempo_entre_msj']/3600
data['tiempo_entre_msj'] = data['tiempo_entre_msj'].fillna(0)

#Tiempo de respuesta
data['tiempo_respuesta'] = data['date-hour'].diff().where(data['person'] != data['person'].shift())
data['tiempo_respuesta'] = data['tiempo_respuesta'].dt.total_seconds()
data['tiempo_respuesta'] = data['tiempo_respuesta']/3600
data['tiempo_respuesta'] = data['tiempo_respuesta'].fillna(0)

#Palabras por mensaje

data['num_palabras'] = data['mensaje'].apply(lambda x: len(x.split()))

#Turno de la conversación

data['ID_conversacion'] = (data['person'] != data['person'].shift()).cumsum()

#Eliminación del primer registro y reset del index

data = data.drop(0)
data = data.reset_index(drop=True)

In [7]:
data.head(10)

Unnamed: 0,date,hour,date-hour,person,mensaje,sticker,m_edited,tiempo_entre_msj,tiempo_respuesta,num_palabras,ID_conversacion
0,2024-08-20,23:44:48,2024-08-20 23:44:48,ex,Hola Jos ¿cómo has estado?,0,0,0.0,0.0,5,1
1,2024-08-20,23:44:54,2024-08-20 23:44:54,ex,perdón si te molesto.,0,0,0.001667,0.0,4,1
2,2024-08-21,08:08:56,2024-08-21 08:08:56,chema,"holi, buenos días",0,0,8.400556,8.400556,3,2
3,2024-08-21,08:09:27,2024-08-21 08:09:27,chema,"disculpa q no te contesté ayer, no vi la noti",0,0,0.008611,0.0,10,2
4,2024-08-21,08:10:09,2024-08-21 08:10:09,chema,"pero nono, no es molestia",0,0,0.011667,0.0,5,2
5,2024-08-21,08:10:39,2024-08-21 08:10:39,chema,y he medio ocupadoo jaja,0,0,0.008333,0.0,5,2
6,2024-08-21,08:10:53,2024-08-21 08:10:53,chema,que tal tu,0,0,0.003889,0.0,3,2
7,2024-08-21,10:57:43,2024-08-21 10:57:43,ex,"no te preocupes, ya era algo tarde",0,0,2.780556,2.780556,7,3
8,2024-08-21,10:58:00,2024-08-21 10:58:00,ex,mucho trabajo?,0,0,0.004722,0.0,2,3
9,2024-08-21,10:58:13,2024-08-21 10:58:13,ex,"pues bien bien, igual ocupado",0,0,0.003611,0.0,5,3


## Definición dataset

In [8]:
dataset = data.copy()
dataset = dataset.groupby(['ID_conversacion', 'person']).agg({'mensaje': 'count', 'sticker':'sum','m_edited':'sum','num_palabras':'mean','tiempo_entre_msj': 'median', 'tiempo_respuesta': 'sum', 'date-hour':'min'}).reset_index()

In [9]:
dataset

Unnamed: 0,ID_conversacion,person,mensaje,sticker,m_edited,num_palabras,tiempo_entre_msj,tiempo_respuesta,date-hour
0,1,ex,2,0,0,4.500000,0.000833,0.000000,2024-08-20 23:44:48
1,2,chema,5,0,0,5.200000,0.008611,8.400556,2024-08-21 08:08:56
2,3,ex,3,0,0,4.666667,0.004722,2.780556,2024-08-21 10:57:43
3,4,chema,6,0,0,8.166667,0.010694,0.782500,2024-08-21 11:45:10
4,5,ex,4,0,0,7.500000,0.006111,0.826944,2024-08-21 12:57:47
...,...,...,...,...,...,...,...,...,...
556,557,ex,2,0,0,3.500000,1.017361,2.033333,2024-10-02 21:42:27
557,558,chema,3,2,0,1.666667,1.012500,1.012500,2024-10-02 22:43:17
558,559,ex,1,0,0,1.000000,1.301111,1.301111,2024-10-04 18:45:59
559,560,chema,2,1,0,2.500000,1.144861,2.289444,2024-10-04 21:03:21


In [10]:
#Asignación de ID
dataset['ID_conversacion'] = 'C' + dataset['ID_conversacion'].astype(str)

In [11]:
#Variables adicionales

#Día de la semana
dataset['dia_sem'] = dataset['date-hour'].dt.dayofweek+1

#Parte del día (madrugada, mañana, tarde, noche)
def part(x):
    if 0 <= x < 6:
        return 0 #madrugada
    elif 6 <= x < 12:
        return 1 #mañana
    elif 12 <= x < 19:
        return 2 #tarde
    else:
        return 3 #noche
dataset['parte_dia'] = dataset['date-hour'].dt.hour.apply(part)
dataset['date'] = dataset['date-hour'].dt.date
dataset['hora'] = dataset['date-hour'].dt.time

In [12]:
dataset

Unnamed: 0,ID_conversacion,person,mensaje,sticker,m_edited,num_palabras,tiempo_entre_msj,tiempo_respuesta,date-hour,dia_sem,parte_dia,date,hora
0,C1,ex,2,0,0,4.500000,0.000833,0.000000,2024-08-20 23:44:48,2,3,2024-08-20,23:44:48
1,C2,chema,5,0,0,5.200000,0.008611,8.400556,2024-08-21 08:08:56,3,1,2024-08-21,08:08:56
2,C3,ex,3,0,0,4.666667,0.004722,2.780556,2024-08-21 10:57:43,3,1,2024-08-21,10:57:43
3,C4,chema,6,0,0,8.166667,0.010694,0.782500,2024-08-21 11:45:10,3,1,2024-08-21,11:45:10
4,C5,ex,4,0,0,7.500000,0.006111,0.826944,2024-08-21 12:57:47,3,2,2024-08-21,12:57:47
...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,C557,ex,2,0,0,3.500000,1.017361,2.033333,2024-10-02 21:42:27,3,3,2024-10-02,21:42:27
557,C558,chema,3,2,0,1.666667,1.012500,1.012500,2024-10-02 22:43:17,3,3,2024-10-02,22:43:17
558,C559,ex,1,0,0,1.000000,1.301111,1.301111,2024-10-04 18:45:59,5,2,2024-10-04,18:45:59
559,C560,chema,2,1,0,2.500000,1.144861,2.289444,2024-10-04 21:03:21,5,3,2024-10-04,21:03:21


In [13]:
#Ordenación de columnas
dataset = dataset[['ID_conversacion', 'date-hour', 'person', 'mensaje', 'sticker','m_edited','num_palabras','tiempo_entre_msj', 'tiempo_respuesta', 'dia_sem', 'parte_dia']]
dataset.columns = ['ID_conversacion', 'date-hour', 'person', 'cantidad_msjs', 'cantidad_stickers','msjs_editados','prom_msjs','tiempo_entre_msj', 'tiempo_respuesta', 'dia_sem', 'parte_dia']

In [14]:
dataset

Unnamed: 0,ID_conversacion,date-hour,person,cantidad_msjs,cantidad_stickers,msjs_editados,prom_msjs,tiempo_entre_msj,tiempo_respuesta,dia_sem,parte_dia
0,C1,2024-08-20 23:44:48,ex,2,0,0,4.500000,0.000833,0.000000,2,3
1,C2,2024-08-21 08:08:56,chema,5,0,0,5.200000,0.008611,8.400556,3,1
2,C3,2024-08-21 10:57:43,ex,3,0,0,4.666667,0.004722,2.780556,3,1
3,C4,2024-08-21 11:45:10,chema,6,0,0,8.166667,0.010694,0.782500,3,1
4,C5,2024-08-21 12:57:47,ex,4,0,0,7.500000,0.006111,0.826944,3,2
...,...,...,...,...,...,...,...,...,...,...,...
556,C557,2024-10-02 21:42:27,ex,2,0,0,3.500000,1.017361,2.033333,3,3
557,C558,2024-10-02 22:43:17,chema,3,2,0,1.666667,1.012500,1.012500,3,3
558,C559,2024-10-04 18:45:59,ex,1,0,0,1.000000,1.301111,1.301111,5,2
559,C560,2024-10-04 21:03:21,chema,2,1,0,2.500000,1.144861,2.289444,5,3


## EDA

In [15]:
#Just for plotting

dplot = dataset.copy()
dplot['date'] = dplot['date-hour'].dt.date
dplot['hour'] = dplot['date-hour'].dt.time

In [16]:
#plot date-cantidad de mensajes por persona

dplot.groupby(['date', 'person']).agg({'cantidad_msjs': 'sum'}).unstack().iplot(kind='bar', barmode='overlay', title='Cantidad de mensajes por persona por día')

In [17]:
dplot.groupby('person').agg({'tiempo_respuesta': 'median'}).iplot(kind='bar',title='Tiempo de respuesta promedio por persona')

In [18]:
dplot.groupby('person').agg({'tiempo_entre_msj': 'median'}).iplot(kind='bar',title='Tiempo entre mensajes promedio por persona')

In [19]:
#distibución de dia a la semana

dplot.groupby(['dia_sem', 'person']).agg({'cantidad_msjs': 'sum'}).unstack().iplot(kind='bar', title='Cantidad de mensajes por día de la semana', barmode='overlay')

In [20]:
#Distribución de la parte del día

dplot.groupby(['parte_dia', 'person']).agg({'cantidad_msjs': 'sum'}).unstack().iplot(kind='bar', title='Cantidad de mensajes por parte del día', barmode='overlay')

In [21]:
#Stickers por persona

dplot.groupby('person').agg({'cantidad_stickers': 'sum'}).iplot(kind='bar', title='Cantidad de stickers por persona')

In [22]:
#mensajes editados por persona

dplot.groupby('person').agg({'msjs_editados': 'sum'}).iplot(kind='bar', title='Cantidad de mensajes editados por persona')

In [23]:
dplot.groupby('person').agg({'prom_msjs': 'mean'}).iplot(kind='bar', title='Promedio de palabras por mensaje por persona')

In [24]:
dplot = dplot.drop(['date', 'hour', 'date-hour'], axis=1)

In [25]:
dplot = dplot.drop(['ID_conversacion'], axis=1)

In [26]:
dplot['person'] = dplot['person'].apply(lambda x: 1 if x == 'chema' else 0)

#Correlación

dplot.corr().iplot(kind='heatmap', colorscale='blues', title='Correlación de variables')


## Linear Regression: tiempo_respuesta

### Data prep

In [27]:
#Recategorización de variables
dataset = dataset.drop(['date-hour'], axis=1)
dataset['person'] = dataset['person'].apply(lambda x: 1 if x == 'chema' else 0)
dataset = pd.get_dummies(dataset, columns=['dia_sem', 'parte_dia'])
dataset = dataset*1

In [28]:
dataset

Unnamed: 0,ID_conversacion,person,cantidad_msjs,cantidad_stickers,msjs_editados,prom_msjs,tiempo_entre_msj,tiempo_respuesta,dia_sem_1,dia_sem_2,dia_sem_3,dia_sem_4,dia_sem_5,dia_sem_6,dia_sem_7,parte_dia_0,parte_dia_1,parte_dia_2,parte_dia_3
0,C1,0,2,0,0,4.500000,0.000833,0.000000,0,1,0,0,0,0,0,0,0,0,1
1,C2,1,5,0,0,5.200000,0.008611,8.400556,0,0,1,0,0,0,0,0,1,0,0
2,C3,0,3,0,0,4.666667,0.004722,2.780556,0,0,1,0,0,0,0,0,1,0,0
3,C4,1,6,0,0,8.166667,0.010694,0.782500,0,0,1,0,0,0,0,0,1,0,0
4,C5,0,4,0,0,7.500000,0.006111,0.826944,0,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,C557,0,2,0,0,3.500000,1.017361,2.033333,0,0,1,0,0,0,0,0,0,0,1
557,C558,1,3,2,0,1.666667,1.012500,1.012500,0,0,1,0,0,0,0,0,0,0,1
558,C559,0,1,0,0,1.000000,1.301111,1.301111,0,0,0,0,1,0,0,0,0,1,0
559,C560,1,2,1,0,2.500000,1.144861,2.289444,0,0,0,0,1,0,0,0,0,0,1


In [29]:
#Variables
um = ['ID_conversacion']
varc = ['tiempo_entre_msj', 'prom_msjs']
target = ['tiempo_respuesta']
vard = [x for x in dataset.columns if x not in um + varc + target]
X = dataset[vard+varc]
y = dataset[target]

In [30]:
#Best variables

kb = SelectKBest(score_func=f_regression, k='all')
kb.fit(X, y)
df_scores = pd.DataFrame(data=zip(X.columns, kb.scores_), columns=["feature", "score"]).set_index("feature").sort_values(by="score", ascending=False).reset_index()
df_scores


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Unnamed: 0,feature,score
0,parte_dia_1,241.287127
1,tiempo_entre_msj,153.346346
2,parte_dia_3,26.787157
3,person,9.34501
4,parte_dia_2,5.704459
5,prom_msjs,5.212241
6,cantidad_stickers,4.508765
7,parte_dia_0,3.122505
8,cantidad_msjs,0.959557
9,dia_sem_4,0.922895


In [31]:
#Valores absolutos de scores

df_scores['score'] = df_scores['score'].abs()
df_scores = df_scores.sort_values(by='score', ascending=False)

In [32]:
df_scores

Unnamed: 0,feature,score
0,parte_dia_1,241.287127
1,tiempo_entre_msj,153.346346
2,parte_dia_3,26.787157
3,person,9.34501
4,parte_dia_2,5.704459
5,prom_msjs,5.212241
6,cantidad_stickers,4.508765
7,parte_dia_0,3.122505
8,cantidad_msjs,0.959557
9,dia_sem_4,0.922895


### Linear Regression: no escalamiento

In [33]:
#Modelos

lr = LinearRegression()
lasso = Lasso()
ridge = Ridge()
lars = Lars()
en = ElasticNet()
br = BayesianRidge()

In [34]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
#Ajuste de modelos

lr.fit(X_train, y_train)
lasso.fit(X_train, y_train)
ridge.fit(X_train, y_train)
lars.fit(X_train, y_train)
en.fit(X_train, y_train)
br.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [36]:
#Scores en train

print('Linear Regression:', lr.score(X_train, y_train))
print('Lasso:', lasso.score(X_train, y_train))
print('Ridge:', ridge.score(X_train, y_train))
print('Lars:', lars.score(X_train, y_train))
print('Elastic Net:', en.score(X_train, y_train))
print('Bayesian Ridge:', br.score(X_train, y_train))

Linear Regression: 0.5584959554283844
Lasso: 0.16899674919020724
Ridge: 0.5583580098572128
Lars: 0.5584959554283844
Elastic Net: 0.20184750942737806
Bayesian Ridge: 0.5575570088558649


In [37]:
#Cross Validation de los modelos

print('Linear Regression:', cross_val_score(lr, X_train, y_train, cv=5).mean(), cross_val_score(lr, X_train, y_train, cv=5).std())
print('Lasso:', cross_val_score(lasso, X_train, y_train, cv=5).mean(), cross_val_score(lasso, X_train, y_train, cv=5).std())
print('Ridge:', cross_val_score(ridge, X_train, y_train, cv=5).mean(), cross_val_score(ridge, X_train, y_train, cv=5).std())
print('Lars:', cross_val_score(lars, X_train, y_train, cv=5).mean(), cross_val_score(lars, X_train, y_train, cv=5).std())
print('Elastic Net:', cross_val_score(en, X_train, y_train, cv=5).mean(), cross_val_score(en, X_train, y_train, cv=5).std())
print('Bayesian Ridge:', cross_val_score(br, X_train, y_train, cv=5).mean(), cross_val_score(br, X_train, y_train, cv=5).std())

Linear Regression: 0.24921646869886568 0.4423645598921768
Lasso: 0.04168564685434646 0.1374119592162483
Ridge: 0.25513925547959665 0.42808536471383163
Lars: -69828.94670618206 139658.72325154624
Elastic Net: 0.05275647541535007 0.145597397794604
Bayesian Ridge: 0.25680566368971947 0.41572649977523196



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [38]:
#Validación en test de los modelos

print('Linear Regression:', lr.score(X_test, y_test))
print('Lasso:', lasso.score(X_test, y_test))
print('Ridge:', ridge.score(X_test, y_test))
print('Lars:', lars.score(X_test, y_test))
print('Elastic Net:', en.score(X_test, y_test))
print('Bayesian Ridge:', br.score(X_test, y_test))

Linear Regression: -0.14716228058405
Lasso: -0.0002703623854674042
Ridge: -0.13560295920024967
Lars: -0.14716228058404512
Elastic Net: 0.0016264811980424287
Bayesian Ridge: -0.11737893219951889


In [39]:
#Métricas de performance de los modelos

y_pred_lr = lr.predict(X_test)
y_pred_lasso = lasso.predict(X_test)
y_pred_ridge = ridge.predict(X_test)
y_pred_lars = lars.predict(X_test)
y_pred_en = en.predict(X_test)

#Dataframe de resultados

df_resultados = pd.DataFrame(data={'Modelo': ['Linear Regression', 'Lasso', 'Ridge', 'Lars', 'Elastic Net'],})
df_resultados['R2'] = [r2_score(y_test, y_pred_lr), r2_score(y_test, y_pred_lasso), r2_score(y_test, y_pred_ridge), r2_score(y_test, y_pred_lars), r2_score(y_test, y_pred_en)]
df_resultados['MAE'] = [mean_absolute_error(y_test, y_pred_lr), mean_absolute_error(y_test, y_pred_lasso), mean_absolute_error(y_test, y_pred_ridge), mean_absolute_error(y_test, y_pred_lars), mean_absolute_error(y_test, y_pred_en)]
df_resultados['MSE'] = [mean_squared_error(y_test, y_pred_lr), mean_squared_error(y_test, y_pred_lasso), mean_squared_error(y_test, y_pred_ridge), mean_squared_error(y_test, y_pred_lars), mean_squared_error(y_test, y_pred_en)]
df_resultados['MAPE'] = [mean_absolute_percentage_error(y_test, y_pred_lr), mean_absolute_percentage_error(y_test, y_pred_lasso), mean_absolute_percentage_error(y_test, y_pred_ridge), mean_absolute_percentage_error(y_test, y_pred_lars), mean_absolute_percentage_error(y_test, y_pred_en)]

In [40]:
df_resultados

Unnamed: 0,Modelo,R2,MAE,MSE,MAPE
0,Linear Regression,-0.147162,1.355451,7.290148,4.425814
1,Lasso,-0.00027,1.303656,6.356658,7.161805
2,Ridge,-0.135603,1.349914,7.216689,4.473068
3,Lars,-0.147162,1.355451,7.290148,4.425814
4,Elastic Net,0.001626,1.312181,6.344604,6.899651


In [41]:
#Betas de los modelos por variable

df_betas = pd.DataFrame(data={'Variable': X.columns})
df_betas = df_betas.merge(pd.DataFrame(lr.coef_.T, columns=['Linear Regression']), left_index=True, right_index=True)
df_betas = df_betas.merge(pd.DataFrame(lasso.coef_.T, columns=['Lasso']), left_index=True, right_index=True)
df_betas = df_betas.merge(pd.DataFrame(ridge.coef_.T, columns=['Ridge']), left_index=True, right_index=True)
df_betas = df_betas.merge(pd.DataFrame(lars.coef_.T, columns=['Lars']), left_index=True, right_index=True)
df_betas = df_betas.merge(pd.DataFrame(en.coef_.T, columns=['Elastic Net']), left_index=True, right_index=True)
df_betas = df_betas.merge(pd.DataFrame(br.coef_.T, columns=['Bayesian Ridge']), left_index=True, right_index=True)


In [42]:
df_betas

Unnamed: 0,Variable,Linear Regression,Lasso,Ridge,Lars,Elastic Net,Bayesian Ridge
0,person,-0.722224,-0.0,-0.711746,-0.722224,-0.0,-0.694065
1,cantidad_msjs,0.031428,-0.0,0.032148,0.031428,-0.000779,0.033271
2,cantidad_stickers,-0.051825,-0.0,-0.057758,-0.051825,-0.0,-0.06731
3,msjs_editados,-0.140434,-0.0,-0.151741,-0.140434,-0.0,-0.167605
4,dia_sem_1,0.102803,-0.0,0.099019,0.001606,-0.0,0.092241
5,dia_sem_2,0.041638,-0.0,0.036057,-0.059559,-0.0,0.027093
6,dia_sem_3,0.101197,0.0,0.104822,0.0,0.0,0.109647
7,dia_sem_4,-0.514925,0.0,-0.48566,-0.616122,0.0,-0.439632
8,dia_sem_5,0.137471,0.0,0.139319,0.036274,0.0,0.14105
9,dia_sem_6,-0.164303,0.0,-0.161691,-0.2655,0.0,-0.156976


## Modelos escalados

In [43]:
dataset

Unnamed: 0,ID_conversacion,person,cantidad_msjs,cantidad_stickers,msjs_editados,prom_msjs,tiempo_entre_msj,tiempo_respuesta,dia_sem_1,dia_sem_2,dia_sem_3,dia_sem_4,dia_sem_5,dia_sem_6,dia_sem_7,parte_dia_0,parte_dia_1,parte_dia_2,parte_dia_3
0,C1,0,2,0,0,4.500000,0.000833,0.000000,0,1,0,0,0,0,0,0,0,0,1
1,C2,1,5,0,0,5.200000,0.008611,8.400556,0,0,1,0,0,0,0,0,1,0,0
2,C3,0,3,0,0,4.666667,0.004722,2.780556,0,0,1,0,0,0,0,0,1,0,0
3,C4,1,6,0,0,8.166667,0.010694,0.782500,0,0,1,0,0,0,0,0,1,0,0
4,C5,0,4,0,0,7.500000,0.006111,0.826944,0,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,C557,0,2,0,0,3.500000,1.017361,2.033333,0,0,1,0,0,0,0,0,0,0,1
557,C558,1,3,2,0,1.666667,1.012500,1.012500,0,0,1,0,0,0,0,0,0,0,1
558,C559,0,1,0,0,1.000000,1.301111,1.301111,0,0,0,0,1,0,0,0,0,1,0
559,C560,1,2,1,0,2.500000,1.144861,2.289444,0,0,0,0,1,0,0,0,0,0,1


In [44]:
#MinMaxScaler

mm = MinMaxScaler()
mmy = MinMaxScaler()

X1 = mm.fit_transform(X_train)
y1 = mmy.fit_transform(y_train)

#StandardScaler

ss = StandardScaler()
ssy = StandardScaler()

X2 = ss.fit_transform(X_train)
y2 = ssy.fit_transform(y_train)

#RobustScaler

rs = RobustScaler()
rsy = RobustScaler()

X3 = rs.fit_transform(X_train)
y3 = rsy.fit_transform(y_train)

### MinMax Scaler

In [45]:
#Modelos con MinMaxScaler

lr.fit(X1, y1)
lasso.fit(X1, y1)
ridge.fit(X1, y1)
lars.fit(X1, y1)
en.fit(X1, y1)
br.fit(X1, y1)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [46]:
#Scores en train

print('Linear Regression:', lr.score(X1, y1))
print('Lasso:', lasso.score(X1, y1))
print('Ridge:', ridge.score(X1, y1))
print('Lars:', lars.score(X1, y1))
print('Elastic Net:', en.score(X1, y1))
print('Bayesian Ridge:', br.score(X1, y1))

Linear Regression: 0.5584959554283844
Lasso: 0.0
Ridge: 0.5357460128560033
Lars: 0.5012959676662758
Elastic Net: 0.0
Bayesian Ridge: 0.5569610691302496


In [47]:
#Cross Validation de los modelos

print('Linear Regression:', cross_val_score(lr, X1, y1, cv=5).mean(), cross_val_score(lr, X1, y1, cv=5).std())
print('Lasso:', cross_val_score(lasso, X1, y1, cv=5).mean(), cross_val_score(lasso, X1, y1, cv=5).std())
print('Ridge:', cross_val_score(ridge, X1, y1, cv=5).mean(), cross_val_score(ridge, X1, y1, cv=5).std())
print('Lars:', cross_val_score(lars, X1, y1, cv=5).mean(), cross_val_score(lars, X1, y1, cv=5).std())
print('Elastic Net:', cross_val_score(en, X1, y1, cv=5).mean(), cross_val_score(en, X1, y1, cv=5).std())
print('Bayesian Ridge:', cross_val_score(br, X1, y1, cv=5).mean(), cross_val_score(br, X1, y1, cv=5).std())

Linear Regression: 0.24778677961402687 0.44068473419535414
Lasso: -0.02929763767557856 0.010728388412284654
Ridge: 0.3110953506031448 0.259359391807075
Lars: 0.24378191668100468 0.4411040452174165
Elastic Net: -0.02929763767557856 0.010728388412284654
Bayesian Ridge: 0.235165690452967 0.373034278614688



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [48]:
#Validación en test de los modelos

X_test1 = mm.transform(X_test)
y_test1 = mmy.transform(y_test)

print('Linear Regression:', lr.score(X_test1, y_test1))
print('Lasso:', lasso.score(X_test1, y_test1))
print('Ridge:', ridge.score(X_test1, y_test1))
print('Lars:', lars.score(X_test1, y_test1))
print('Elastic Net:', en.score(X_test1, y_test1))
print('Bayesian Ridge:', br.score(X_test1, y_test1))

Linear Regression: -0.14716228058404668
Lasso: -0.001944734448878327
Ridge: -0.16937926837824446
Lars: -0.1710699145698762
Elastic Net: -0.001944734448878327
Bayesian Ridge: -0.1533960676036894


In [49]:
#Métricas de performance de los modelos

y_pred_lr1 = lr.predict(X_test1)
y_pred_lasso1 = lasso.predict(X_test1)
y_pred_ridge1 = ridge.predict(X_test1)
y_pred_lars1 = lars.predict(X_test1)
y_pred_en1 = en.predict(X_test1)
y_pred_br1 = br.predict(X_test1)

#Dataframe de resultados

df_resultados1 = pd.DataFrame(data={'Modelo': ['Linear Regression', 'Lasso', 'Ridge', 'Lars', 'Elastic Net', 'Bayesian Ridge'],})
df_resultados1['R2'] = [r2_score(y_test1, y_pred_lr1), r2_score(y_test1, y_pred_lasso1), r2_score(y_test1, y_pred_ridge1), r2_score(y_test1, y_pred_lars1), r2_score(y_test1, y_pred_en1), r2_score(y_test1, y_pred_br1)]
df_resultados1['MAE'] = [mean_absolute_error(y_test1, y_pred_lr1), mean_absolute_error(y_test1, y_pred_lasso1), mean_absolute_error(y_test1, y_pred_ridge1), mean_absolute_error(y_test1, y_pred_lars1), mean_absolute_error(y_test1, y_pred_en1), mean_absolute_error(y_test1, y_pred_br1)]
df_resultados1['MSE'] = [mean_squared_error(y_test1, y_pred_lr1), mean_squared_error(y_test1, y_pred_lasso1), mean_squared_error(y_test1, y_pred_ridge1), mean_squared_error(y_test1, y_pred_lars1), mean_squared_error(y_test1, y_pred_en1), mean_squared_error(y_test1, y_pred_br1)]
df_resultados1['MAPE'] = [mean_absolute_percentage_error(y_test1, y_pred_lr1), mean_absolute_percentage_error(y_test1, y_pred_lasso1), mean_absolute_percentage_error(y_test1, y_pred_ridge1), mean_absolute_percentage_error(y_test1, y_pred_lars1), mean_absolute_percentage_error(y_test1, y_pred_en1), mean_absolute_percentage_error(y_test1, y_pred_br1)]

In [50]:
df_resultados1

Unnamed: 0,Modelo,R2,MAE,MSE,MAPE
0,Linear Regression,-0.147162,0.038223,0.005797,4.425814
1,Lasso,-0.001945,0.03794,0.005063,7.534369
2,Ridge,-0.169379,0.038624,0.005909,4.250504
3,Lars,-0.17107,0.043607,0.005918,5.916301
4,Elastic Net,-0.001945,0.03794,0.005063,7.534369
5,Bayesian Ridge,-0.153396,0.038332,0.005829,4.36199


In [51]:
#Modelos con StandardScaler

lr.fit(X2, y2)
lasso.fit(X2, y2)
ridge.fit(X2, y2)
lars.fit(X2, y2)
en.fit(X2, y2)
br.fit(X2, y2)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [52]:
#Scores en train

print('Linear Regression:', lr.score(X2, y2))
print('Lasso:', lasso.score(X2, y2))
print('Ridge:', ridge.score(X2, y2))
print('Lars:', lars.score(X2, y2))
print('Elastic Net:', en.score(X2, y2))
print('Bayesian Ridge:', br.score(X2, y2))

Linear Regression: 0.558374456886815
Lasso: 0.0
Ridge: 0.5584938466617202
Lars: 0.5584959554283846
Elastic Net: 0.08517927914366763
Bayesian Ridge: 0.5580346535290295


In [53]:
#Cross Validation de los modelos

print('Linear Regression:', cross_val_score(lr, X2, y2, cv=5).mean(), cross_val_score(lr, X2, y2, cv=5).std())
print('Lasso:', cross_val_score(lasso, X2, y2, cv=5).mean(), cross_val_score(lasso, X2, y2, cv=5).std())
print('Ridge:', cross_val_score(ridge, X2, y2, cv=5).mean(), cross_val_score(ridge, X2, y2, cv=5).std())
print('Lars:', cross_val_score(lars, X2, y2, cv=5).mean(), cross_val_score(lars, X2, y2, cv=5).std())
print('Elastic Net:', cross_val_score(en, X2, y2, cv=5).mean(), cross_val_score(en, X2, y2, cv=5).std())
print('Bayesian Ridge:', cross_val_score(br, X2, y2, cv=5).mean(), cross_val_score(br, X2, y2, cv=5).std())

Linear Regression: 0.24981191089409965 0.4424440665075167
Lasso: -0.029297637675578515 0.010728388412284715
Ridge: 0.2506738566294601 0.4388599322477847
Lars: 0.24921646869887049 0.44236455989217127
Elastic Net: 0.0604666696814379 0.05573676998830984
Bayesian Ridge: 0.2583205513674501 0.40604838894909856



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [54]:
#Validación en test de los modelos

X_test2 = ss.transform(X_test)
y_test2 = ssy.transform(y_test)

print('Linear Regression:', lr.score(X_test2, y_test2))
print('Lasso:', lasso.score(X_test2, y_test2))
print('Ridge:', ridge.score(X_test2, y_test2))
print('Lars:', lars.score(X_test2, y_test2))
print('Elastic Net:', en.score(X_test2, y_test2))
print('Bayesian Ridge:', br.score(X_test2, y_test2))

Linear Regression: -0.14435682648070602
Lasso: -0.001944734448878327
Ridge: -0.14599838655018726
Lars: -0.14716228058404512
Elastic Net: 0.04072820848377767
Bayesian Ridge: -0.12986048421378782


In [55]:
#Métricas de performance de los modelos

y_pred_lr2 = lr.predict(X_test2)
y_pred_lasso2 = lasso.predict(X_test2)
y_pred_ridge2 = ridge.predict(X_test2)
y_pred_lars2 = lars.predict(X_test2)
y_pred_en2 = en.predict(X_test2)
y_pred_br2 = br.predict(X_test2)

#Dataframe de resultados

df_resultados2 = pd.DataFrame(data={'Modelo': ['Linear Regression', 'Lasso', 'Ridge', 'Lars', 'Elastic Net', 'Bayesian Ridge'],})
df_resultados2['R2'] = [r2_score(y_test2, y_pred_lr2), r2_score(y_test2, y_pred_lasso2), r2_score(y_test2, y_pred_ridge2), r2_score(y_test2, y_pred_lars2), r2_score(y_test2, y_pred_en2), r2_score(y_test2, y_pred_br2)]
df_resultados2['MAE'] = [mean_absolute_error(y_test2, y_pred_lr2), mean_absolute_error(y_test2, y_pred_lasso2), mean_absolute_error(y_test2, y_pred_ridge2), mean_absolute_error(y_test2, y_pred_lars2), mean_absolute_error(y_test2, y_pred_en2), mean_absolute_error(y_test2, y_pred_br2)]
df_resultados2['MSE'] = [mean_squared_error(y_test2, y_pred_lr2), mean_squared_error(y_test2, y_pred_lasso2), mean_squared_error(y_test2, y_pred_ridge2), mean_squared_error(y_test2, y_pred_lars2), mean_squared_error(y_test2, y_pred_en2), mean_squared_error(y_test2, y_pred_br2)]
df_resultados2['MAPE'] = [mean_absolute_percentage_error(y_test2, y_pred_lr2), mean_absolute_percentage_error(y_test2, y_pred_lasso2), mean_absolute_percentage_error(y_test2, y_pred_ridge2), mean_absolute_percentage_error(y_test2, y_pred_lars2), mean_absolute_percentage_error(y_test2, y_pred_en2), mean_absolute_percentage_error(y_test2, y_pred_br2)]

In [56]:
df_resultados2

Unnamed: 0,Modelo,R2,MAE,MSE,MAPE
0,Linear Regression,-0.144357,0.426355,0.717146,1.539074
1,Lasso,-0.001945,0.422501,0.627899,1.0
2,Ridge,-0.145998,0.425479,0.718175,1.523906
3,Lars,-0.147162,0.425649,0.718904,1.524453
4,Elastic Net,0.040728,0.414473,0.601157,0.983105
5,Bayesian Ridge,-0.12986,0.423045,0.708062,1.513892


### Robust Scaler

In [57]:
#Modelos con RobustScaler

lr.fit(X3, y3)
lasso.fit(X3, y3)
ridge.fit(X3, y3)
lars.fit(X3, y3)
en.fit(X3, y3)
br.fit(X3, y3)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [58]:
#Scores en train

print('Linear Regression:', lr.score(X3, y3))
print('Lasso:', lasso.score(X3, y3))
print('Ridge:', ridge.score(X3, y3))
print('Lars:', lars.score(X3, y3))
print('Elastic Net:', en.score(X3, y3))
print('Bayesian Ridge:', br.score(X3, y3))

Linear Regression: 0.5584959554283844
Lasso: 0.2529201130253603
Ridge: 0.558357506009107
Lars: 0.5584959554283844
Elastic Net: 0.2698361737888382
Bayesian Ridge: 0.5574644331586649


In [59]:
#Cross Validation de los modelos

print('Linear Regression:', cross_val_score(lr, X3, y3, cv=5).mean(), cross_val_score(lr, X3, y3, cv=5).std())
print('Lasso:', cross_val_score(lasso, X3, y3, cv=5).mean(), cross_val_score(lasso, X3, y3, cv=5).std())
print('Ridge:', cross_val_score(ridge, X3, y3, cv=5).mean(), cross_val_score(ridge, X3, y3, cv=5).std())
print('Lars:', cross_val_score(lars, X3, y3, cv=5).mean(), cross_val_score(lars, X3, y3, cv=5).std())
print('Elastic Net:', cross_val_score(en, X3, y3, cv=5).mean(), cross_val_score(en, X3, y3, cv=5).std())
print('Bayesian Ridge:', cross_val_score(br, X3, y3, cv=5).mean(), cross_val_score(br, X3, y3, cv=5).std())

Linear Regression: 0.2492164686988622 0.4423645598921798
Lasso: 0.044565859073717504 0.17735534175767834
Ridge: 0.25450284579080684 0.42987860962340374
Lars: 0.2469813919079728 0.4418790636213539
Elastic Net: 0.06275016783088347 0.17304302224074505
Bayesian Ridge: 0.2564461766652232 0.41781513207222387



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [60]:
#Validación en test de los modelos

X_test3 = rs.transform(X_test)
y_test3 = rsy.transform(y_test)

print('Linear Regression:', lr.score(X_test3, y_test3))
print('Lasso:', lasso.score(X_test3, y_test3))
print('Ridge:', ridge.score(X_test3, y_test3))
print('Lars:', lars.score(X_test3, y_test3))
print('Elastic Net:', en.score(X_test3, y_test3))
print('Bayesian Ridge:', br.score(X_test3, y_test3))

Linear Regression: -0.1471622805840498
Lasso: -0.02486677162571671
Ridge: -0.1355465559266631
Lars: -0.1471622805840458
Elastic Net: -0.013092265471452258
Bayesian Ridge: -0.11592911935411543


In [61]:
#Métricas de performance de los modelos

y_pred_lr3 = lr.predict(X_test3)
y_pred_lasso3 = lasso.predict(X_test3)
y_pred_ridge3 = ridge.predict(X_test3)
y_pred_lars3 = lars.predict(X_test3)
y_pred_en3 = en.predict(X_test3)
y_pred_br3 = br.predict(X_test3)

#Dataframe de resultados

df_resultados3 = pd.DataFrame(data={'Modelo': ['Linear Regression', 'Lasso', 'Ridge', 'Lars', 'Elastic Net', 'Bayesian Ridge'],})
df_resultados3['R2'] = [r2_score(y_test3, y_pred_lr3), r2_score(y_test3, y_pred_lasso3), r2_score(y_test3, y_pred_ridge3), r2_score(y_test3, y_pred_lars3), r2_score(y_test3, y_pred_en3), r2_score(y_test3, y_pred_br3)]
df_resultados3['MAE'] = [mean_absolute_error(y_test3, y_pred_lr3), mean_absolute_error(y_test3, y_pred_lasso3), mean_absolute_error(y_test3, y_pred_ridge3), mean_absolute_error(y_test3, y_pred_lars3), mean_absolute_error(y_test3, y_pred_en3), mean_absolute_error(y_test3, y_pred_br3)]
df_resultados3['MSE'] = [mean_squared_error(y_test3, y_pred_lr3), mean_squared_error(y_test3, y_pred_lasso3), mean_squared_error(y_test3, y_pred_ridge3), mean_squared_error(y_test3, y_pred_lars3), mean_squared_error(y_test3, y_pred_en3), mean_squared_error(y_test3, y_pred_br3)]
df_resultados3['MAPE'] = [mean_absolute_percentage_error(y_test3, y_pred_lr3), mean_absolute_percentage_error(y_test3, y_pred_lasso3), mean_absolute_percentage_error(y_test3, y_pred_ridge3), mean_absolute_percentage_error(y_test3, y_pred_lars3), mean_absolute_percentage_error(y_test3, y_pred_en3), mean_absolute_percentage_error(y_test3, y_pred_br3)]

In [62]:
df_resultados3

Unnamed: 0,Modelo,R2,MAE,MSE,MAPE
0,Linear Regression,-0.147162,1.658608,10.91581,7.373603
1,Lasso,-0.024867,1.605368,9.752109,8.79725
2,Ridge,-0.135547,1.65216,10.805281,7.308703
3,Lars,-0.147162,1.658608,10.91581,7.373603
4,Elastic Net,-0.013092,1.599321,9.640069,8.725323
5,Bayesian Ridge,-0.115929,1.646525,10.618612,7.293462


In [63]:
#df to excel

with pd.ExcelWriter('resultados.xlsx') as writer:
    df_resultados.to_excel(writer, sheet_name='Resultados')
    df_resultados1.to_excel(writer, sheet_name='Resultados MinMaxScaler')
    df_resultados2.to_excel(writer, sheet_name='Resultados StandardScaler')
    df_resultados3.to_excel(writer, sheet_name='Resultados RobustScaler')

In [64]:
#STOCASTIC GRADIENT DESCENT AL MEJOR MODELO

sgd = SGDRegressor()
sgd.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [65]:
#Scores en train

print('SGD:', sgd.score(X_train, y_train))

SGD: 0.13333616528645953


In [66]:
#Cross Validation de los modelos

print('SGD:', cross_val_score(sgd, X_train, y_train, cv=5).mean(), cross_val_score(sgd, X_train, y_train, cv=5).std())

SGD: 0.16203880723409694 0.3647598598908795



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [67]:
#Validación en test de los modelos

print('SGD:', sgd.score(X_test, y_test))

SGD: -0.8574403207918855


In [68]:
#Métricas de performance de los modelos

y_pred_sgd = sgd.predict(X_test)

#Dataframe de resultados

df_resultados_sgd = pd.DataFrame(data={'Modelo': ['SGD'],})
df_resultados_sgd['R2'] = [r2_score(y_test, y_pred_sgd)]
df_resultados_sgd['MAE'] = [mean_absolute_error(y_test, y_pred_sgd)]
df_resultados_sgd['MSE'] = [mean_squared_error(y_test, y_pred_sgd)]
df_resultados_sgd['MAPE'] = [mean_absolute_percentage_error(y_test, y_pred_sgd)]

df_resultados_sgd

Unnamed: 0,Modelo,R2,MAE,MSE,MAPE
0,SGD,-0.85744,2.37592,11.803922,6.793607


In [69]:
#GRID SEARCH Y RANDOMIZED SEARCH A ELASTICNET CON DATOS STANDARDSCALER

parametros = {'alpha': [0.1, 0.5, 1, 5, 10, 50, 100], 'l1_ratio': [0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1]}
en = ElasticNet()

grid = GridSearchCV(en, param_grid=parametros, cv=5)

In [70]:
grid

In [71]:
grid.fit(X2, y2)

In [72]:
#Mejores parámetros

grid.best_params_

{'alpha': 0.5, 'l1_ratio': 0.1}

In [73]:
#Scores en train

print('Elastic Net:', grid.score(X2, y2))

Elastic Net: 0.48421121118306176


In [74]:
#Cross Validation de los modelos

print('Elastic Net:', cross_val_score(grid, X2, y2, cv=5).mean(), cross_val_score(grid, X2, y2, cv=5).std())

Elastic Net: 0.24991064714841463 0.24593019894764132


In [75]:
#Validación en test de los modelos

print('Elastic Net:', grid.score(X_test2, y_test2))

Elastic Net: 0.042597471319530844


In [76]:
#Métricas de performance de los modelos

y_pred_en_grid = grid.predict(X_test2)

#Dataframe de resultados

df_resultados_en_grid = pd.DataFrame(data={'Modelo': ['Elastic Net Grid'],})
df_resultados_en_grid['R2'] = [r2_score(y_test2, y_pred_en_grid)]
df_resultados_en_grid['MAE'] = [mean_absolute_error(y_test2, y_pred_en_grid)]
df_resultados_en_grid['MSE'] = [mean_squared_error(y_test2, y_pred_en_grid)]
df_resultados_en_grid['MAPE'] = [mean_absolute_percentage_error(y_test2, y_pred_en_grid)]

df_resultados_en_grid

Unnamed: 0,Modelo,R2,MAE,MSE,MAPE
0,Elastic Net Grid,0.042597,0.401302,0.599986,1.286814


In [77]:
#RANDOMIZED SEARCH

random = RandomizedSearchCV(en, param_distributions=parametros, cv=5, n_iter=10)
random.fit(X2, y2)

#Mejores parámetros

random.best_params_

{'l1_ratio': 0.1, 'alpha': 0.5}

In [78]:
#Scores en train

print('Elastic Net:', random.score(X2, y2))

Elastic Net: 0.48421121118306176


In [79]:
#Cross Validation de los modelos

print('Elastic Net:', cross_val_score(random, X2, y2, cv=5).mean(), cross_val_score(random, X2, y2, cv=5).std())

Elastic Net: 0.25409318641966694 0.23437066497351125


In [80]:
#Validación en test de los modelos

print('Elastic Net:', random.score(X_test2, y_test2))

Elastic Net: 0.042597471319530844


In [81]:
#Métricas de performance de los modelos

y_pred_en_random = random.predict(X_test2)

#Dataframe de resultados

df_resultados_en_random = pd.DataFrame(data={'Modelo': ['Elastic Net Random'],})
df_resultados_en_random['R2'] = [r2_score(y_test2, y_pred_en_random)]
df_resultados_en_random['MAE'] = [mean_absolute_error(y_test2, y_pred_en_random)]
df_resultados_en_random['MSE'] = [mean_squared_error(y_test2, y_pred_en_random)]
df_resultados_en_random['MAPE'] = [mean_absolute_percentage_error(y_test2, y_pred_en_random)]


In [82]:
df_resultados_en_random

Unnamed: 0,Modelo,R2,MAE,MSE,MAPE
0,Elastic Net Random,0.042597,0.401302,0.599986,1.286814


## Logistic Regression: person

In [83]:
dataset

Unnamed: 0,ID_conversacion,person,cantidad_msjs,cantidad_stickers,msjs_editados,prom_msjs,tiempo_entre_msj,tiempo_respuesta,dia_sem_1,dia_sem_2,dia_sem_3,dia_sem_4,dia_sem_5,dia_sem_6,dia_sem_7,parte_dia_0,parte_dia_1,parte_dia_2,parte_dia_3
0,C1,0,2,0,0,4.500000,0.000833,0.000000,0,1,0,0,0,0,0,0,0,0,1
1,C2,1,5,0,0,5.200000,0.008611,8.400556,0,0,1,0,0,0,0,0,1,0,0
2,C3,0,3,0,0,4.666667,0.004722,2.780556,0,0,1,0,0,0,0,0,1,0,0
3,C4,1,6,0,0,8.166667,0.010694,0.782500,0,0,1,0,0,0,0,0,1,0,0
4,C5,0,4,0,0,7.500000,0.006111,0.826944,0,0,1,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
556,C557,0,2,0,0,3.500000,1.017361,2.033333,0,0,1,0,0,0,0,0,0,0,1
557,C558,1,3,2,0,1.666667,1.012500,1.012500,0,0,1,0,0,0,0,0,0,0,1
558,C559,0,1,0,0,1.000000,1.301111,1.301111,0,0,0,0,1,0,0,0,0,1,0
559,C560,1,2,1,0,2.500000,1.144861,2.289444,0,0,0,0,1,0,0,0,0,0,1


In [84]:
#Variabes

um = ['ID_conversacion']
varc = ['tiempo_entre_msj', 'tiempo_respuesta', 'prom_msjs']
target = ['person']
vard = [x for x in dataset.columns if x not in um + varc + target]

In [85]:
X = dataset[vard+varc]
y = dataset[target]

In [86]:
#Regresión Logística

lr = LogisticRegression()

#train test split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Ajuste del modelo

lr.fit(X_train, y_train)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [87]:
#Score en train

lr.score(X_train, y_train)

0.7232142857142857

In [88]:
#Cross Validation

lrcv = cross_val_score(lr, X_train, y_train, cv=5, scoring='accuracy')
lrcv


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.

array([0.6       , 0.8       , 0.8       , 0.68539326, 0.69662921])

In [89]:
#Mean y std de cross validation

lrcv.mean(), lrcv.std()

(0.7164044943820225, 0.07599892357326461)

In [90]:
#Validación en test

lr.score(X_test, y_test)

0.831858407079646

In [91]:
#Predicción

y_pred = lr.predict(X_test)

#Métricas de performance accuracy, sensibilidad, especificidad

accuracy = accuracy_score(y_test, y_pred)
roc = roc_auc_score(y_test, y_pred)


print(f'Accuracy: {accuracy}\nROC: {roc}')

Accuracy: 0.831858407079646
ROC: 0.8306603773584906


In [92]:
#Weights

weights = pd.DataFrame({'feature': X.columns, 'weight': lr.coef_[0]}).sort_values(by='weight', ascending=False)

#Valores absolutos de los pesos

weights['weight'] = weights['weight'].abs()

weights.sort_values(by='weight', ascending=False)

Unnamed: 0,feature,weight
1,cantidad_stickers,1.218582
11,parte_dia_1,0.782199
14,tiempo_entre_msj,0.574282
6,dia_sem_4,0.562443
13,parte_dia_3,0.546764
9,dia_sem_7,0.232239
3,dia_sem_1,0.219258
12,parte_dia_2,0.208426
15,tiempo_respuesta,0.169175
8,dia_sem_6,0.160249
