# 3er Entregable

Integrantes:
- Araoz, Tania
- Bajo, Pablo
- Barrera, Manuel

### Carga de librerias a utilizar 

In [54]:
import pandas as pd
from datetime import datetime
from scipy.sparse import csr_matrix
from lightfm import LightFM

### Carga de datasets

In [55]:
movies = pd.read_csv("../data/ml-latest/movies.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [56]:
ratings = pd.read_csv("../data/ml-latest/ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


> Se usa el dataset de ratings para trabajar, tiene las interacciones entre usuarios y películas

In [57]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


> El dataset contiene 100836 interacciones. <span style="color:red">ACTUALIZAR CON DATASET GRANDE</span>

> El timestamp está en formato int64, se debe convertir a formato fecha para poder trabajar.

In [58]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

> No hay valores nulos

In [59]:
ratings['userId'].nunique()

610

> El dataset tiene 610 ususarios. <span style="color:red">Cambiar con dataset grande</span> 

In [60]:
ratings['movieId'].nunique()

9724

> el dataset contiene ratings de 9724 peliculas. <span style="color:red">Actualizar con dataset grande</span> 

In [61]:
ratings['rating'].sort_values(ascending=True).unique()

array([0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. ])

> Los valores posibles de ratings van del 0.5 al 5, con un incremento de 0.5. 

#### Preprocesado

Convertimos el timestamp numerico en formato fecha

In [62]:
ratings["timestamp"] = ratings["timestamp"].apply(lambda x: datetime.utcfromtimestamp(x).strftime('%Y/%m/%d'))

In [63]:
ratings["timestamp"]

0         2000/07/30
1         2000/07/30
2         2000/07/30
3         2000/07/30
4         2000/07/30
             ...    
100831    2017/05/03
100832    2017/05/03
100833    2017/05/08
100834    2017/05/03
100835    2017/05/03
Name: timestamp, Length: 100836, dtype: object

> Vemos que la fecha tiene un formato de fecha, pero la columna es de tipo object

Utilizando pandas convertimos a un formato de fechas que permita el filtrado

In [64]:
ratings["timestamp"] = pd.to_datetime(ratings['timestamp'], format='%Y/%m/%d')

In [65]:
ratings["timestamp"]

0        2000-07-30
1        2000-07-30
2        2000-07-30
3        2000-07-30
4        2000-07-30
            ...    
100831   2017-05-03
100832   2017-05-03
100833   2017-05-08
100834   2017-05-03
100835   2017-05-03
Name: timestamp, Length: 100836, dtype: datetime64[ns]

> Vemos que la columna tiene el formato datetime64

In [66]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30
1,1,3,4.0,2000-07-30
2,1,6,4.0,2000-07-30
3,1,47,5.0,2000-07-30
4,1,50,5.0,2000-07-30


Vemos el rango de fechas del dataset

In [67]:
ratings.timestamp.min()

Timestamp('1996-03-29 00:00:00')

In [68]:
ratings.timestamp.max()

Timestamp('2018-09-24 00:00:00')

> Vemos que el rango de fechas va desde el 29/03/1996 al 24/09/24

#### Dividimos dataset en train, test y validation
Vemos la catidad de ratings por año

In [69]:
plot_df = ratings.copy()
plot_df["year"] = ratings.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,1996,6040
1,1997,1916
2,1998,507
3,1999,2439
4,2000,10061
5,2001,3922
6,2002,3478
7,2003,4014
8,2004,3279
9,2005,5813


> Tomamos una proporción 80/20 para dividir el dataset en train - test

In [70]:
train = ratings[(ratings.timestamp < datetime(year=2016, month=1, day=1))]
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30
1,1,3,4.0,2000-07-30
2,1,6,4.0,2000-07-30
3,1,47,5.0,2000-07-30
4,1,50,5.0,2000-07-30


In [71]:
train.shape

(79517, 4)

In [72]:
train.userId.nunique()

514

In [73]:
train.movieId.nunique()

7789

In [74]:
test = ratings[ratings.timestamp >= datetime(year=2016, month=1, day=1)]
test.head()

Unnamed: 0,userId,movieId,rating,timestamp
1119,10,296,1.0,2016-02-12
1120,10,356,3.5,2016-02-12
1121,10,588,4.0,2016-02-12
1122,10,597,3.5,2016-02-13
1123,10,912,4.0,2016-02-12


In [75]:
test.shape

(21319, 4)

In [76]:
test.userId.nunique()

120

In [77]:
test.movieId.nunique()

5714

In [78]:
plot_df = train.copy()
plot_df["year"] = train.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,1996,6040
1,1997,1916
2,1998,507
3,1999,2439
4,2000,10061
5,2001,3922
6,2002,3478
7,2003,4014
8,2004,3279
9,2005,5813


In [79]:
train.shape

(79517, 4)

> Definimos el conjunto de validación, en función de nuestro conjunto de entrenamiento. <span style="color:red">Actualizar con dataset grande</span>

In [80]:
validation = train[train.timestamp >= datetime(year=2014, month=1, day=1)]
validation.head()

Unnamed: 0,userId,movieId,rating,timestamp
232,2,318,3.0,2015-10-24
233,2,333,4.0,2015-10-24
234,2,1704,4.5,2015-10-24
235,2,3578,4.0,2015-10-24
236,2,6874,4.0,2015-10-24


In [81]:
validation.shape

(8055, 4)

In [82]:
validation.userId.nunique()

69

In [83]:
validation.movieId.nunique()

2732

In [84]:
plot_df = validation.copy()
plot_df["year"] = validation.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,2014,1439
1,2015,6616


> Redefinimos el conjunto de entrenamiento. <span style="color:red">Actualizar con dataset grande</span>

In [85]:
train = train[(train.timestamp < datetime(year=2014, month=1, day=1))]
train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,2000-07-30
1,1,3,4.0,2000-07-30
2,1,6,4.0,2000-07-30
3,1,47,5.0,2000-07-30
4,1,50,5.0,2000-07-30


In [86]:
plot_df = train.copy()
plot_df["year"] = train.timestamp.dt.year
plot_df = plot_df.groupby("year", as_index=False).count()[["year", "userId"]]
plot_df.columns = ["year", "reviews_count"]
plot_df.head(25)

Unnamed: 0,year,reviews_count
0,1996,6040
1,1997,1916
2,1998,507
3,1999,2439
4,2000,10061
5,2001,3922
6,2002,3478
7,2003,4014
8,2004,3279
9,2005,5813


In [87]:
train.shape

(71462, 4)

¿Tenemos COLDSTAR? 

In [88]:
test[~test.userId.isin(train.userId.unique())].userId.nunique()

109

> Tenemos ### Usuarios que se encuentra en el dataset de test y no en el de train. <span style="color:red">Actualizar con dataset grande</span>

In [89]:
validation[~validation.userId.isin(train.userId.unique())].userId.nunique()

55

> Tenemos ### Usuarios que se encuentra en el dataset de validation y no en el de train. <span style="color:red">Actualizar con dataset grande</span>

#### Matriz de Interacciones

In [90]:
interactions_train = train[["userId", "movieId", "rating"]].copy()
interactions_train.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [91]:
interactions_matrix = interactions_train.pivot(index="userId", columns="movieId", values="rating")

In [92]:
interactions_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,104241,104245,104339,104841,104879,105037,105213,105504,105755,107348
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,


In [93]:
interactions_matrix = interactions_matrix.fillna(0)

In [94]:
interactions_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,104241,104245,104339,104841,104879,105037,105213,105504,105755,107348
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,4.0,5.0,3.0,5.0,4.0,4.0,3.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
interactions_matrix.shape

(459, 7122)

In [96]:
interactions_matrix_csr = csr_matrix(interactions_matrix.values)

In [97]:
interactions_matrix_csr

<459x7122 sparse matrix of type '<class 'numpy.float64'>'
	with 71462 stored elements in Compressed Sparse Row format>

In [98]:
user_ids = list(interactions_matrix.index)
user_dict = {}
counter = 0
for i in user_ids:
    user_dict[i] = counter
    counter += 1

In [99]:
user_dict

{1: 0,
 3: 1,
 4: 2,
 5: 3,
 6: 4,
 7: 5,
 8: 6,
 9: 7,
 11: 8,
 12: 9,
 13: 10,
 14: 11,
 15: 12,
 16: 13,
 17: 14,
 19: 15,
 20: 16,
 21: 17,
 22: 18,
 23: 19,
 26: 20,
 27: 21,
 28: 22,
 29: 23,
 31: 24,
 32: 25,
 33: 26,
 34: 27,
 35: 28,
 36: 29,
 37: 30,
 38: 31,
 39: 32,
 40: 33,
 42: 34,
 43: 35,
 44: 36,
 45: 37,
 46: 38,
 48: 39,
 51: 40,
 53: 41,
 54: 42,
 55: 43,
 56: 44,
 57: 45,
 58: 46,
 59: 47,
 61: 48,
 64: 49,
 66: 50,
 68: 51,
 69: 52,
 70: 53,
 71: 54,
 72: 55,
 74: 56,
 75: 57,
 78: 58,
 79: 59,
 80: 60,
 81: 61,
 82: 62,
 83: 63,
 84: 64,
 85: 65,
 86: 66,
 87: 67,
 88: 68,
 90: 69,
 91: 70,
 92: 71,
 93: 72,
 94: 73,
 95: 74,
 96: 75,
 97: 76,
 99: 77,
 100: 78,
 101: 79,
 102: 80,
 104: 81,
 107: 82,
 108: 83,
 109: 84,
 110: 85,
 113: 86,
 115: 87,
 116: 88,
 117: 89,
 118: 90,
 120: 91,
 121: 92,
 124: 93,
 126: 94,
 127: 95,
 128: 96,
 129: 97,
 130: 98,
 131: 99,
 132: 100,
 133: 101,
 134: 102,
 135: 103,
 136: 104,
 137: 105,
 138: 106,
 140: 107,
 142: 10

#### Modelo

In [107]:
model = LightFM(no_components=5, random_state=100, learning_rate=0.03, loss='logistic')

In [108]:
%%time
model = model.fit(interactions_matrix_csr, epochs=100)

CPU times: total: 3.44 s
Wall time: 6.37 s


In [109]:
model

<lightfm.lightfm.LightFM at 0x23be2980dd0>