In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

In [2]:
# Load the ratings data from the parquet file
df_ = pd.read_parquet('../raw/data_model/ratings_train05.parquet')

In [3]:
# Show the column dtypes and non-null counts
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000011 entries, 0 to 1000010
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     999987 non-null   float64
 1   movieId    1000011 non-null  int64  
 2   rating     999987 non-null   float64
 3   timestamp  999987 non-null   float64
 4   title      1000011 non-null  object 
 5   genres     1000011 non-null  object 
dtypes: float64(3), int64(1), object(2)
memory usage: 45.8+ MB


In [4]:
# Count the missing values in each column
df_.isna().sum()

userId       24
movieId       0
rating       24
timestamp    24
title         0
genres        0
dtype: int64

In [5]:
# Rows with missing values in df_ are discarded rather than filled in
def imputer_(df_):
    """Return a new DataFrame without the rows that contain missing values.

    Despite the name, nothing is imputed: every row holding at least one
    null, in any column, is removed.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input frame; it is left untouched.

    Returns
    -------
    pandas.DataFrame
        The null-free rows of ``df_``, keeping their original index labels.
    """
    # dropna() already hands back a new object, so no explicit copy is needed
    return df_.dropna()

In [6]:
# Drop the rows that contain null values
df_=imputer_(df_)

In [7]:
# Remove unneeded columns from the dataset (only 'timestamp' unless told otherwise)
def drop_variables(df_, columns=None):
    """Return a copy of ``df_`` without the given columns.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Input frame; it is left untouched.
    columns : str or iterable of str, optional
        Column label(s) to remove. Defaults to ``['timestamp']``, which is
        what this function always dropped before the parameter existed.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``df_`` except the dropped ones.

    Raises
    ------
    KeyError
        If any requested column is not present in ``df_``.
    """
    if columns is None:
        columns = ['timestamp']
    elif isinstance(columns, str):
        columns = [columns]
    else:
        # pandas treats a tuple as ONE label, so always hand it a plain list
        columns = list(columns)
    # drop() without inplace returns a new frame, so the caller's data is safe
    return df_.drop(columns=columns)

In [8]:
# Remove the timestamp column
df_=drop_variables(df_)

In [9]:
# Check for repeated titles (the same title should not appear for more than one movie):
# count the distinct values in every column.
df_.nunique()

userId     125566
movieId     15256
rating         10
title       15256
genres       1099
dtype: int64

In [10]:
# Transform the variables of df_:
# title -> pull the release year out into its own column
def launchYear_title_df_(df_):
    """Split the ``title`` column into a bare title and a ``launch_year``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a string ``title`` column; it is left untouched.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` where ``launch_year`` holds the first four-digit
        number found inside parentheses in the title (as an integer), and
        ``title`` is cut at its first ``' ('``.

    Notes
    -----
    Titles with no ``(YYYY)`` group get ``launch_year == 0``. That 0 is a
    placeholder, not a real year, and any later scaling will see it.
    """
    out = df_.copy()
    full_titles = out['title']

    # First "(dddd)" group in each title; NaN where there is none
    year_text = full_titles.str.extract(r'\((\d{4})\)', expand=False)
    out['launch_year'] = year_text.fillna('0').astype(int)

    # Keep only what precedes the first " (": drops the year and any other parenthetical
    out['title'] = full_titles.map(lambda full_title: full_title.partition(' (')[0])
    return out

In [11]:
# Extract the release year from the title
df_=launchYear_title_df_(df_)

In [12]:
# Set userId, movieId, rating and title aside before they are dropped from df_
# (per the original note, these are used later to check the recommender).
# At this point title has already had its parenthetical part stripped.
list_user_movie=df_[['userId','movieId','rating','title']]

In [13]:
# Transform the variables of df_:
# genres -> turn the '|'-separated string into a list holding every genre
def splitGenre_df_(df_):
    """Replace the ``genres`` string column with a ``genre_split`` list column.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a string ``genres`` column whose values are separated by
        ``'|'``; it is left untouched.

    Returns
    -------
    pandas.DataFrame
        New frame without ``genres`` and with ``genre_split``, where each
        cell is the list of all genres of that row (not only the first one).
    """
    genre_lists = df_['genres'].map(lambda joined: joined.split('|'))
    result = df_.drop(columns='genres')
    result['genre_split'] = genre_lists
    return result

In [14]:
# Split the genres string into a list of genres
df_=splitGenre_df_(df_)

In [15]:
# Drop the identifier and title columns; they are not used as model features
df_ = df_.drop(columns=['movieId', 'userId', 'title'])

In [16]:
# Preview the first rows after dropping the identifier and title columns
df_.head()

Unnamed: 0,rating,launch_year,genre_split
0,4.0,1987,"[Comedy, Fantasy, Horror]"
1,3.0,1996,[Comedy]
2,4.0,1983,[Comedy]
3,2.0,1996,"[Drama, Fantasy, Horror, Thriller]"
4,4.0,1974,"[Action, Crime, Drama]"


In [17]:
# Encode the genre lists as binary indicator columns (multi-label encoding)
def encoder_genre(df_):
    """Expand ``genre_split`` into one 0/1 column per genre.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a ``genre_split`` column holding a list of genre labels
        per row; it is left untouched.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of ``df_`` followed by one indicator column
        per label learned by ``MultiLabelBinarizer``, named after the label.
    """
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(df_['genre_split'])

    # Reuse the input index so the indicator rows line up on concatenation
    genre_flags = pd.DataFrame(indicator_matrix, columns=binarizer.classes_, index=df_.index)
    remaining = df_.drop(columns='genre_split')
    return pd.concat([remaining, genre_flags], axis=1)

In [18]:
# Encode the genre lists as binary indicator columns
df_=encoder_genre(df_)

In [19]:
# Preview the first rows after encoding the genres
df_.head()

Unnamed: 0,rating,launch_year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,4.0,1987,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,3.0,1996,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,4.0,1983,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.0,1996,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,4.0,1974,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Rescale the numeric columns to the 0-1 range
def normalizer(df_):
    """Min-max scale ``launch_year`` and ``rating``.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with numeric ``launch_year`` and ``rating`` columns; it is
        left untouched.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` with both columns replaced by their
        ``MinMaxScaler`` output. The scaler is fitted on ``df_`` itself and
        is not returned.

    Notes
    -----
    NOTE(review): ``launch_year`` can contain the 0 placeholder that
    ``launchYear_title_df_`` writes for titles with no year. When present
    it becomes the column minimum, and real years are squeezed into a
    narrow band just below 1. Consider handling those rows before scaling.
    """
    numeric_cols = ['launch_year', 'rating']
    scaled = df_.copy()

    scaler = MinMaxScaler()
    scaled[numeric_cols] = scaler.fit_transform(scaled[numeric_cols])
    return scaled

In [21]:
# Min-max scale the numeric columns (launch_year and rating)
df_=normalizer(df_)

In [22]:
# Preview the first rows after scaling
df_.head()

Unnamed: 0,rating,launch_year,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.777778,0.986104,0,0,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0.555556,0.990571,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0.777778,0.984119,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.333333,0.990571,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
4,0.777778,0.979653,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
