In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler

In [2]:
df_rating = pd.read_parquet('../raw/data_model/ratings_train05.parquet')  # Load the ratings data
df_tag = pd.read_parquet('../raw/data_model/tags_train05.parquet')  # Load the tags data

In [3]:
# Right join on movieId: every row of df_tag is kept, even when no rating matches it
df_ = pd.merge(df_rating, df_tag, on='movieId', how='right')

In [4]:
# Inspect column dtypes and non-null counts of the merged frame
df_.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46791 entries, 0 to 46790
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   userId     46782 non-null  float64
 1   movieId    46791 non-null  int64  
 2   rating     46782 non-null  float64
 3   timestamp  46782 non-null  float64
 4   title      46782 non-null  object 
 5   genres     46782 non-null  object 
 6   tagId      46791 non-null  int64  
 7   relevance  46791 non-null  float64
 8   tag        46791 non-null  object 
dtypes: float64(4), int64(2), object(3)
memory usage: 3.2+ MB


In [5]:
df_.isna().sum()  # number of null values per column

userId       9
movieId      0
rating       9
timestamp    9
title        9
genres       9
tagId        0
relevance    0
tag          0
dtype: int64

In [6]:
# Null values in df_ are handled by dropping the affected rows
def imputer_(df_):
    """Return a new DataFrame without the rows that contain any null value.

    Despite the name, nothing is imputed: incomplete rows are discarded.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame to clean. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the rows of ``df_`` with no missing values;
        the original index labels of the surviving rows are kept.
    """
    # dropna() already returns a new object, so no explicit copy is needed.
    return df_.dropna()

In [7]:
df_=imputer_(df_)  # Drop the rows that contain null values

In [8]:
# Remove the timestamp, tag and relevance columns from the dataset (tagId is kept)
def drop_variables(df_):
    """Return a copy of ``df_`` without the 'timestamp', 'tag' and 'relevance' columns.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame that contains the three columns to remove. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame with every other column in its original order.

    Raises
    ------
    KeyError
        If any of the three columns is missing from ``df_``.
    """
    unwanted_columns = ['timestamp', 'tag', 'relevance']
    return df_.drop(columns=unwanted_columns)

In [9]:
df_=drop_variables(df_)  # Remove the timestamp, tag and relevance columns

In [10]:
# Checking for repeated titles in df_ (we do not want the same title more than once)
df_.nunique() # Number of distinct values in each column

userId     31893
movieId      511
rating        10
title        511
genres       174
tagId        271
dtype: int64

In [11]:
# Transform the variables of the df_ dataset
# title: pull the release year of each movie out into a new variable
def launchYear_title_df_(df_):
    """Split the 'title' column into a clean title and a 'launch_year' column.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a string 'title' column. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` where:

        * 'launch_year' (int) holds the first four-digit number found between
          parentheses in the title, or 0 when the title has none;
        * 'title' keeps only the text that precedes the first ' (' sequence,
          i.e. the year and any other parenthesised suffix are removed.
    """
    raw_titles = df_['title']

    # First "(dddd)" group of each title; titles without one yield NaN.
    year_text = raw_titles.str.extract(r'\((\d{4})\)', expand=False)
    # Missing years become the string '0' so the whole column can be cast to int.
    launch_years = year_text.fillna('0').astype(int)

    # Everything from the first ' (' onwards is discarded.
    bare_titles = raw_titles.map(lambda text: text.split(' (')[0])

    return df_.assign(launch_year=launch_years, title=bare_titles)

In [12]:
df_=launchYear_title_df_(df_)  # Extract the year from the title

In [13]:
list_user_movie=df_[['userId','movieId','rating','title']]  # Keep the original values (to be used when checking the recommender)

In [14]:
# Transform the variables of the df_ dataset
# Turn the '|'-separated genres string of each row into a list of genres
def splitGenre_df_(df_):
    """Replace the 'genres' column with a 'genre_split' column of genre lists.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a string 'genres' column whose values are separated by '|'.
        It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        New frame without 'genres' and with 'genre_split', where each cell is
        the list obtained by splitting the original string on '|' (all genres
        are kept, not only the first one).
    """
    genre_lists = df_['genres'].map(lambda joined: joined.split('|'))
    without_genres = df_.drop(columns='genres')
    return without_genres.assign(genre_split=genre_lists)

In [15]:
df_=splitGenre_df_(df_)  # Split the genres string into a list of genres

In [16]:
# Identifier and title columns are not used as features from here on
df_ = df_.drop(columns=['movieId', 'userId', 'title'])

In [17]:
# Preview the first rows after dropping the identifier and title columns
df_.head()

Unnamed: 0,rating,tagId,launch_year,genre_split
0,1.0,522,1982,[Horror]
1,4.0,522,1982,[Horror]
2,3.0,522,1982,[Horror]
3,5.0,522,1982,[Horror]
4,3.0,522,1982,[Horror]


In [18]:
# Encode the categorical genre lists as binary indicator columns
def encoder_genre(df_):
    """Expand 'genre_split' into one 0/1 indicator column per genre label.

    A new ``MultiLabelBinarizer`` is fitted on the 'genre_split' column of the
    frame that is passed in, so the resulting columns depend on the labels
    present in that frame.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a 'genre_split' column holding an iterable of labels per
        row. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df_`` except 'genre_split', followed by one column
        per class learned by the binarizer, aligned on the index of ``df_``.
    """
    binarizer = MultiLabelBinarizer()
    indicator_matrix = binarizer.fit_transform(df_['genre_split'])

    # Reuse the input index so the concat below lines rows up label by label.
    genre_flags = pd.DataFrame(
        indicator_matrix,
        columns=binarizer.classes_,
        index=df_.index,
    )
    remaining = df_.drop(columns='genre_split')

    return pd.concat([remaining, genre_flags], axis=1)

In [19]:
df_=encoder_genre(df_)  # Encode the categorical genres

In [20]:
# Display the frame after the genre encoding
df_

Unnamed: 0,rating,tagId,launch_year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1.0,522,1982,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,4.0,522,1982,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,3.0,522,1982,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,5.0,522,1982,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,3.0,522,1982,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46786,2.0,3,2007,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
46787,5.0,3,2007,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
46788,5.0,3,2007,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
46789,3.5,3,2007,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [21]:
# Rescale the numeric values to the 0-1 range
def normalizer(df_):
    """Min-max scale the 'launch_year' and 'rating' columns.

    A new ``MinMaxScaler`` is fitted on the frame that is passed in, so the
    scaling bounds come from that frame's own minimum and maximum values.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with numeric 'launch_year' and 'rating' columns. It is left
        unmodified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` whose 'launch_year' and 'rating' columns hold the
        scaled values; every other column is unchanged.
    """
    numeric_columns = ['launch_year', 'rating']

    scaled_frame = df_.copy()
    scaler = MinMaxScaler()
    scaled_frame[numeric_columns] = scaler.fit_transform(scaled_frame[numeric_columns])

    return scaled_frame

In [22]:
df_=normalizer(df_)  # Normalize the numeric columns

In [23]:
# Display the frame after normalization
df_

Unnamed: 0,rating,tagId,launch_year,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.111111,522,0.652632,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0.777778,522,0.652632,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,0.555556,522,0.652632,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,1.000000,522,0.652632,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0.555556,522,0.652632,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46786,0.333333,3,0.915789,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
46787,1.000000,3,0.915789,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
46788,1.000000,3,0.915789,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
46789,0.666667,3,0.915789,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
