In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
df_ = pd.read_parquet('../raw/data_model/ratings_train.parquet')  # Load the ratings training data

In [3]:
df_.info()  # Summary of columns, dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16000175 entries, 0 to 16000174
Data columns (total 6 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     float64
 1   movieId    int64  
 2   rating     float64
 3   timestamp  float64
 4   title      object 
 5   genres     object 
dtypes: float64(3), int64(1), object(2)
memory usage: 732.4+ MB


In [4]:
df_.isna().sum()  # Count of missing values per column

userId       415
movieId        0
rating       415
timestamp    415
title          0
genres         0
dtype: int64

In [3]:
# Los valores nulos de df_ los eliminaremos
def imputer_(df_):
    """Return a new DataFrame without the rows that contain missing values.

    The caller's frame is left untouched: ``dropna`` builds a separate object,
    so no explicit copy is needed beforehand.
    """
    return df_.dropna()

In [4]:
df_=imputer_(df_)  # Drop the rows that contain missing values

In [5]:
# Eliminamos variable timestamp del dataset
def drop_timestamp(df_):
    """Return a copy of ``df_`` without the ``timestamp`` column.

    The input frame is not modified. A ``KeyError`` is raised if the column
    is absent.
    """
    return df_.drop(columns='timestamp')

In [6]:
df_=drop_timestamp(df_)  # Remove the timestamp column

In [7]:
# Ratings data: reviewing duplicated titles (the same title should not appear for more than one movie)
df_.nunique() # Count the distinct values in each column

userId     138493
movieId     25840
rating         10
title       25840
genres       1317
dtype: int64

In [7]:
# Transformamos las variables del dataset df_
# title, Extraemos el año de publicacion de cada pelicula en una variable nueva
def launchYear_title_df_(df_, missing_year=0):
    """Split the release year out of ``title`` into a new ``launch_year`` column.

    The first four-digit number wrapped in parentheses (e.g. ``(1995)``) is
    taken as the release year. The title is then cut at the first ``' ('``,
    which removes the year and any other parenthesised suffix.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame with a string ``title`` column. It is not modified.
    missing_year : int, default 0
        Value stored in ``launch_year`` when the title has no ``(YYYY)``
        pattern. The default keeps the historical behaviour. Be aware that a
        ``0`` sentinel becomes the minimum of the column, so a later min-max
        scaling squeezes every real year into a very narrow band close to 1;
        pass a realistic year (for instance the median) to avoid that.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_`` with the cleaned ``title`` and an integer
        ``launch_year`` column.
    """
    df_ratings_copy = df_.copy()

    # Year as text (NaN when the title has no "(YYYY)" group).
    extracted_years = df_ratings_copy['title'].str.extract(r'\((\d{4})\)', expand=False)
    df_ratings_copy['launch_year'] = extracted_years.fillna(str(int(missing_year))).astype(int)

    # Keep only the text before the first " (": drops the year and any
    # parenthesised alternative title.
    df_ratings_copy['title'] = df_ratings_copy['title'].map(
        lambda raw_title: raw_title.split(' (', 1)[0]
    )
    return df_ratings_copy

In [8]:
df_=launchYear_title_df_(df_) # Extract the release year from the title

In [9]:
# Transformamos las variables del dataset df_
# Convertimos userId a entero
def int_userId_df_(df_):
    """Return a copy of ``df_`` whose ``userId`` column is cast to ``int64``.

    Values are first parsed with ``pd.to_numeric(errors='coerce')``, so
    anything unparseable becomes NaN; the subsequent integer cast then fails
    on NaN, which means missing ids must be removed before calling this.
    """
    numeric_ids = pd.to_numeric(df_['userId'], errors='coerce')
    return df_.assign(userId=numeric_ids.astype('int64'))

In [10]:
df_=int_userId_df_(df_)# Convert userId to integer

In [11]:
# Transformamos las variables del dataset df_
# Extraemos la longitud de cada title y lo añadimos como nueva variable
def lenTitle_df_(df_):
    """Return a copy of ``df_`` with a ``len_title`` column appended.

    ``len_title`` holds the number of characters of each ``title`` value.
    """
    title_lengths = df_['title'].map(len)
    return df_.assign(len_title=title_lengths)

In [12]:
df_=lenTitle_df_(df_) # Add the title length as a feature

In [13]:
# Transformamos las variables del dataset df_
# Extraemos  el genero principal de cada movieId (se considera principal al primero que aparece en genres antes del '|')
def mainGenre_df_(df_):
    """Return a copy of ``df_`` with a ``main_genre`` column appended.

    The main genre is the text of ``genres`` that precedes the first ``'|'``
    (the whole string when there is no separator).
    """
    first_genre = df_['genres'].map(lambda genre_list: genre_list.partition('|')[0])
    return df_.assign(main_genre=first_genre)

In [14]:
df_=mainGenre_df_(df_)# Extract the main genre

In [15]:
# eliminamos genres
def drop_genres(df_):
    """Return a copy of ``df_`` without the ``genres`` column.

    The input frame is not modified. A ``KeyError`` is raised if the column
    is absent.
    """
    return df_.drop(columns='genres')

In [16]:
df_=drop_genres(df_)  # Remove the genres column

In [17]:
def title_save(df_):
    """Return the ``movieId`` and ``title`` columns as a dict of Series.

    Each Series is a copy, so later changes to ``df_`` do not affect the
    returned values. Keys are ``'movieId'`` and ``'title'``, in that order.
    """
    kept_columns = ('movieId', 'title')
    return {column: df_[column].copy() for column in kept_columns}

In [18]:
title=title_save(df_)  # Keep the movieId and title columns for later reference

In [19]:
# Eliminamos variable title del dataset
def drop_title(df_):
    """Return a copy of ``df_`` without the ``title`` column.

    The input frame is not modified. A ``KeyError`` is raised if the column
    is absent.
    """
    return df_.drop(columns='title')

In [20]:
df_=drop_title(df_)  # Remove the title column

In [21]:
# Drop the userId column from the dataset
def drop_userId(df_):
    """Return a copy of ``df_`` without the ``userId`` column.

    The input frame is not modified. A ``KeyError`` is raised if the column
    is absent.
    """
    return df_.drop(columns='userId')

In [22]:
df_=drop_userId(df_)  # Remove the userId column

In [23]:
# Codificacion de categoricos a OneHotEncoder
def encoder_main_genre(df_):
    """One-hot encode ``main_genre`` and return the widened DataFrame.

    A fresh ``OneHotEncoder`` is fitted on the ``main_genre`` column; with
    ``drop='first'`` the indicator of the first category is omitted. The
    resulting indicator columns are appended to the right of the remaining
    columns and ``main_genre`` itself is removed.

    The index of the input is discarded (reset to ``0..n-1``) so that rows
    line up positionally with the encoded matrix. The input is not modified.
    """
    genre_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first')

    # Dense indicator matrix wrapped in a frame with the encoder's own names.
    genre_dummies = pd.DataFrame(
        genre_encoder.fit_transform(df_[['main_genre']]),
        columns=genre_encoder.get_feature_names_out(),
    )

    # Everything except the raw categorical column, re-indexed positionally.
    other_columns = df_.drop(columns='main_genre').reset_index(drop=True)

    return pd.concat([other_columns, genre_dummies], axis=1)

In [24]:
df_=encoder_main_genre(df_)# One-hot encode the categorical main_genre column

In [25]:
df_ = df_.set_index('movieId')# Set movieId as the index of the DataFrame

In [27]:
# Normalizamos valores numericos a rangos entre 0-1
def normalizer(df_):
    """Return a copy of ``df_`` with the numeric features min-max scaled.

    ``rating``, ``launch_year`` and ``len_title`` are rescaled with a
    ``MinMaxScaler`` fitted on this same data; the other columns and the
    index are left as they are. The input frame is not modified.
    """
    numeric_features = ['rating', 'launch_year', 'len_title']

    scaled_frame = df_.copy()
    scaled_values = MinMaxScaler().fit_transform(scaled_frame[numeric_features])
    scaled_frame[numeric_features] = scaled_values
    return scaled_frame

In [28]:
df_=normalizer(df_)# Scale the numeric columns

In [29]:
df_  # Display the resulting DataFrame

Unnamed: 0_level_0,rating,launch_year,len_title,main_genre_Action,main_genre_Adventure,main_genre_Animation,main_genre_Children,main_genre_Comedy,main_genre_Crime,main_genre_Documentary,...,main_genre_Film-Noir,main_genre_Horror,main_genre_IMAX,main_genre_Musical,main_genre_Mystery,main_genre_Romance,main_genre_Sci-Fi,main_genre_Thriller,main_genre_War,main_genre_Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3637,0.777778,0.985112,0.076087,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
595,0.777778,0.988089,0.206522,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4886,0.777778,0.993052,0.141304,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7150,0.000000,0.994045,0.119565,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4874,0.555556,0.993052,0.043478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059,0.555556,0.991563,0.163043,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4306,0.777778,0.993052,0.043478,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
329,0.555556,0.989578,0.228261,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3527,1.000000,0.986104,0.076087,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
