# Preparando Dados

In [1]:
import numpy as np
import pandas as pd

from itertools import chain
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
movies = pd.read_csv('./datasets/movies.csv', encoding='utf-8')
ratings = pd.read_csv('./datasets/ratings.csv', encoding='utf-8')
tags = pd.read_csv('./datasets/tags.csv', encoding='utf-8')

In [5]:
print(ratings.shape)
print(len(ratings["movieId"].unique()))
ratings.head()

(100836, 4)
9724


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
tags_grouped_by_movie = tags.groupby('movieId')['tag'].apply(list).reset_index()
tags_grouped_by_movie['tag'] = tags_grouped_by_movie['tag'].apply(lambda x: [item.lower() for item in x])
tags_grouped_by_movie['tag'] = tags_grouped_by_movie['tag'].apply(lambda x: list(set(x)))

In [7]:
tags_grouped_by_movie.head()

Unnamed: 0,movieId,tag
0,1,"[pixar, fun]"
1,2,"[robin williams, game, magic board game, fantasy]"
2,3,"[old, moldy]"
3,5,"[pregnancy, remake]"
4,7,[remake]


In [8]:
data = pd.merge(movies, ratings, on="movieId")
data = pd.merge(data, tags_grouped_by_movie, on="movieId", how='left')
data = data.drop(["title", "timestamp"], axis=1)

# Split genres into a list
data['genres'] = data['genres'].str.split('|')

data.head()

Unnamed: 0,movieId,genres,userId,rating,tag
0,1,"[Adventure, Animation, Children, Comedy, Fantasy]",1,4.0,"[pixar, fun]"
1,1,"[Adventure, Animation, Children, Comedy, Fantasy]",5,4.0,"[pixar, fun]"
2,1,"[Adventure, Animation, Children, Comedy, Fantasy]",7,4.5,"[pixar, fun]"
3,1,"[Adventure, Animation, Children, Comedy, Fantasy]",15,2.5,"[pixar, fun]"
4,1,"[Adventure, Animation, Children, Comedy, Fantasy]",17,4.5,"[pixar, fun]"


In [9]:
a = data[data["movieId"] == 73]
a.head()

Unnamed: 0,movieId,genres,userId,rating,tag
2926,73,"[Drama, War]",117,4.0,
2927,73,"[Drama, War]",169,4.0,
2928,73,"[Drama, War]",288,3.0,
2929,73,"[Drama, War]",304,4.0,
2930,73,"[Drama, War]",420,4.0,


## Separando Gêneros por Colunas

In [None]:
genres_df = data[['genres']]

# Create an instance of the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the 'genres' data using the encoder
genres_encoded = encoder.fit_transform(genres_df).toarray()

# Create a new DataFrame with the encoded genres
encoded_df = pd.DataFrame(genres_encoded, columns=encoder.get_feature_names_out(['genres']))

# Concatenate the original DataFrame with the encoded genres
data = pd.concat([data, encoded_df], axis=1)

# Remove the original 'genres' column
data.drop('genres', axis=1, inplace=True)

data.head()

In [None]:
genres = data['genres']

# Convert the lists of genres into separate binary columns
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(genres)

# Create a new DataFrame with the encoded genres
encoded_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)

# Concatenate the original DataFrame with the encoded genres
df_encoded = pd.concat([data, encoded_df], axis=1)

# Remove the original 'genres' column
df_encoded.drop('genres', axis=1, inplace=True)

df_encoded.head()

In [None]:
df_encoded['tag'] = df_encoded['tag'].fillna('')

tags = df_encoded['tag']

# Convert the lists of tags into separate binary columns
tags_encoded = mlb.fit_transform(tags)

# Create a new DataFrame with the encoded tags
encoded_df = pd.DataFrame(tags_encoded, columns=mlb.classes_)

# Concatenate the original DataFrame with the encoded tags
df_encoded = pd.concat([df_encoded, encoded_df], axis=1)

# Remove the original 'tag' column
df_encoded.drop('tag', axis=1, inplace=True)

df_encoded.head()

In [None]:
print(df_encoded.shape)