### IMPORTAÇÃO DOS DADOS

In [1]:
import pandas as pd 
from datetime import datetime

### CARREGAR OS DADOS

In [2]:
# carregar dados dos filmes
movies = pd.read_csv('../data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies.shape

(9742, 3)

In [4]:
# carregar dados das avaliações
ratings = pd.read_csv('../data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
ratings.shape

(100836, 4)

### AJUSTE DOS DADOS

In [6]:
movies['genres'] = movies.genres.apply(lambda x: x.split("|"))
# explode para separar os generos em cada linha
movies = movies.explode('genres')

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure
0,1,Toy Story (1995),Animation
0,1,Toy Story (1995),Children
0,1,Toy Story (1995),Comedy
0,1,Toy Story (1995),Fantasy


In [7]:
# editar o campo de data 
ratings['date'] = ratings.timestamp.apply(lambda x: datetime.fromtimestamp(x).strftime('%d-%m-%y'))
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,date
0,1,1,4.0,964982703,30-07-00
1,1,3,4.0,964981247,30-07-00
2,1,6,4.0,964982224,30-07-00
3,1,47,5.0,964983815,30-07-00
4,1,50,5.0,964982931,30-07-00


In [8]:
# juntar as duas tabelas
df = pd.merge(movies, ratings, on='movieId')
df.head(8)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,date
0,1,Toy Story (1995),Adventure,1,4.0,964982703,30-07-00
1,1,Toy Story (1995),Adventure,5,4.0,847434962,08-11-96
2,1,Toy Story (1995),Adventure,7,4.5,1106635946,25-01-05
3,1,Toy Story (1995),Adventure,15,2.5,1510577970,13-11-17
4,1,Toy Story (1995),Adventure,17,4.5,1305696483,18-05-11
5,1,Toy Story (1995),Adventure,18,3.5,1455209816,11-02-16
6,1,Toy Story (1995),Adventure,19,4.0,965705637,08-08-00
7,1,Toy Story (1995),Adventure,21,3.5,1407618878,09-08-14


In [9]:
# veirificar se existe valor nulo
df.isnull().sum()

movieId      0
title        0
genres       0
userId       0
rating       0
timestamp    0
date         0
dtype: int64

### ANÁLISES E PERGUNTAS

1) Quantos gêneros o filme "The Matrix" tem? Quais são os gêneros?

In [10]:
# pegar os generos unicos do filme The Matrix.
total_generos_matrix = movies[movies["title"].str.lower().str.contains("matrix")]['genres'].nunique()
# lista com os generos.
lista_generos = list(movies[movies["title"].str.lower().str.contains("matrix")]['genres'].unique())

print('Temos um total de {0} gêneros para o filme "The Matrix".'.format(total_generos_matrix))
print('São eles: {0}'.format(lista_generos))

Temos um total de 7 gêneros para o filme "The Matrix".
São eles: ['Action', 'Sci-Fi', 'Thriller', 'Adventure', 'IMAX', 'Animation', 'Drama']


2) Quantos gêneros cada filme tem, em média?

In [11]:
# total de generos por filme
movies.groupby('title')['genres'].nunique().reset_index()

Unnamed: 0,title,genres
0,'71 (2014),4
1,'Hellboy': The Seeds of Creation (2004),5
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),4
4,'Til There Was You (1997),2
...,...,...
9732,eXistenZ (1999),3
9733,xXx (2002),3
9734,xXx: State of the Union (2005),3
9735,¡Three Amigos! (1986),2


In [12]:
# média dos gêneros por filmes
round(movies.groupby('title')['genres'].nunique().reset_index()['genres'].mean())

2

3) Quais são os cinco filmes com maior número de gêneros?

In [13]:
# filmes com maior numero de gêneros
(movies
 .groupby('title')['genres']
 .nunique()
 .reset_index()
 .sort_values('genres', ascending=False)
 .reset_index(drop=True))[:5]

Unnamed: 0,title,genres
0,Rubber (2010),10
1,Patlabor: The Movie (Kidô keisatsu patorebâ: T...,8
2,Osmosis Jones (2001),7
3,Aelita: The Queen of Mars (Aelita) (1924),7
4,Who Framed Roger Rabbit? (1988),7


4) Quantos usuários deram nota para o filme "The Matrix"?

In [14]:
qtd_usuarios_rating = df[df["title"].str.lower().str.contains("matrix")]['userId'].nunique()

print('{0} usuários deram nota para o filme The Matrix.'.format(qtd_usuarios_rating))

288 usuários deram nota para o filme The Matrix.


5) Quantos usuários, em média, deram nota para cada filme?

In [15]:
# total de usuários que deram nota por filme
df.groupby('title')['userId'].nunique().reset_index()

Unnamed: 0,title,userId
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2
...,...,...
9714,eXistenZ (1999),22
9715,xXx (2002),24
9716,xXx: State of the Union (2005),5
9717,¡Three Amigos! (1986),26


In [16]:
round(df.groupby('title')['userId'].nunique().reset_index()['userId'].mean())

10

6) Quais são os cinco filmes para os quais mais usuários deram nota?

In [17]:
(df
 .groupby('title')['userId']
 .nunique()
 .reset_index()
 .sort_values('userId', ascending=False)
 .reset_index(drop=True))[:5]

Unnamed: 0,title,userId
0,Forrest Gump (1994),329
1,"Shawshank Redemption, The (1994)",317
2,Pulp Fiction (1994),307
3,"Silence of the Lambs, The (1991)",279
4,"Matrix, The (1999)",278


7) Qual o rating médio do filme "The Matrix"?

In [18]:
rating_mean_matrix = round(df[df["title"].str.lower().str.contains("matrix")]['rating'].mean(),2)

print('O rating médio do filme "The Matrix" é {0}.'.format(rating_mean_matrix))

O rating médio do filme "The Matrix" é 3.72.


8) Cada filme tem um rating médio. Qual a média desses ratings, por gênero?

In [19]:
# calcular media do rating para cada filme
ratings_movies_mean = df.groupby('movieId')['rating'].mean().reset_index().rename(columns={'rating':'rating_mean'})
ratings_movies_mean

Unnamed: 0,movieId,rating_mean
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


In [20]:
# juntar os dois dataframes
df = pd.merge(df,ratings_movies_mean,on='movieId')
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp,date,rating_mean
0,1,Toy Story (1995),Adventure,1,4.0,964982703,30-07-00,3.92093
1,1,Toy Story (1995),Adventure,5,4.0,847434962,08-11-96,3.92093
2,1,Toy Story (1995),Adventure,7,4.5,1106635946,25-01-05,3.92093
3,1,Toy Story (1995),Adventure,15,2.5,1510577970,13-11-17,3.92093
4,1,Toy Story (1995),Adventure,17,4.5,1305696483,18-05-11,3.92093


In [21]:
# media dos ratings por genero
df.groupby('genres')['rating_mean'].mean()

genres
(no genres listed)    3.489362
Action                3.447984
Adventure             3.508609
Animation             3.629937
Children              3.412956
Comedy                3.384721
Crime                 3.658294
Documentary           3.797785
Drama                 3.656184
Fantasy               3.491001
Film-Noir             3.920115
Horror                3.258195
IMAX                  3.618335
Musical               3.563678
Mystery               3.632460
Romance               3.506511
Sci-Fi                3.455721
Thriller              3.493706
War                   3.808294
Western               3.583938
Name: rating_mean, dtype: float64

9) Para cada gênero de filme, quantos usuários deram cada nota possível?

In [22]:
(df
 .groupby(['title','genres'])['userId']
 .nunique()
 .reset_index())

Unnamed: 0,title,genres,userId
0,'71 (2014),Action,1
1,'71 (2014),Drama,1
2,'71 (2014),Thriller,1
3,'71 (2014),War,1
4,'Hellboy': The Seeds of Creation (2004),Action,1
...,...,...,...
22031,xXx: State of the Union (2005),Thriller,5
22032,¡Three Amigos! (1986),Comedy,26
22033,¡Three Amigos! (1986),Western,26
22034,À nous la liberté (Freedom for Us) (1931),Comedy,1


In [23]:
print('FIM.')

FIM.
