In [2]:
import pandas as pd

# Load the movie catalogue. header=0 tells read_csv that the file has its own
# header row, which is replaced by the Portuguese names given in `names`.
# The frame is then indexed by movie id.
filmes = pd.read_csv(
    "dados/movies.csv.bz2",
    header=0,
    names=["filmeId", "titulo", "generos"],
).set_index("filmeId")
filmes.head()

Unnamed: 0_level_0,titulo,generos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [3]:
# Load the ratings, replacing the file's header row with Portuguese column names.
notas = pd.read_csv(
    "dados/ratings.csv.bz2",
    header=0,
    names=["usuarioId", "filmeId", "nota", "momento"],
)
notas.head()

Unnamed: 0,usuarioId,filmeId,nota,momento
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of the ratings table.
notas.describe()

Unnamed: 0,usuarioId,filmeId,nota,momento
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


# Primeira tentativa de recomendação: heurística de total de votos

In [5]:
# Count how many ratings each movie received. value_counts() returns a Series
# indexed by filmeId, sorted from the most-rated movie down.
ids_avaliados = notas.loc[:, "filmeId"]
total_de_votos = ids_avaliados.value_counts()
total_de_votos.head()

356     329
318     317
296     307
593     279
2571    278
Name: filmeId, dtype: int64

In [6]:
# Attach the vote count to every movie. The counts are aligned on the filmeId
# index; a movie with no entry in `total_de_votos` gets NaN, so the column is
# stored as float.
filmes["total_de_votos"] = total_de_votos.reindex(filmes.index)
filmes.head()

Unnamed: 0_level_0,titulo,generos,total_de_votos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,215.0
2,Jumanji (1995),Adventure|Children|Fantasy,110.0
3,Grumpier Old Men (1995),Comedy|Romance,52.0
4,Waiting to Exhale (1995),Comedy|Drama|Romance,7.0
5,Father of the Bride Part II (1995),Comedy,49.0


In [7]:
# The 10 most "popular" movies, i.e. the ones with the highest vote count.
mais_votados = filmes.sort_values(by="total_de_votos", ascending=False)
mais_votados.head(10)

Unnamed: 0_level_0,titulo,generos,total_de_votos
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0
110,Braveheart (1995),Action|Drama|War,237.0
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0
527,Schindler's List (1993),Drama|War,220.0


In [9]:
# Average rating per movie, as a Series indexed by filmeId.
# The "nota" column is selected *before* aggregating so that pandas does not
# compute (and then throw away) the mean of usuarioId and momento as well.
notas_medias = notas.groupby("filmeId")["nota"].mean()
notas_medias.head()

filmeId
1    3.920930
2    3.431818
3    3.259615
4    2.357143
5    3.071429
Name: nota, dtype: float64

In [10]:
# Add each movie's average rating (aligned on the filmeId index), then show the
# ten most-voted movies again, now with their average next to the vote count.
filmes["nota_media"] = notas_medias.reindex(filmes.index)
ordenados_por_votos = filmes.sort_values(by="total_de_votos", ascending=False)
ordenados_por_votos.head(10)

Unnamed: 0_level_0,titulo,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134
318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022
296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068
593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.16129
2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446
260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076
480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,238.0,3.75
110,Braveheart (1995),Action|Drama|War,237.0,4.031646
589,Terminator 2: Judgment Day (1991),Action|Sci-Fi,224.0,3.970982
527,Schindler's List (1993),Drama|War,220.0,4.225


# Uma segunda heurística: nota média, filtrando por um mínimo de votos

In [24]:
# Rank by average rating alone, with no minimum number of votes.
# In the saved output every movie in this top 10 has a single 5.0 vote.
melhores_por_nota = filmes.sort_values(by="nota_media", ascending=False)
melhores_por_nota.head(10)

Unnamed: 0_level_0,titulo,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
76171,India (Indien) (1993),Comedy|Drama,1.0,5.0
115943,The Land Before Time V: The Mysterious Island ...,Adventure|Animation|Children,1.0,5.0
117368,The Madagascar Penguins in a Christmas Caper (...,Animation|Comedy,1.0,5.0
117310,Aashiqui 2 (2013),Drama|Musical|Romance,1.0,5.0
158486,Side Effects (2005),Comedy|Drama|Romance,1.0,5.0
50942,"Wake Up, Ron Burgundy (2004)",Comedy,1.0,5.0
27235,"Shrink Is In, The (2001)",Comedy|Romance,1.0,5.0
158878,The Letter Writer (2011),Children,1.0,5.0
116493,The Returned (2013),Drama|Horror|Thriller,1.0,5.0
159161,Ali Wong: Baby Cobra (2016),(no genres listed),1.0,5.0


In [25]:
# Keep only movies with at least 10 votes, then rank them by average rating.
# NOTE(review): the saved output of this cell lists movie 318 with 1739 votes,
# while earlier cells report 317 for the same movie -- it looks like it was
# produced from a different ratings file. Re-run the notebook top to bottom.
pelo_menos_10_votos = filmes.query("total_de_votos >= 10")
pelo_menos_10_votos.sort_values(by="nota_media", ascending=False).head(10)

Unnamed: 0_level_0,titulo,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
69849,Roots (1977),Drama|War,10.0,4.45
174053,Black Mirror: White Christmas (2014),Drama|Horror|Mystery|Sci-Fi|Thriller,26.0,4.423077
7396,Scenes From a Marriage (Scener ur ett äktenska...,Drama,12.0,4.416667
318,"Shawshank Redemption, The (1994)",Crime|Drama,1739.0,4.410293
6650,Kind Hearts and Coronets (1949),Comedy|Drama,18.0,4.361111
6852,In Cold Blood (1967),Crime|Drama,14.0,4.357143
163134,Your Name. (2016),Animation|Drama|Fantasy|Romance,24.0,4.354167
55908,"Man from Earth, The (2007)",Drama|Sci-Fi,58.0,4.353448
5604,"Man in the White Suit, The (1951)",Comedy|Sci-Fi,10.0,4.35
7234,"Strada, La (1954)",Drama,25.0,4.32


In [27]:
# Movies with at least 50 votes, ranked by average rating.
# Despite the "mais de 50" (more than 50) in the variable name, the filter is
# inclusive: a movie with exactly 50 votes is kept.
filmes_com_mais_de_50_votos = filmes.query("total_de_votos >= 50")
(
    filmes_com_mais_de_50_votos
    .sort_values(by="nota_media", ascending=False)
    .head(10)
)

Unnamed: 0_level_0,titulo,generos,total_de_votos,nota_media
filmeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
318,"Shawshank Redemption, The (1994)",Crime|Drama,1739.0,4.410293
55908,"Man from Earth, The (2007)",Drama|Sci-Fi,58.0,4.353448
858,"Godfather, The (1972)",Crime|Drama,1053.0,4.307217
50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,1123.0,4.303206
678,Some Folks Call It a Sling Blade (1993),Drama|Thriller,51.0,4.294118
3359,Breaking Away (1979),Comedy|Drama,64.0,4.289062
926,All About Eve (1950),Drama,99.0,4.277778
527,Schindler's List (1993),Drama|War,1299.0,4.265204
1280,Raise the Red Lantern (Da hong deng long gao g...,Drama,57.0,4.263158
3435,Double Indemnity (1944),Crime|Drama|Film-Noir,99.0,4.252525


In [11]:
# Distribution of the number of votes per movie (count, mean, std, quartiles).
total_de_votos.describe()

count    9724.000000
mean       10.369807
std        22.401005
min         1.000000
25%         1.000000
50%         3.000000
75%         9.000000
max       329.000000
Name: filmeId, dtype: float64