# Lecture 1. Movies Dataset

In [5]:
import pandas as pd


In [6]:
# Load the four CSV tables from the local data/ directory (paths are
# relative to the notebook's working directory).
# links: external ids per movie (movieId, imdbId, tmdbId).
links = pd.read_csv('data/links.csv')
# tags: free-text tags applied by users (userId, movieId, tag, timestamp).
tags = pd.read_csv('data/tags.csv')
# movies: one row per movie (movieId, title, genres).
movies = pd.read_csv('data/movies.csv')
# ratings: one row per user rating (userId, movieId, rating, timestamp).
ratings = pd.read_csv('data/ratings.csv')

## Quick Look

### Links

In [11]:
# Preview the first rows of the links table (head() defaults to 5 rows).
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [13]:
# Summary statistics per numeric column; a `count` lower than the other
# columns' counts indicates missing values in that column.
links.describe()

Unnamed: 0,movieId,imdbId,tmdbId
count,9742.0,9742.0,9734.0
mean,42200.353623,677183.9,55162.123793
std,52160.494854,1107228.0,93653.481487
min,1.0,417.0,2.0
25%,3248.25,95180.75,9665.5
50%,7300.0,167260.5,16529.0
75%,76232.0,805568.5,44205.75
max,193609.0,8391976.0,525662.0


In [15]:
# (number of rows, number of columns) of the links table.
links.shape

(9742, 3)

### Tags

In [8]:
# Preview the first rows of the tags table (head() defaults to 5 rows).
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [16]:
# Summary statistics for the numeric columns only; the free-text `tag`
# column is not summarized by the default describe().
tags.describe()

Unnamed: 0,userId,movieId,timestamp
count,3683.0,3683.0,3683.0
mean,431.149335,27252.013576,1320032000.0
std,158.472553,43490.558803,172102500.0
min,2.0,1.0,1137179000.0
25%,424.0,1262.5,1137521000.0
50%,474.0,4454.0,1269833000.0
75%,477.0,39263.0,1498457000.0
max,610.0,193565.0,1537099000.0


In [17]:
# (number of rows, number of columns) of the tags table.
tags.shape

(3683, 4)

### Movies

In [9]:
# Preview the first rows of the movies table; genres are stored as a single
# '|'-separated string per movie.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [18]:
# describe() summarizes only numeric columns by default, so the string
# columns (title, genres) do not appear in the result.
movies.describe()

Unnamed: 0,movieId
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [19]:
# (number of rows, number of columns) of the movies table.
movies.shape

(9742, 3)

### Ratings

In [10]:
# Preview the first rows of the ratings table (head() defaults to 5 rows).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [20]:
# Summary statistics per numeric column; the min/max of `rating` show the
# range of rating values present in the data.
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [21]:
# (number of rows, number of columns) of the ratings table.
ratings.shape

(100836, 4)

## Sorting movies by total rating

In [76]:
# Attach every individual rating to its movie. The join is a left join on
# movieId, so the result has one row per (movie, rating) pair and keeps the
# original movies row index, which therefore repeats for a movie with
# several ratings.
movies_with_ratings = movies.join(
    ratings.set_index('movieId'),
    on='movieId',
    how='left',
)

In [77]:
# Check the join result: each row pairs a movie with one user's rating.
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,964982700.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,847435000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1106636000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1510578000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1305696000.0


In [78]:
# Total (summed) rating per movie. Note this is a sum, not a mean, so it
# grows with the number of ratings a movie received as well as their values.
# Despite the `top_10_` name, the result holds one row per movie group,
# indexed by (movieId, title, genres).
top_10_movie_ratings = (
    movies_with_ratings
    .groupby(['movieId', 'title', 'genres'])[['rating']]
    .sum()
)

In [79]:
# Order movies from highest to lowest summed rating, then turn the
# (movieId, title, genres) index levels back into ordinary columns.
# NOTE: this cell rebinds its own input, so it is meant to run once after
# the aggregation cell; running it again makes reset_index() add an extra
# 'index' column.
top_10_movie_ratings = (
    top_10_movie_ratings
    .sort_values('rating', ascending=False)
    .reset_index()
)

In [80]:
# Show the ten movies with the highest summed rating. The row count matches
# the `top_10_` name; the frame itself still contains every movie.
top_10_movie_ratings.head(10)

Unnamed: 0,movieId,title,genres,rating
0,318,"Shawshank Redemption, The (1994)",Crime|Drama,1404.0
1,356,Forrest Gump (1994),Comedy|Drama|Romance|War,1370.0
2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,1288.5
3,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,1165.5
4,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,1161.0
5,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1062.0
6,110,Braveheart (1995),Action|Drama|War,955.5
7,2959,Fight Club (1999),Action|Crime|Drama|Thriller,931.5
8,527,Schindler's List (1993),Drama|War,929.5
9,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,892.5


In [81]:
# Re-display the tags table before aggregating it; for a frame this long
# the notebook repr is truncated to the first and last rows.
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [82]:
# Number of tag rows per movie: a one-column frame (`tag`) indexed by
# movieId. Only movies that appear in the tags table get a row.
tags_grouped = tags.groupby('movieId')[['tag']].count()

In [83]:
# Add each movie's tag count to the rating totals. This is a left join on
# movieId, so every movie is kept and movies without any tags get NaN in
# the `tag` column (which is why `tag` is displayed as a float).
top_10_movie_ratings_with_tags = top_10_movie_ratings.join(
    tags_grouped,
    on='movieId',
    how='left',
)

In [84]:
# The five most-tagged movies; sort_values() places NaN tag counts last, so
# untagged movies fall to the bottom.
top_10_movie_ratings_with_tags.sort_values(by='tag', ascending=False).head()

Unnamed: 0,movieId,title,genres,rating,tag
2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,1288.5,181.0
7,2959,Fight Club (1999),Action|Crime|Drama|Thriller,931.5,54.0
100,924,2001: A Space Odyssey (1968),Adventure|Drama|Sci-Fi,424.5,41.0
59,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,534.5,35.0
56,7361,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,545.0,34.0


In [85]:
# Sanity check: the left join should keep one row per movie and add exactly
# one column (`tag`).
top_10_movie_ratings_with_tags.shape

(9742, 5)

In [88]:
# Combined score: summed rating scaled by (number of tags + 1).
# The `+ 1` is there so a movie with zero tags keeps its rating total instead
# of being multiplied by 0. After the left join, untagged movies carry NaN
# (not 0) in `tag`, so the NaN must be replaced by 0 first; otherwise
# NaN + 1 stays NaN and `res` would be missing for every untagged movie.
top_10_movie_ratings_with_tags['res'] = (
    top_10_movie_ratings_with_tags['rating']
    * (top_10_movie_ratings_with_tags['tag'].fillna(0) + 1)
)

In [89]:
# The five movies with the highest combined score (`res`).
top_10_movie_ratings_with_tags.sort_values(by='res', ascending=False).head()

Unnamed: 0,movieId,title,genres,rating,tag,res
2,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,1288.5,181.0,234507.0
7,2959,Fight Club (1999),Action|Crime|Drama|Thriller,931.5,54.0,51232.5
5,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,1062.0,26.0,28674.0
59,293,Léon: The Professional (a.k.a. The Professiona...,Action|Crime|Drama|Thriller,534.5,35.0,19242.0
56,7361,Eternal Sunshine of the Spotless Mind (2004),Drama|Romance|Sci-Fi,545.0,34.0,19075.0
