In [1]:
#importing the relevant python libraries
import pandas as pd
import numpy as np

In [2]:
# Read the ratings table from ratings.csv and preview its first ten rows
ratings_df = pd.read_csv(filepath_or_buffer="ratings.csv")
ratings_df.head(n=10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [3]:
# Shrink the timestamp values by a factor of one million so they are easier
# to work with in the later cells.
# A larger timestamp means a more recent rating, so the values act as a rough
# indication of how trending a movie is.
# NOTE: the column is overwritten in place, so re-running this cell without
# reloading ratings_df divides the values again.
ratings_df['timestamp'] = ratings_df['timestamp'].div(1000000)
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964.982703
1,1,3,4.0,964.981247
2,1,6,4.0,964.982224
3,1,47,5.0,964.983815
4,1,50,5.0,964.982931
...,...,...,...,...
100831,610,166534,4.0,1493.848402
100832,610,168248,5.0,1493.850091
100833,610,168250,5.0,1494.273047
100834,610,168252,5.0,1493.846352


In [4]:
# Average rating of every movie, stored in its own dataframe
# (columns: movieId, mean_rating).
# The mean rating is used as the measure of how likeable a movie is.
# The 'rating' column is selected before aggregating so the remaining columns
# are not averaged only to be discarded afterwards.
ratings_avg = (
    ratings_df
    .groupby('movieId')['rating']
    .mean()
    .reset_index(name='mean_rating')
)
ratings_avg

Unnamed: 0,movieId,mean_rating
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


In [5]:
# Number of ratings received by every movie, stored in its own dataframe
# (columns: movieId, rating_count).
# The rating count is used as the measure of how popular a movie is.
# Only the 'userId' column is counted; counting every column and then picking
# one of the results would be wasted work.
ratings_count = (
    ratings_df
    .groupby('movieId')['userId']
    .count()
    .reset_index(name='rating_count')
)
ratings_count

Unnamed: 0,movieId,rating_count
0,1,215
1,2,110
2,3,52
3,4,7
4,5,49
...,...,...
9719,193581,1
9720,193583,1
9721,193585,1
9722,193587,1


In [6]:
# Sum of the (scaled) timestamp values of every movie, stored in its own
# dataframe (columns: movieId, trending_value).
# The timestamp sum is used as the measure of how much a movie is trending.
# Only the 'timestamp' column is summed; summing every column and then picking
# one of the results would be wasted work.
trending_count = (
    ratings_df
    .groupby('movieId')['timestamp']
    .sum()
    .reset_index(name='trending_value')
)
trending_count

Unnamed: 0,movieId,trending_value
0,1,242914.455479
1,2,124938.583322
2,3,52265.734386
3,4,6290.052048
4,5,48640.552594
...,...,...
9719,193581,1537.109082
9720,193583,1537.109545
9721,193585,1537.109805
9722,193587,1537.110021


In [7]:
# Attach each movie's mean rating to every individual rating row,
# matching the two dataframes on movieId
movie_popularity = pd.merge(
    left=ratings_df,
    right=ratings_avg,
    on='movieId',
)
movie_popularity

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating
0,1,1,4.0,964.982703,3.92093
1,5,1,4.0,847.434962,3.92093
2,7,1,4.5,1106.635946,3.92093
3,15,1,2.5,1510.577970,3.92093
4,17,1,4.5,1305.696483,3.92093
...,...,...,...,...,...
100831,610,160341,2.5,1479.545749,2.50000
100832,610,160527,4.5,1479.544998,4.50000
100833,610,160836,3.0,1493.844794,3.00000
100834,610,163937,3.5,1493.848789,3.50000


In [8]:
# Extend the popularity dataframe with each movie's rating count,
# again matching on movieId
movie_popularity = pd.merge(
    left=movie_popularity,
    right=ratings_count,
    on='movieId',
)
movie_popularity

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,rating_count
0,1,1,4.0,964.982703,3.92093,215
1,5,1,4.0,847.434962,3.92093,215
2,7,1,4.5,1106.635946,3.92093,215
3,15,1,2.5,1510.577970,3.92093,215
4,17,1,4.5,1305.696483,3.92093,215
...,...,...,...,...,...,...
100831,610,160341,2.5,1479.545749,2.50000,1
100832,610,160527,4.5,1479.544998,4.50000,1
100833,610,160836,3.0,1493.844794,3.00000,1
100834,610,163937,3.5,1493.848789,3.50000,1


In [9]:
# Extend the popularity dataframe with each movie's timestamp sum
# (trending_value), again matching on movieId
movie_popularity = pd.merge(
    left=movie_popularity,
    right=trending_count,
    on='movieId',
)
movie_popularity

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,rating_count,trending_value
0,1,1,4.0,964.982703,3.92093,215,242914.455479
1,5,1,4.0,847.434962,3.92093,215,242914.455479
2,7,1,4.5,1106.635946,3.92093,215,242914.455479
3,15,1,2.5,1510.577970,3.92093,215,242914.455479
4,17,1,4.5,1305.696483,3.92093,215,242914.455479
...,...,...,...,...,...,...,...
100831,610,160341,2.5,1479.545749,2.50000,1,1479.545749
100832,610,160527,4.5,1479.544998,4.50000,1,1479.544998
100833,610,160836,3.0,1493.844794,3.00000,1,1493.844794
100834,610,163937,3.5,1493.848789,3.50000,1,1493.848789


In [10]:
# Remove the userId column from the popularity dataframe
movie_popularity = movie_popularity.drop(columns='userId')

In [11]:
# Remove the individual rating column from the popularity dataframe
movie_popularity = movie_popularity.drop(columns='rating')

In [12]:
# Remove the per-rating timestamp column from the popularity dataframe
movie_popularity = movie_popularity.drop(columns='timestamp')

In [13]:
# Keep only the first row of every movieId so that each movie appears once,
# then order the remaining rows by their index labels
movie_popularity = (
    movie_popularity
    .drop_duplicates(subset='movieId', keep='first')
    .sort_index()
)
movie_popularity

Unnamed: 0,movieId,mean_rating,rating_count,trending_value
0,1,3.920930,215,242914.455479
215,3,3.259615,52,52265.734386
267,6,3.946078,102,107897.369568
369,47,3.975369,203,231438.354555
572,50,4.237745,204,237150.222745
...,...,...,...,...
100831,160341,2.500000,1,1479.545749
100832,160527,4.500000,1,1479.544998
100833,160836,3.000000,1,1493.844794
100834,163937,3.500000,1,1493.848789


In [14]:
# Rank the movies in descending order: first by rating count, then by the
# timestamp sum (trending_value), and finally by the mean rating
movie_popularity = movie_popularity.sort_values(
    by=['rating_count', 'trending_value', 'mean_rating'],
    ascending=False,
)
movie_popularity

Unnamed: 0,movieId,mean_rating,rating_count,trending_value
2426,356,4.164134,329,386165.236681
16296,318,4.429022,317,376924.839127
1819,296,4.197068,307,349204.311001
4310,593,4.161290,279,320035.674330
12642,2571,4.192446,278,350270.041779
...,...,...,...,...
100523,117,3.000000,1,832.080636
100524,220,3.000000,1,832.080636
93243,496,5.000000,1,829.760898
93233,178,1.000000,1,829.760898


In [15]:
# Read the movies table from movies.csv into a dataframe
movies_df = pd.read_csv(filepath_or_buffer='movies.csv')
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [16]:
# Join the ranked popularity dataframe with the movies dataframe on movieId
# so that movies can be identified by their names instead of their ids
top_movies = pd.merge(
    left=movie_popularity,
    right=movies_df,
    on='movieId',
)
top_movies

Unnamed: 0,movieId,mean_rating,rating_count,trending_value,title,genres
0,356,4.164134,329,386165.236681,Forrest Gump (1994),Comedy|Drama|Romance|War
1,318,4.429022,317,376924.839127,"Shawshank Redemption, The (1994)",Crime|Drama
2,296,4.197068,307,349204.311001,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,593,4.161290,279,320035.674330,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,2571,4.192446,278,350270.041779,"Matrix, The (1999)",Action|Sci-Fi|Thriller
...,...,...,...,...,...,...
9719,117,3.000000,1,832.080636,"Young Poisoner's Handbook, The (1995)",Crime|Drama
9720,220,3.000000,1,832.080636,Castle Freak (1995),Horror
9721,496,5.000000,1,829.760898,What Happened Was... (1994),Comedy|Drama|Romance|Thriller
9722,178,1.000000,1,829.760898,Love & Human Remains (1993),Comedy|Drama


In [17]:
# Remove the movieId, timestamp sum, mean rating and rating count columns
# from the merged table
top_movies = top_movies.drop(
    columns=['movieId', 'trending_value', 'mean_rating', 'rating_count'],
)
top_movies

Unnamed: 0,title,genres
0,Forrest Gump (1994),Comedy|Drama|Romance|War
1,"Shawshank Redemption, The (1994)",Crime|Drama
2,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,"Matrix, The (1999)",Action|Sci-Fi|Thriller
...,...,...
9719,"Young Poisoner's Handbook, The (1995)",Crime|Drama
9720,Castle Freak (1995),Horror
9721,What Happened Was... (1994),Comedy|Drama|Romance|Thriller
9722,Love & Human Remains (1993),Comedy|Drama


In [18]:
# Show the 20 most popular movies according to the current dataset
print("Top 20 Popular movies :-")
top_movies.head(n=20)

Top 20 Popular movies :-


Unnamed: 0,title,genres
0,Forrest Gump (1994),Comedy|Drama|Romance|War
1,"Shawshank Redemption, The (1994)",Crime|Drama
2,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
3,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller
4,"Matrix, The (1999)",Action|Sci-Fi|Thriller
5,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
6,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
7,Braveheart (1995),Action|Drama|War
8,Terminator 2: Judgment Day (1991),Action|Sci-Fi
9,Schindler's List (1993),Drama|War
