# Imports

In [1]:
import pandas as pd
import numpy as np

from fetching_ratings import get_user_ratings
from app_functions import unzip_and_load_datasets, normalise_scores

## Loading Data

### My Ratings

In [2]:
# Fetch my own ratings through the project helper. The output below shows one row
# per rated title, with its IMDB id (`tconst`) and my score (`userRating`).
df_my_ratings = get_user_ratings()
df_my_ratings

Retrieving ratings from page 1/3
Content retrieved: 100

Retrieving ratings from page 2/3
Content retrieved: 200

Retrieving ratings from page 3/3
Content retrieved: 261



Unnamed: 0,tconst,userRating
0,tt16968450,4
1,tt1517268,5
2,tt10478048,7
3,tt14230458,7
4,tt17351924,6
...,...,...
256,tt7366338,8
257,tt2707408,9
258,tt0111161,10
259,tt0944947,9


### IMDB Data

In [3]:
# Load the IMDB datasets through the project helper. Later cells index the result
# positionally: datasets[0] is used as the titles table, datasets[1] as the ratings table.
datasets = unzip_and_load_datasets()

In [4]:
# Keep only NON-series titles: drop everything typed as a series, mini-series or episode.
SERIES_TITLE_TYPES = ['tvSeries', 'tvMiniSeries', 'tvEpisode']
all_titles = datasets[0]
is_series_title = all_titles['titleType'].isin(SERIES_TITLE_TYPES)
df_imdb_titles = all_titles.loc[~is_series_title]
num_non_series = len(df_imdb_titles)
print('# NON-SERIES:', num_non_series)

# Attach the IMDB ratings. merge() defaults to an inner join, so titles without
# a matching ratings row are dropped here.
df_imdb_ratings = datasets[1]
df_imdb_titles = df_imdb_titles.merge(df_imdb_ratings, on='tconst')
num_non_series_with_ratings = len(df_imdb_titles)
pct_with_ratings = round(100*num_non_series_with_ratings / num_non_series)
print('# NON-SERIES WITH RATINGS: {} ({}%)'.format(num_non_series_with_ratings, pct_with_ratings))

# Breakdown of the remaining (rated, non-series) titles by type.
df_imdb_titles['titleType'].value_counts()

# NON-SERIES: 2174181
# NON-SERIES WITH RATINGS: 595750 (27%)


movie        305312
short        155504
video         52440
tvMovie       52282
videoGame     15871
tvSpecial     12070
tvShort        2271
Name: titleType, dtype: int64

# Steps
- Rank films by score (combination of number of votes and average rating)
- Rank films by runtime

## Score


In [8]:
# Work on a copy so the merged IMDB frame itself is left untouched.
scored = df_imdb_titles.copy()

# Raw score = average rating weighted by vote count, then rescaled by the project helper.
scored['score'] = scored['averageRating'] * scored['numVotes']
scored = normalise_scores(scored, score_col='score')

# method='min' gives competition ranking: tied scores share the lowest rank and the
# next rank is skipped (1, 2, 2, 4, 5, 6, 6, 8 ...). Highest score gets rank 1.
score_ranks = scored['score'].rank(method='min', ascending=False)
scored['scoreRank'] = score_ranks.astype(int)

# Best-ranked titles first.
df_score = scored.sort_values('scoreRank')
df_score.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,score,scoreRank
76240,tt0111161,movie,The Shawshank Redemption,The Shawshank Redemption,0,1994,\N,142,Drama,9.3,2855595,9.65,1
44707,tt0068646,movie,The Godfather,The Godfather,0,1972,\N,175,"Crime,Drama",9.2,1989468,9.59,2
221173,tt0468569,movie,The Dark Knight,The Dark Knight,0,2008,\N,152,"Action,Crime,Drama",9.0,2836667,9.5,3
104883,tt0167260,movie,The Lord of the Rings: The Return of the King,The Lord of the Rings: The Return of the King,0,2003,\N,201,"Action,Adventure,Drama",9.0,1956052,9.49,4
74028,tt0108052,movie,Schindler's List,Schindler's List,0,1993,\N,195,"Biography,Drama,History",9.0,1434395,9.46,5


# Finished and Unwatched

In [7]:
# NOTE(review): `datetime` is not used in this cell — confirm whether a later cell
# relies on it before moving or removing this import.
from datetime import datetime

# Watch list = ranked titles whose id does not appear among my own ratings.
already_rated = df_score['tconst'].isin(df_my_ratings['tconst'])
df_watch = df_score.loc[~already_rated]

# Re-number rows 1..N so the index reads as a position in the watch list.
num_to_watch = len(df_watch)
df_watch.index = np.arange(1, 1+num_to_watch)

# Persist the full list, then preview the top of it.
df_watch.to_csv('df_nonseries_watch.csv')
df_watch.head(20)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,score,scoreRank
1,tt0078748,movie,Alien,Alien,0,1979,\N,117,"Horror,Sci-Fi",8.5,939284,9.01,48.0
2,tt0245429,movie,Spirited Away,Sen to Chihiro no kamikakushi,0,2001,\N,125,"Adventure,Animation,Family",8.6,835663,8.99,54.0
3,tt0253474,movie,The Pianist,The Pianist,0,2002,\N,150,"Biography,Drama,Music",8.5,900366,8.98,56.0
4,tt0317248,movie,City of God,Cidade de Deus,0,2002,\N,130,"Crime,Drama",8.6,793764,8.94,64.0
5,tt0118799,movie,Life Is Beautiful,La vita è bella,0,1997,\N,116,"Comedy,Drama,Romance",8.6,735923,8.88,72.0
6,tt0086250,movie,Scarface,Scarface,0,1983,\N,170,"Crime,Drama",8.3,905700,8.87,73.0
7,tt0066921,movie,A Clockwork Orange,A Clockwork Orange,0,1971,\N,136,"Crime,Sci-Fi",8.3,873847,8.84,81.0
8,tt0095016,movie,Die Hard,Die Hard,0,1988,\N,132,"Action,Thriller",8.2,937500,8.84,81.0
9,tt0075314,movie,Taxi Driver,Taxi Driver,0,1976,\N,114,"Crime,Drama",8.2,908784,8.82,85.0
10,tt0198781,movie,"Monsters, Inc.","Monsters, Inc.",0,2001,\N,92,"Adventure,Animation,Comedy",8.1,968317,8.8,87.0
