In [None]:
# Retrieving the IMDb API: 
!pip3 install imdbpy

# Documentation for IMDb API:
# https://buildmedia.readthedocs.org/media/pdf/imdbpy/latest/imdbpy.pdf



In [None]:
# Attach Google Drive to this runtime so the downloaded Netflix Viewing
# History file stored there can be read like a local file.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Load Netflix Viewing History data.
# Replace the placeholder with the location of your exported CSV in Google Drive.
NETFLIX_HISTORY_CSV = "path to your Netflix CSV file in Google Drive"
history_raw = pd.read_csv(NETFLIX_HISTORY_CSV)

# Processing
# Build `history` as a new, independent frame (note the final .copy()):
# later cells add columns to it, and doing that on a filtered slice of the
# raw frame triggers pandas' SettingWithCopyWarning.
history = (
  history_raw
  # Clean titles to drop Season and Episode specific details:
  # everything after the first ':' is discarded.
  .assign(Title=lambda frame: frame["Title"].str.split(':').str[0])
  # Drop any rows with null values for Title
  .dropna(subset=["Title"])
  .copy()
)

# Sanity check: show up to 10 random rows. The sample size is capped at the
# number of rows because sample() raises ValueError when asked for more rows
# than the frame contains.
display(history.sample(min(10, len(history))))

Unnamed: 0,Title,Date
768,Unbreakable Kimmy Schmidt,7/22/20
25,New Girl,1/4/22
44,New Girl,12/28/21
720,Designated Survivor,10/23/20
518,Young & Hungry,2/7/21
465,Lucifer,2/27/21
538,Young & Hungry,2/4/21
786,Unbreakable Kimmy Schmidt,7/19/20
815,Space Force,6/7/20
741,Unbreakable Kimmy Schmidt,7/29/20


In [None]:
from imdb import IMDb

# The data we would like to retrieve for each show. The columns are added to
# `history` in exactly this order, which matches the original implementation
# in case downstream code relies on column positions.
METADATA_COLUMNS = [
  "kind", "movieID", "genres", "languages",
  "runtime", "rating", "actors", "directors",
]


def _person_names(people):
  """Return the 'name' of every entry in `people`, or None when there are none."""
  if not people:
    return None
  return [person['name'] for person in people]


def fetch_show_metadata(ia, title):
  """Look up `title` on IMDb and return its metadata keyed by METADATA_COLUMNS.

  Parameters:
    ia: object exposing `search_movie(title)` and `get_movie(movie_id)`
      (here, the IMDb() instance).
    title: the show or movie title to search for.

  Returns:
    A dict with one entry per name in METADATA_COLUMNS, or None when the
    search returns no results. Fields that are missing or empty on the
    fetched record are stored as None.
  """
  search = ia.search_movie(title)
  if not search:
    return None
  # Pick most relevant title: the first search result.
  movie_id = search[0].movieID
  obj = ia.get_movie(movie_id)
  return {
    "kind": obj.get('kind') or None,            # movie or show
    "movieID": movie_id,
    "genres": obj.get('genres') or None,
    "languages": obj.get('languages') or None,
    "runtime": obj.get('runtime') or None,
    "rating": obj.get("rating") or None,
    "actors": _person_names(obj.get('cast')),
    "directors": _person_names(obj.get('directors')),
  }


# Create an instance of the IMDb class
ia = IMDb()

# Query IMDb once per distinct title instead of once per viewing-history row:
# rows that share a cleaned title would otherwise repeat the same two lookups.
metadata_by_title = {
  title: fetch_show_metadata(ia, title) for title in history["Title"].unique()
}

# Keep only the rows whose title was found on IMDb. Rebinding `history` to a
# filtered copy avoids dropping rows from the frame while iterating over it.
found_on_imdb = history["Title"].map(
  lambda title: metadata_by_title[title] is not None
).astype(bool)
history = history[found_on_imdb].copy()

# Every remaining row has a metadata dict, so each column list is guaranteed
# to have exactly one value per row.
for column in METADATA_COLUMNS:
  history[column] = [metadata_by_title[title][column] for title in history["Title"]]

display(history)

Unnamed: 0,Title,Date,kind,movieID,genres,languages,runtime,rating,actors,directors
0,New Girl,1/11/22,,1826940,[Comedy],[English],[22],7.7,"[Zooey Deschanel, Jake Johnson, Max Greenfield...",
1,New Girl,1/11/22,tv series,1826940,[Comedy],[English],[22],7.7,"[Zooey Deschanel, Jake Johnson, Max Greenfield...",
2,New Girl,1/11/22,tv series,1826940,[Comedy],[English],[22],7.7,"[Zooey Deschanel, Jake Johnson, Max Greenfield...",
3,New Girl,1/10/22,tv series,1826940,[Comedy],[English],[22],7.7,"[Zooey Deschanel, Jake Johnson, Max Greenfield...",
4,New Girl,1/10/22,tv series,1826940,[Comedy],[English],[22],7.7,"[Zooey Deschanel, Jake Johnson, Max Greenfield...",
...,...,...,...,...,...,...,...,...,...,...
854,Varane Avashyamund,5/6/20,movie,11531530,"[Comedy, Drama, Romance]",[Malayalam],[145],6.9,"[Shobana, Suresh Gopi, Kalyani Priyadarshan, D...",[Anoop Sathyan]
855,Psycho,5/3/20,movie,0054215,"[Horror, Mystery, Thriller]",[English],[109],8.5,"[Anthony Perkins, Vera Miles, John Gavin, Jane...",[Alfred Hitchcock]
856,Game,5/3/20,movie,0119174,"[Drama, Mystery, Thriller]","[Spanish, Thai, English, Cantonese, German]",[129],7.7,"[Michael Douglas, Sean Penn, Deborah Kara Unge...",[David Fincher]
857,Superman Returns,5/2/20,movie,0348150,"[Action, Adventure, Sci-Fi]","[English, German, French]",[154],6.0,"[Brandon Routh, Kate Bosworth, Kevin Spacey, J...",[Bryan Singer]


<class 'pandas.core.frame.DataFrame'>
Int64Index: 852 entries, 0 to 858
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Title      852 non-null    object 
 1   Date       852 non-null    object 
 2   kind       851 non-null    object 
 3   movieID    852 non-null    object 
 4   genres     852 non-null    object 
 5   languages  844 non-null    object 
 6   runtime    808 non-null    object 
 7   rating     850 non-null    float64
 8   actors     851 non-null    object 
 9   directors  202 non-null    object 
dtypes: float64(1), object(9)
memory usage: 73.2+ KB
None


In [None]:
import datetime as dt

def first_runtime_minutes(runtimes):
  """Return the first entry of a runtime list as an int, or None when unavailable."""
  if isinstance(runtimes, (list, tuple)) and runtimes:
    return int(runtimes[0])
  return None


def top_values(frame, column, n=3):
  """Return the `n` most frequent values of a list-valued `column`, most frequent first.

  Rows are de-duplicated on "Title" first so each title counts once, then the
  lists in `column` are exploded to one value per row and counted. Fewer than
  `n` values are returned when fewer exist.
  """
  counts = (
    frame.drop_duplicates(subset=['Title'])
    .explode(column)
    .groupby(column)
    .size()
    .reset_index(name='counts')
    .sort_values("counts", ascending=False)
  )
  return counts[column].head(n).tolist()


def describe_ranking(names):
  """Join a most-frequent-first list as 'third, second and first'.

  The order is reversed so the sentence ends on the top entry. Works for any
  number of names; a single name is returned as is and an empty list gives ''.
  """
  ordered = [str(name) for name in reversed(names)]
  if len(ordered) <= 1:
    return "".join(ordered)
  return ", ".join(ordered[:-1]) + " and " + ordered[-1]


data = history.copy()

# Filter for dates in 2021
# NOTE(review): dates are parsed with pandas' default inference (sample rows
# look like M/D/YY) — confirm for exports that use another date format.
data["Date"] = pd.to_datetime(data['Date'])
# .copy() so the column assignment below does not write into a filtered slice.
data = data[data['Date'].dt.year == 2021].copy()

# Analyze runtime
# `runtime` holds a list (or None); keep its first entry as a number of minutes.
data["runtime"] = data["runtime"].apply(first_runtime_minutes)
total_watch_time = data['runtime'].sum()
# Convert from mins to hours
total_watch_time_hrs = total_watch_time/60
# Convert from hours to days
total_watch_time_days = round(total_watch_time_hrs/24, 2)
print("You watched a total of " + str(total_watch_time) + " minutes of Netflix in 2021")
print("That is " + str(total_watch_time_days) + " days of chilling!")
print("On average, you spent " + str(round(total_watch_time/365, 2)) + " minutes each day of the year")
print("\n")
# See which day you binged the most
daily_runtime = (
  data.groupby("Date", as_index=False)["runtime"].sum()
  .sort_values("runtime", ascending=False)
)
# Guard against an empty frame (no viewing history in 2021).
if not daily_runtime.empty:
  top_day = daily_runtime.iloc[0]
  print("The day you binged the most was", top_day["Date"].date())
  print("You watched " + str(top_day["runtime"]) + " minutes or " + str(round(top_day["runtime"]/60, 2)) + " hours of Netflix on that day")
  print("It's okay, we all have those days :-)")
  print("\n")

# Analyze kind
# The 2 main types are tv series and movie
movies = data.loc[data['kind'] == "movie"]
series = data.loc[data['kind'] == "tv series"]

# For movies:
movie_count = movies.shape[0]
movie_total_time = movies['runtime'].sum()
# De-duplicate on Title so a movie watched more than once is listed only once.
highest_rated_movies = (
  movies.drop_duplicates(subset=['Title'])
  .sort_values("rating", ascending=False)
)
# Analyze genres
top_movie_genres = top_values(movies, 'genres')
print("Are you a movie buff? We think so. You watched " + str(movie_count) + " movies for a total of " + str(round(movie_total_time/60, 2)) + " hours" )
if top_movie_genres:
  print("Your top movie genres are " + describe_ranking(top_movie_genres))
print("\n")
print("Here's the list of the top 5 critically acclaimed movies that you watched: ")
# Access fields by name rather than tuple position so the output does not
# depend on the column order of `history`.
for rank, movie in enumerate(highest_rated_movies.head(5).itertuples(), start=1):
  print(str(rank) + ". " + str(movie.Title) + " with an IMDb rating of " + str(movie.rating))
print("\n")
# Analyze actors
top_movie_actor = top_values(movies, 'actors')
if top_movie_actor:
  print("Your favorite movie actors are " + describe_ranking(top_movie_actor))
print("\n")

# For series:
# Most watched
series_unique = series.drop_duplicates(subset=['Title'])
series_count = series_unique.shape[0]
series_total_time = series['runtime'].sum()
print("You watched " + str(series_count) + " series for a total of " + str(round(series_total_time/60, 2)) + " hours" )
most_watched_series = (
  series.groupby("Title")["runtime"].sum()
  .reset_index()
  .sort_values("runtime", ascending=False)
)
if not most_watched_series.empty:
  # Label-based access: integer keys on a labelled row are deprecated in pandas.
  top_series = most_watched_series.iloc[0]
  print("The most watched series of 2021 was " + str(top_series["Title"]) + " for a total of " + str(round(top_series["runtime"]/60, 2)) + " hours")
print("\n")
# Analyze actors
top_series_actor = top_values(series, 'actors')
if top_series_actor:
  print("Your favorite tv series actors are " + describe_ranking(top_series_actor))
  print("Bet you didn't even know some of them by name!")


You watched a total of 24869.0 minutes of Netflix in 2021
That is 17.27 days of chilling!
On average, you spent 68.13 minutes each day of the year


The day you binged the most was 2021-01-26
You watched 667.0 minutes or 11.12 hours of Netflix on that day
It's okay, we all have those days :-)


Are you a movie buff? We think so. You watched 110 movies for a total of 203.4 hours
Your top movie genres are Romance, Drama and Comedy


Here's the list of the top 5 critically acclaimed movies that you watched: 
1. Mandela with an IMDb rating of 8.5
2. Seaspiracy with an IMDb rating of 8.2
3. Zindagi Na Milegi Dobara with an IMDb rating of 8.2
4. Sudani from Nigeria with an IMDb rating of 8.2
5. Minnal Murali with an IMDb rating of 8.2


Your favorite movie actors are Elizabeth Banks, Paul Rudd and Pulkit Kumar


You watched 23 series for a total of 210.22 hours
The most watched series of 2021 was Lucifer for a total of 65.1 hours


Your favorite tv series actors are Alex Mallari Jr., Gregg S