# Analysing Netflix Data
Finding my most-watched genres, the day of the week on which I spend the most time watching Netflix, and more...

Import Libraries

In [136]:
# import libraries
import os
from collections import Counter

import numpy as np
import pandas as pd
import requests
import seaborn as sns
from matplotlib import pyplot as plt

Load DataFrames

In [137]:
# Viewing-activity sample (one row per viewing session, judging by the columns used below).
viewing_activity_path = '../input/netflixdata-viewingactivity/SampleViewingAcitivty.csv'
# Netflix titles catalogue; loaded here but not referenced again in the cells shown.
netflix_titles_path = '../input/netflix-shows/netflix_titles.csv'

df = pd.read_csv(viewing_activity_path)
netflix_df = pd.read_csv(netflix_titles_path)

## Data Cleaning

Drop unnecessary columns

In [138]:
# Columns that are never read by the analysis below.
unused_columns = [
    "Profile Name", 'Attributes', 'Supplemental Video Type',
    'Device Type', 'Bookmark', 'Latest Bookmark', 'Country',
]
df = df.drop(columns=unused_columns)

Drop rows for Trailers

In [139]:
# Match "(Trailer)" literally. Interpreted as a regular expression the
# parentheses are a capture group, so pandas warns about match groups and the
# pattern matches the bare word "Trailer" anywhere in a title.
# na=False treats a missing title as "not a trailer" instead of producing a
# NaN mask that cannot be used for filtering.
is_trailer = df["Title"].str.contains("(Trailer)", regex=False, na=False)
trailer = df[is_trailer]

# Filter with the mask instead of dropping hard-coded row labels: this works
# for any input file and is safe to run more than once. The index is rebuilt
# so that row labels and row positions stay identical afterwards.
df = df[~is_trailer].reset_index(drop=True)

Clean titles into a new column, Title_clean
- remove details on seasons
- remove details after underscores

In [140]:
# Keep the text before the first underscore, then the text before the first
# colon of what is left; the result is stored in the new Title_clean column.
title_before_underscore = df['Title'].str.split('_').str[0]
df['Title_clean'] = title_before_underscore.str.split(':').str[0]

In [141]:
# Preview the first rows to check the new Title_clean column next to Title.
df.head()

## Correcting format of date and time
The `Duration` and `Start Time` columns are stored as strings in the dataframe, so we convert them to proper timedelta and datetime types before doing any arithmetic on them.

In [142]:
# Parse the two time columns: 'Start Time' becomes a datetime column (needed
# for the .dt accessor later) and 'Duration' becomes a timedelta column
# (needed for sums and maxima).
df['Start Time'] = pd.to_datetime(df['Start Time'])
df['Duration'] = pd.to_timedelta(df['Duration'])

## Getting used to the dataframe
Which movie have I watched the most in one sitting and how long did I watch it for?

In [143]:
def longestSession(data=None):
    """Print the longest single viewing session and the title watched in it.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with a ``Title`` column and a ``Duration`` column. When omitted,
        the notebook-level ``df`` is used.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    if data is None:
        data = df

    durations = data["Duration"]
    longest = durations.max()
    print('Longest Session:', longest)

    if pd.isna(longest):
        # Empty frame (or no usable durations): there is no row to report.
        print("No viewing sessions found")
        return

    # Locate the first row holding the maximum by *position*. Looking the row
    # up with iloc from an index label is only correct while the index is
    # exactly 0..n-1, which stops being true as soon as rows are dropped.
    first_pos = (durations == longest).to_numpy().argmax()
    row = data.iloc[first_pos]
    print("You watched", row["Title"], "for", row["Duration"])
# Report the longest single session found in df.
longestSession()

# Exploratory Data Analysis
Total time spent on Netflix

In [144]:
# Add up every viewing session to get the overall time spent on Netflix.
# (This cell previously repeated the longest-session lookup and never
# computed the total.)
total_time = df["Duration"].sum()
print('Total time spent on Netflix:', total_time)
print('That is about', round(total_time.total_seconds() / 60 / 60, 1), 'hours')

Finding total time spent watching a specific movie

In [145]:
def totalTimeSpent(movie, data=None):
    """Print the summed ``Duration`` of every row whose title contains ``movie``.

    Parameters
    ----------
    movie : str
        Text to look for in the ``Title`` column. It is matched as a plain,
        case-sensitive substring (not as a regular expression).
    data : pandas.DataFrame, optional
        Frame with ``Title`` and ``Duration`` columns. When omitted, the
        notebook-level ``df`` is used.

    Returns
    -------
    None
        The total is printed, not returned.
    """
    if data is None:
        data = df
    # na=False: a missing title counts as "no match". Without it the mask
    # contains NaN and the boolean row selection raises.
    matches = data[data['Title'].str.contains(movie, regex=False, na=False)]
    print("time spend watching",movie,matches["Duration"].sum())

# Total watch time for a few titles (each is matched as a substring of Title).
totalTimeSpent('Shutter Island')
totalTimeSpent('The Chase')
totalTimeSpent('Strangers From Hell')

### Which day I watch Netflix the most
Bar graph showing the total duration I watched Netflix for on each day of the week

In [146]:
pd.options.mode.chained_assignment = None  # default='warn' use this to remove warnings

df['weekday'] = df['Start Time'].dt.weekday  # 0 (Monday) to 6 (Sunday)
df['hour'] = df['Start Time'].dt.hour  # hour of the day, 0 to 23

days = ['monday','tuesday','wednesday','thursday','friday','saturday','sunday']

# Hours watched per weekday, in the same order as `days`.
# Computing all seven totals in one loop replaces seven hand-written
# assignments, two of which (`thursday = wednesday = ...` and
# `friday = wednesday = ...`) overwrote Wednesday's total with Friday's.
dayList = [
    df.loc[df['weekday'] == day_number, 'Duration'].sum().total_seconds()/60/60
    for day_number in range(len(days))
]
print(dayList)

# plt.subplots() leaves room for tick labels, axis label and title; an axes
# spanning the whole figure ([0, 0, 1, 1]) pushes them outside the canvas.
fig, ax = plt.subplots()
# One colour per weekday, so no two bars share a colour.
ax.bar(days, dayList, color=['#D77FA1', '#92A9BD', '#9D9D9D', '#BCCC9A',
                             '#D1E8E4', '#5E454B', '#F3C5C5'])
ax.set_ylabel("Time Spent (hours)")
ax.set_title("Time spent watching Netflix each day")
plt.show()

# API to get Movie Genres to find Favourite Genres
We use the RapidAPI IMDb API to get the genres of every title in our dataset. This takes about 3 minutes for this dataset; the results are saved in the `genres` variable.

In [147]:
def findGenre(title):
    """Look up the genres of a title through the RapidAPI "imdb8" endpoints.

    Two requests are made: ``title/find`` searches for ``title`` and the id of
    the first result is taken; ``title/get-overview-details`` is then queried
    with that id and its ``genres`` field is returned. Only the first search
    result is considered, so the genres may belong to a different title with
    a similar name.

    The API key is read from the ``RAPIDAPI_KEY`` environment variable; the
    placeholder below is used only when that variable is not set.

    Parameters
    ----------
    title : str
        Text to search for.

    Returns
    -------
    list or str
        The ``genres`` value of the overview response, or the string
        ``'no data found'`` when the search has no usable first result, the
        overview has no ``genres`` field, or the second request fails.

    Raises
    ------
    requests.RequestException, ValueError
        If the initial search request fails or its body is not valid JSON.
    """
    # Prefer a key from the environment so a real key never has to be saved
    # in the notebook.
    API_KEY = os.environ.get(
        'RAPIDAPI_KEY',
        'XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX',  # use your own API key
    )
    # Both endpoints take the same headers.
    headers = {
        'x-rapidapi-host': "imdb8.p.rapidapi.com",
        'x-rapidapi-key': API_KEY,
    }
    timeout = 30  # seconds; without a timeout a stalled connection blocks forever

    search = requests.get(
        "https://imdb8.p.rapidapi.com/title/find",
        headers=headers,
        params={"q": title},
        timeout=timeout,
    ).json()

    # might not be accurate because we only look at the first result
    try:
        # Drop the first 7 characters of the id; presumably that is a
        # "/title/" prefix (verify against a real API response).
        movieCode = search['results'][0]['id'][7:]

        details = requests.get(
            "https://imdb8.p.rapidapi.com/title/get-overview-details",
            headers=headers,
            params={"tconst": movieCode, "currentCountry": "US"},
            timeout=timeout,
        ).json()
        return details['genres']
    except (KeyError, IndexError, TypeError, ValueError,
            requests.RequestException):
        # Only the expected "no usable record" failures are turned into the
        # sentinel. A bare `except:` would also hide KeyboardInterrupt and
        # programming errors.
        return 'no data found'

# findGenre('Trailer') 
# this will return no data found cuz their API doesn't hv this record and we handled the error
# print(findGenre('Strangers from hell'))

Using the function we just created

In [148]:
def findGenres():
    """Print the de-duplicated Title_clean values and the genres found for each.

    Titles keep their first-seen order. One ``findGenre`` lookup is made per
    unique title, and the collected results are printed (not returned).
    """
    # dict.fromkeys removes repeats while preserving the original order.
    unique_titles = list(dict.fromkeys(df['Title_clean']))
    print(unique_titles)

    # One API lookup per unique title.
    looked_up = [findGenre(unique_title) for unique_title in unique_titles]
    print(looked_up)

# Genres pasted in by hand from the output printed by findGenres() above, so
# the API lookups do not have to be repeated on every run.
# Each entry is either a list of genre names or the string 'no data found'.
# Presumably the entries follow the order of the de-duplicated Title_clean
# list printed by findGenres(); verify before relying on positions.
genres = [['Drama', 'Family', 'Sci-Fi'], ['Adventure', 'Drama', 'Horror', 'Thriller'], ['Adventure', 'Drama', 'Horror', 'Thriller'], ['Crime', 'Drama', 'Mystery', 'Thriller'], ['Comedy', 'Romance'], ['Action', 'Drama', 'Mystery', 'Thriller'], 'no data found', ['Drama', 'Mystery'], ['Crime', 'Horror', 'Mystery'], ['Drama', 'Horror', 'Mystery', 'Thriller'], ['Action', 'Drama', 'Horror', 'Sci-Fi'], ['Comedy', 'Drama', 'Music', 'Romance'], ['Mystery', 'Thriller'], ['Action', 'Adventure', 'Thriller'], ['Action', 'Crime', 'Drama', 'Sci-Fi', 'Thriller'], ['Crime', 'Drama', 'Thriller'], ['Action', 'Crime', 'Thriller'], ['Horror', 'Mystery'], 'no data found', ['Action', 'Adventure', 'Drama', 'Fantasy', 'Sci-Fi'], ['Horror', 'Mystery', 'Thriller'], ['Action', 'Drama', 'Horror', 'Thriller'], 'no data found', ['Drama', 'Romance'], ['Action', 'Horror', 'Thriller'], ['Comedy', 'Horror'], ['Drama', 'Romance'], ['Drama', 'History', 'Romance'], ['Crime', 'Mystery', 'News'], ['Comedy'], ['Comedy', 'Drama'], ['Comedy', 'Drama', 'Romance'], ['Action', 'Adventure', 'Mystery', 'Sci-Fi'], ['Drama', 'Horror', 'Mystery'], ['Drama', 'Thriller'], ['Adventure', 'Drama', 'Fantasy', 'Romance'], ['Drama', 'Fantasy', 'Romance'], ['Mystery', 'Thriller'], ['Biography', 'Crime', 'Drama', 'History', 'Mystery', 'Thriller'], ['Horror', 'Thriller'], ['Game-Show', 'Reality-TV'], ['Crime', 'Drama', 'Thriller'], ['Comedy'], ['Comedy', 'Romance'], ['Action', 'Comedy', 'Romance'], ['Crime', 'Drama', 'Mystery', 'Thriller'], ['Action', 'Comedy', 'Crime', 'Thriller'], ['Fantasy', 'Horror', 'Thriller'], ['Horror', 'Mystery', 'Thriller'], ['Drama', 'Horror', 'Thriller'], ['Horror', 'Mystery', 'Thriller'], ['Drama', 'Horror', 'Mystery', 'Thriller'], ['Horror', 'Mystery', 'Thriller'], ['Horror', 'Thriller'], ['Drama', 'Horror', 'Thriller'], ['Drama'], ['Action', 'Comedy', 'Fantasy'], ['Horror', 'Mystery', 'Thriller'], ['Crime', 'Drama', 'Thriller'], ['Comedy', 'Family'], ['Mystery', 'Thriller'], ['Action', 
'Adventure', 'Crime', 'Drama', 'Thriller'], ['Drama', 'Horror', 'Mystery', 'Thriller'], ['Horror', 'Thriller'], ['Crime', 'Horror', 'Thriller'], ['Crime', 'Drama', 'Thriller'], ['Horror', 'Sci-Fi', 'Thriller'], ['Action', 'Crime', 'Drama', 'Mystery', 'Thriller']]

### Flatten Nested List and Find most common/most repeated genres
Since we now have a list of genres, the most frequently repeated ones are the genres we like/watch the most! I used a recursive algorithm to flatten the nested list.

In [149]:
def flatten(listoflists):
    """Return a flat list holding every non-list element of a nested list.

    Elements are visited left to right. Anything that is itself a ``list`` is
    flattened recursively and spliced in at that position; every other value
    (strings included) is kept as a single element.
    """
    flat = []
    for element in listoflists:
        if not isinstance(element, list):
            # Plain value: keep it as is.
            flat.append(element)
            continue
        # Nested list: flatten it first, then splice its items in.
        flat += flatten(element)
    return flat

### Driver code to get Favourite Genres

In [150]:
# Flatten the per-title genre lists into one list of genre names.
genreList = flatten(genres)

# Tally how often each genre occurs and show the three most frequent ones.
c = Counter()
c.update(genreList)
print(c.most_common(3))

# More Analytics
- pie chart showing genres of movies watched
- most watched/least watched shows

In [151]:
# (genre, count) pairs from the counter, without the 'no data found' sentinel.
genreList = c.items()
counted = [(genre, count) for genre, count in genreList if genre != 'no data found']
names = [genre for genre, _ in counted]
num = [count for _, count in counted]

print(names,num)

# Pie chart of genre counts on a white figure background.
fig = plt.figure()
fig.patch.set_facecolor('white')
plt.pie(num, labels=names, colors=['#D77FA1', '#92A9BD', '#9D9D9D', '#BCCC9A', '#D1E8E4','#5E454B'])

# A white circle over the centre of the pie turns it into a donut.
my_circle = plt.Circle( (0,0), 0.7, color='white')
p = plt.gcf()
p.gca().add_artist(my_circle)
plt.show()

Most Watched Shows

In [152]:
# Total watch time per cleaned title, longest first.
watch_time_per_title = df.groupby(['Title_clean'])['Duration'].sum()
movie_views = watch_time_per_title.sort_values(ascending = False)
print(movie_views.head(10))

Least Watched Shows

In [153]:
# movie_views is sorted by total duration in descending order, so the last
# ten rows are the titles with the least total watch time.
movie_views.tail(10)