In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tabulate import tabulate


import seaborn as sns

plt.style.use('fivethirtyeight')

from bs4 import BeautifulSoup
import requests

In [None]:
# Read the raw export once; `movies` starts out as a second name for the
# same DataFrame object and is rebound to cleaned frames further down.
movies = data = pd.read_csv('../data/data_file.csv')

## Dropping unwanted columns, only keeping full length movies, creating score metric

In [None]:
# Drop columns with very little data. Always start from the untouched `data`
# frame so re-running this cell gives the same result.
movies = data.drop(columns=['region', 'language', 'attributes', 'endYear', 'Unnamed: 0',
                            'types', 'isOriginalTitle', 'primaryTitle', 'isAdult', 'ordering'])

# Drop rows that still contain NaN in any remaining column.
movies = movies.dropna()

# Only keep full-length movies: no TV episodes, short films, video games, etc.
# .copy() gives an independent frame, so the column assignments below write to
# it directly instead of to a filtered slice (avoids SettingWithCopyWarning).
movies = movies[movies['titleType'] == 'movie'].copy()

# Store year and runtime as integers instead of floats. Bracket assignment is
# used because attribute-style assignment is easy to confuse with setting a
# plain Python attribute on the DataFrame.
movies['startYear'] = movies['startYear'].astype(int)
movies['runtimeMinutes'] = movies['runtimeMinutes'].astype(int)

# Score metric: the rating's distance from 5.0, weighted by the number of
# votes. Heavily voted titles move further from zero; ratings below 5.0 give
# a negative score.
movies['score'] = (movies['averageRating'] - 5.0) * movies['numVotes']

In [None]:
# Each of these columns holds a comma-separated list. Keep only the first
# entry as a plain string so every cell is a single hashable value that
# frequency_dict can count.
# NOTE(review): assumes the first-listed writer/director/genre is the one that
# should represent the title -- confirm.
# (split(..., expand=True) returns a multi-column DataFrame, which cannot be
# assigned to a single column.)
movies['writers'] = movies['writers'].str.split(',', n=1).str[0]
movies['directors'] = movies['directors'].str.split(',', n=1).str[0]
movies['genres'] = movies['genres'].str.split(',', n=1).str[0]

## Functions to make it easier to get information

In [None]:
def frequency_dict(pandas_series):
    """Count how often each value occurs, most frequent first.

    Args:
        pandas_series: Any iterable of hashable values (the notebook passes
            pandas Series columns).

    Returns:
        A plain dict mapping each distinct value to its number of
        occurrences, ordered by descending count. Values with equal counts
        keep the order in which they were first seen.
    """
    # Imported here because the notebook's import cell does not bring in
    # the collections module.
    from collections import Counter

    # most_common() sorts by count, descending, and leaves ties in first-seen
    # order; dict() keeps the return type a plain dict.
    return dict(Counter(pandas_series).most_common())

In [None]:
# Writers, directors, and titles are identified by a unique ID in the IMDB
# database, e.g. Martin Scorsese is identified as 'nm0000217'.
def get_name(ID):
    """Scrape the display name for an IMDB ID from its imdb.com page.

    Fetches 'https://www.imdb.com/name/<ID>' and returns the text of the
    first <span class="itemprop"> tag on that page.
    NOTE(review): the URL is always built under /name/, so this presumably
    only resolves person IDs, not title IDs -- confirm.

    Args:
        ID: IMDB identifier string appended to the /name/ URL.

    Returns:
        The first child of the matched tag, converted to str.

    Raises:
        requests.RequestException: if the request fails, times out, or the
            server answers with an HTTP error status.
        ValueError: if the page has no (non-empty) span with class "itemprop".
    """
    url = 'https://www.imdb.com/name/' + ID
    # A timeout keeps a stalled connection from hanging the notebook forever.
    r = requests.get(url, timeout=10)
    # Fail loudly on 4xx/5xx responses instead of parsing an error page.
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    name_tag = soup.find("span", {"class": "itemprop"})
    if name_tag is None or not name_tag.contents:
        # Name the ID that failed rather than letting a bare AttributeError
        # / IndexError surface from the lookup below.
        raise ValueError(
            'No <span class="itemprop"> with content found at ' + url
        )
    return str(name_tag.contents[0])

In [None]:
def homemade_hist(dictionary, title, color, fig_w=13, fig_h=10):
    """Draw a bar chart of counts keyed by IMDB ID, labelled with scraped names.

    Every key is passed through get_name(), so this makes one web request
    per bar.

    Args:
        dictionary: Mapping of IMDB ID -> count, plotted in iteration order.
        title: Title shown above the chart.
        color: Bar colour passed to matplotlib.
        fig_w: Figure width passed to figsize.
        fig_h: Figure height passed to figsize.

    Returns:
        The container returned by ax.bar().
    """
    _fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    names = []
    counts = []
    for imdb_id, count in dictionary.items():
        names.append(get_name(imdb_id))
        counts.append(count)

    out = ax.bar(names, counts, color=color)
    # The bar call already labels each bar with its name; only the font size
    # needs adjusting. tick_params does that without replacing the tick
    # formatter (set_xticklabels without fixed tick positions warns).
    ax.tick_params(axis='x', labelsize=10)
    ax.set_title(title)

    return out

def homemade_hist_with_name(dictionary, title, color, fig_w=13, fig_h=10):
    """Draw a bar chart of counts using the dictionary keys as bar labels.

    Args:
        dictionary: Mapping of label -> count, plotted in iteration order.
        title: Title shown above the chart.
        color: Bar colour passed to matplotlib.
        fig_w: Figure width passed to figsize.
        fig_h: Figure height passed to figsize.

    Returns:
        The container returned by ax.bar().
    """
    _fig, ax = plt.subplots(figsize=(fig_w, fig_h))
    names = list(dictionary.keys())
    counts = list(dictionary.values())

    out = ax.bar(names, counts, color=color)
    # Only adjust the tick-label font size. Overwriting the labels with
    # set_xticklabels(names) attaches the names to whatever tick positions
    # matplotlib picked, which mislabels the axis when the keys are numeric.
    ax.tick_params(axis='x', labelsize=10)
    ax.set_title(title)

    return out

## Finding notables

Top x movies, bottom x movies, most frequently occurring writers/directors, best years, worst years

In [None]:
# The 15 highest-scoring titles (sorted descending) and the 15
# lowest-scoring titles (sorted ascending).
top = movies.sort_values('score', ascending=False).iloc[:15]
bot = movies.sort_values('score').iloc[:15]

In [None]:
# Count directors and writers among the 15 highest-scoring titles. The frame
# is named `top`; `top_20` is not defined anywhere in this notebook.
top_directors = frequency_dict(top['directors'])
top_writers = frequency_dict(top['writers'])

In [None]:
# Count genres among the 15 highest-scoring titles. The frame is named `top`;
# `top_20` is not defined anywhere in this notebook.
most_frequent_genres = frequency_dict(top['genres'])

In [None]:
# Count directors and writers among the 15 lowest-scoring titles. The frame
# is named `bot`; `bot_20` is not defined anywhere in this notebook.
bot_directors = frequency_dict(bot['directors'])
bot_writers = frequency_dict(bot['writers'])

In [None]:
# Count release years among the 15 highest- and 15 lowest-scoring titles. The
# frames are named `top` / `bot`; `top_20` / `bot_20` are not defined anywhere
# in this notebook.
best_years = frequency_dict(top['startYear'])
worst_years = frequency_dict(bot['startYear'])

## Official EDA Graphs and tables

In [None]:
# Show the cleaned movies table, including the derived `score` column.
movies

In [None]:
# Show the 15 lowest-scoring titles.
bot

In [None]:
# Trailing semicolon keeps the cell from echoing the returned bar container.
homemade_hist(top_directors, 'Directors by # of Titles in Top 15', '#F58426');

In [None]:
# Trailing semicolon keeps the cell from echoing the returned bar container.
homemade_hist(top_writers, 'Writers by # of Titles in Top 15', '#552583');

In [None]:
# Genres are already human-readable, so no name lookup is needed here.
# Trailing semicolon keeps the cell from echoing the returned bar container.
homemade_hist_with_name(most_frequent_genres, 'Most Frequent Genres in Top 15', '#00788C');

In [None]:
# One integer position per row of the score column, used as the x axis.
x = list(range(len(movies['score'])))

In [None]:
# The score of every title, used as the y axis.
y = movies['score']

In [None]:
# Sanity check: number of y values (should equal len(x)).
y.size

In [None]:
# Sanity check: number of x positions (should equal y.size).
len(x)

In [None]:
# Scatter every title's score against its row position in the movies table.
fig, ax = plt.subplots(figsize=(13, 10))
ax.scatter(x, y, color='#BA9653')
ax.set_title('Scores of Titles')
# Axis labels so the figure can be read on its own.
ax.set_xlabel('Row position in movies table')
# Trailing semicolon keeps the cell from echoing the returned Text object.
ax.set_ylabel('Score: (averageRating - 5.0) * numVotes');