In [1]:
import pandas as pd
import numpy as np

In [19]:
def format_stars(stars):
    """Parse the raw ``stars`` cell into a list of actor names.

    The value is treated as the text of a list of quoted fragments, for
    example ``"['A, ', 'B']"``.  It is split on ``", '"``, the surrounding
    ``['`` / ``']`` are stripped, and, if any fragment contains ``|``,
    everything up to and including the last such fragment is discarded.
    NOTE(review): the ``|`` fragment presumably terminates a
    "Director: ..." prefix in the dataset - confirm against the CSV.

    Args:
        stars: Raw string taken from the ``stars`` column.

    Returns:
        A list of non-empty name strings, or ``None`` when ``stars`` is not a
        string or no names remain after parsing.
    """
    if not isinstance(stars, str):
        return None

    pieces = [
        piece for piece in stars.split(", '")
        if piece != '' and piece != "    Stars:'"
    ]
    # Nothing left to slice: indexing pieces[0] below would raise IndexError.
    if not pieces:
        return None

    # Drop the leading "['" of the first fragment and the trailing "']" of
    # the last one (both slices hit the same element for a one-item list).
    pieces[0] = pieces[0][2:]
    pieces[-1] = pieces[-1][:-2]

    dir_index = -1
    for j, piece in enumerate(pieces):
        piece = piece.replace(', "', '')
        if "Stars:'" in piece:
            piece = piece.split(":'")[1]
        pieces[j] = piece

        # Remember the last fragment holding the '|' separator.
        if '|' in piece:
            dir_index = j

    # Keep only what follows the separator, and never report an empty name
    # (e.g. the raw value "[]" used to come back as ['']).
    names = [name for name in pieces[dir_index + 1:] if name != '']

    return names if names else None

def format_genres(genre):
    """Return the first entry of a ``', '``-separated genre string.

    Args:
        genre: Genre text such as ``"Action, Crime, Drama"``.

    Returns:
        The text before the first ``', '`` separator, or ``None`` when
        ``genre`` is not a string or that text is empty.
    """
    if not isinstance(genre, str):
        return None

    # str.split never returns an empty list, so emptiness has to be tested on
    # the first element rather than on the list itself.
    primary = genre.split(', ')[0]
    return primary if primary != '' else None

def format_years(year):
    """Extract a four-digit year from text such as ``"(2019)"``.

    The first character is skipped and the next four are parsed as an
    integer.

    Args:
        year: Raw string from the ``year`` column.

    Returns:
        The parsed year as ``int``, or ``None`` when ``year`` is not a string,
        is shorter than five characters, does not have ``'1'`` or ``'2'`` as
        its second character, or the four characters are not a valid integer.
    """
    if not isinstance(year, str) or len(year) < 5 or year[1] not in ('1', '2'):
        return None
    try:
        return int(year[1:5])
    except ValueError:
        # e.g. "(2nd season)": starts like a year but is not one. Returning
        # None lets the caller drop the row instead of aborting the apply.
        return None

def format_votes(votes):
    """Convert a vote count written with thousands separators to ``float``.

    Args:
        votes: Either a string such as ``"60,868"`` or an already-numeric
            value.

    Returns:
        ``float`` for string input (commas removed first); any non-string
        value is returned unchanged.

    Raises:
        ValueError: If a string remains non-numeric after removing commas.
    """
    # isinstance also accepts str subclasses (e.g. numpy.str_), which the
    # exact type comparison let through unconverted.
    if isinstance(votes, str):
        return float(votes.replace(",", ""))
    return votes

def set_is_tv(duration, max_tv_minutes=75):
    """Flag a title as TV when its runtime is below a threshold.

    Args:
        duration: Runtime text whose last four characters are discarded
            before integer parsing, e.g. ``"118 min"`` -> ``118``.
        max_tv_minutes: Exclusive upper bound, in minutes, for a runtime to
            count as TV.  Defaults to 75, the value that was previously
            hard-coded.

    Returns:
        ``True`` if the parsed runtime is strictly less than
        ``max_tv_minutes``, otherwise ``False``.

    Raises:
        ValueError: If the text left after dropping the last four characters
            is not an integer.
    """
    minutes = int(duration[:-4])
    return minutes < max_tv_minutes

def get_cleaned_dataframe(path='./n_movies.csv'):
    """Load the movie CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Drop the ``description`` and ``certificate`` columns.
      2. Drop rows missing ``stars``, ``duration``, ``year`` or ``genre``.
      3. Keep one row per ``title``.
      4. Normalise ``stars``, ``genre``, ``year`` and ``votes`` with the
         ``format_*`` helpers and derive ``is_tv`` from ``duration``.
      5. Drop every row that still has a missing value (this includes rows
         where a helper returned ``None``) and renumber the rows.

    Args:
        path: Location of the CSV file.  Defaults to ``'./n_movies.csv'``,
            the path that was previously hard-coded.

    Returns:
        The cleaned DataFrame.  The labels the rows had before renumbering
        are kept in an ``index`` column.
    """
    raw = pd.read_csv(path)

    cleaned = (
        raw
        .drop(columns=['description', 'certificate'])
        .dropna(subset=['stars', 'duration', 'year', 'genre'])
        .drop_duplicates(subset=['title'])
        # assign() builds a new frame, so no column is written into a
        # filtered intermediate result.
        .assign(
            stars=lambda frame: frame['stars'].apply(format_stars),
            genre=lambda frame: frame['genre'].apply(format_genres),
            year=lambda frame: frame['year'].apply(format_years),
            is_tv=lambda frame: frame['duration'].apply(set_is_tv),
            votes=lambda frame: frame['votes'].apply(format_votes),
        )
        .dropna()
        # No drop=True: the old row labels are kept as an 'index' column.
        .reset_index()
    )

    return cleaned


In [23]:
# Cleaned dataset; `df` and `actors` are reused by later cells.
df = get_cleaned_dataframe()

# Number of most-voted titles whose cast seeds the actor list.
TOP_TITLES = 500

df_sorted = df.sort_values(by='votes', ascending=False)
df_top = df_sorted.iloc[:TOP_TITLES]

# Unique cast members in first-seen order.  dict.fromkeys keeps insertion
# order and de-duplicates in linear time, replacing a list scan per name.
actors = list(dict.fromkeys(
    star
    for cast in df_top['stars']
    for star in cast
))


      index                           title    year duration      genre  \
1254   1440             Dolemite Is My Name  2019.0  118 min  Biography   
627     722         I Am Not Okay with This  2020.0   30 min     Comedy   
871    1000           Spy Kids 3: Game Over  2003.0   84 min     Action   
390     436  A Series of Unfortunate Events  2017.0   50 min  Adventure   
460     519         The Shannara Chronicles  2016.0   42 min  Adventure   

      rating                                              stars    votes  \
1254     7.2  [Eddie Murphy, Keegan-Michael Key, Mike Epps, ...  60868.0   
627      7.5  [Sophia Lillis, Wyatt Oleff, Sofia Bryant, Kat...  60738.0   
871      4.3  [Daryl Sabara, Alexa PenaVega, Antonio Bandera...  60653.0   
390      7.8  [Neil Patrick Harris, Patrick Warburton, Malin...  60640.0   
460      7.1  [Austin Butler, Ivana Baquero, Manu Bennett, A...  60428.0   

      is_tv  
1254  False  
627    True  
871   False  
390    True  
460    True  
['Elijah

In [6]:
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
from pyvis.network import Network

# Network Visualization of Media with shared actors
def create_media_network(df, network, num_nodes):
    """Add titles as nodes and connect titles that share cast members.

    Args:
        df: DataFrame with ``title`` and ``stars`` columns.  Rows are read
            with ``df.loc`` using the integer labels ``0 .. num_nodes - 1``,
            so the frame is expected to carry a freshly reset index.
        network: Graph-like object exposing ``add_node`` and ``add_edge``;
            it is modified in place.
        num_nodes: Number of leading rows to include.  Values larger than
            ``len(df)`` are capped instead of raising ``KeyError``.

    Returns:
        The same ``network`` object that was passed in.
    """
    num_nodes = min(num_nodes, len(df))

    # Read every needed cell once; the pairwise loop below would otherwise
    # repeat the same df.loc scalar lookups for every pair of titles.
    titles = [df.loc[i, 'title'] for i in range(num_nodes)]
    casts = [df.loc[i, 'stars'] for i in range(num_nodes)]

    # Creating Nodes
    for i in tqdm(range(num_nodes)):
        network.add_node(titles[i])

    # Creating Edges
    for i in tqdm(range(num_nodes)):
        for j in range(i + 1, num_nodes):
            shared = sum(1 for actor in casts[i] if actor in casts[j])
            if shared > 0:
                # The first shared actor contributes 2, each further one 5.
                network.add_edge(titles[i], titles[j], weight=2 + 5 * (shared - 1))
    return network

# NOTE: this cell relies on `df` produced by the cell that calls
# get_cleaned_dataframe(); run that cell first.

# Split the cleaned frame by media type.  Boolean indexing keeps the column
# dtypes intact, whereas DataFrame.where() first blanks the other rows to NaN.
movies = df[df['is_tv'] == False].dropna().reset_index(drop=True)
tv_shows = df[df['is_tv'] == True].dropna().reset_index(drop=True)
media = df.dropna().reset_index(drop=True)

# Upper bound on the number of titles drawn per network.
NUM_MEDIA_NODES = 200

# Creating the networks (capped so a short frame cannot trigger a KeyError)
movie_network = create_media_network(
    movies, nx.MultiGraph(), min(NUM_MEDIA_NODES, len(movies)))
tv_show_network = create_media_network(
    tv_shows, nx.MultiGraph(), min(NUM_MEDIA_NODES, len(tv_shows)))
combine_network = create_media_network(
    media, nx.MultiGraph(), min(NUM_MEDIA_NODES, len(media)))

# Scaling the nodes based on their degree
scale = 4
d_movie = {node: scale * degree
           for node, degree in dict(movie_network.degree).items()}
d_tv_show = {node: scale * degree
             for node, degree in dict(tv_show_network.degree).items()}
d_combine = {node: scale * degree
             for node, degree in dict(combine_network.degree).items()}

# Setting up size attribute
nx.set_node_attributes(movie_network, d_movie, 'size')
nx.set_node_attributes(tv_show_network, d_tv_show, 'size')
nx.set_node_attributes(combine_network, d_combine, 'size')

# Creating the Pyvis Networks
nt_movie = Network("500px", "500px")
nt_tv_show = Network("800px", "800px")
nt_combine = Network("800px", "800px")

nt_movie.from_nx(movie_network)
nt_tv_show.from_nx(tv_show_network)
nt_combine.from_nx(combine_network)

# Each call writes the named HTML file.
nt_movie.show("nx_movie.html")
nt_tv_show.show("nx_tv_show.html")
nt_combine.show("nx_combine.html")

100%|█████████████████████████████████████| 200/200 [00:00<00:00, 116025.01it/s]
100%|████████████████████████████████████████| 200/200 [00:00<00:00, 442.47it/s]
100%|█████████████████████████████████████| 200/200 [00:00<00:00, 126505.93it/s]
100%|████████████████████████████████████████| 200/200 [00:00<00:00, 459.71it/s]
100%|█████████████████████████████████████| 200/200 [00:00<00:00, 174182.06it/s]
100%|████████████████████████████████████████| 200/200 [00:00<00:00, 478.25it/s]


In [27]:
# Network Visualization of Actors with Shared Media

def create_actor_network(actors, df, network, num_nodes):
    """Add actors as nodes and connect actors that appear in the same title.

    For each of the first ``num_nodes`` actors, every row of ``df`` whose
    ``stars`` list contains that actor contributes one edge to each *other*
    listed star that is also present in ``actors``.

    NOTE(review): when both ends of a pair are among the processed actors,
    the pair is added once from each side, so a multigraph receives two
    parallel edges per shared title.  This is kept unchanged because node
    degrees computed by callers depend on it - confirm whether it is
    intended.

    Args:
        actors: Sequence of actor names (assumed hashable, e.g. strings).
        df: DataFrame with a ``stars`` column of name lists, read through
            ``df.loc`` with the integer labels ``0 .. len(df) - 1``.
        network: Graph-like object exposing ``add_node`` and ``add_edge``;
            it is modified in place.
        num_nodes: Number of leading entries of ``actors`` to process.
            Values larger than ``len(actors)`` are capped instead of raising
            ``IndexError``.

    Returns:
        The same ``network`` object that was passed in.
    """
    num_nodes = min(num_nodes, len(actors))
    known_actors = set(actors)

    # Creating Nodes
    for i in range(num_nodes):
        network.add_node(actors[i])

    # Read each cast list once and index the rows by actor, so the edge loop
    # only visits rows that contain the actor instead of scanning the whole
    # frame for every actor.  Row order inside each bucket stays ascending,
    # which keeps the order in which edges are added unchanged.
    casts = [df.loc[j, 'stars'] for j in range(len(df))]
    rows_by_actor = {}
    for j, cast in enumerate(casts):
        for name in dict.fromkeys(cast):  # one entry per row, even if repeated
            rows_by_actor.setdefault(name, []).append(j)

    # Creating Edges
    for i in range(num_nodes):
        actor = actors[i]
        for j in rows_by_actor.get(actor, ()):
            for actor_2 in casts[j]:
                if actor_2 != actor and actor_2 in known_actors:
                    network.add_edge(actor, actor_2)

    return network

# Build the actor co-appearance graph over every collected actor.
network = create_actor_network(actors, df, nx.MultiGraph(), len(actors))

# Node size is the node's degree multiplied by a constant factor.
scale = 4
d_actor = {
    node: scale * degree
    for node, degree in dict(network.degree).items()
}

# Attach the computed sizes to the graph as the 'size' node attribute.
nx.set_node_attributes(network, d_actor, 'size')

# Hand the graph to pyvis, expose the physics controls, and write the page.
nt_actor = Network("500px", "500px")
nt_actor.from_nx(network)
nt_actor.show_buttons(filter_='physics')
nt_actor.show("nx_actor.html")

In [None]:

def plot_genre_v_rating(df, title=None):
    """Draw mean rating per genre as bars with standard-deviation error bars.

    Args:
        df: DataFrame with ``genre`` and ``rating`` columns.
        title: Optional figure title; no title is set when omitted.

    Returns:
        None.  The figure is displayed with ``plt.show()``.
    """
    # Take labels, means and stds from one aggregation so that all three are
    # guaranteed to be in the same order.
    stats = df.groupby('genre')['rating'].agg(['mean', 'std'])

    labels = stats.index.tolist()
    means = stats['mean'].tolist()
    stds = stats['std'].tolist()

    plt.figure(figsize=(30, 10))
    plt.bar(x=labels, height=means)
    # Error bars are placed at integer positions 0..n-1, one per bar.
    plt.errorbar(range(len(labels)), means, yerr=stds, fmt="or")
    plt.xlim(-0.5, len(labels) - .5)
    plt.xlabel('Genre')
    plt.ylabel('Mean rating')
    if title is not None:
        plt.title(title)

    plt.show()

# One chart for the whole dataset, then one for TV titles and one for films.
# Boolean indexing selects the rows directly; DataFrame.where() would instead
# blank the other rows to NaN and rely on groupby dropping the NaN genre keys.
for subset in (df, df[df['is_tv'] == True], df[df['is_tv'] == False]):
    plot_genre_v_rating(subset)