In [1]:
import pandas as pd
import numpy as np

In [51]:
def format_stars(stars):
    """Parse the raw text of a ``stars`` cell into a list of names.

    The cell is expected to hold the text of a Python list, e.g.
    ``"['A, ', 'B']"``. Elements up to and including the last one that
    contains ``'|'`` are discarded (presumably director entries that precede
    the cast -- confirm against the CSV).

    Args:
        stars: Raw string taken from the ``stars`` column.

    Returns:
        A list of names, or None when no non-blank name remains.
    """
    # Splitting on ", '" cuts at the opening quote of each list element.
    # Empty pieces and pieces containing ":'" are dropped.
    pieces = [piece for piece in stars.split(', \'')
              if piece != '' and ':\'' not in piece]
    if not pieces:
        # Nothing left to strip brackets from; indexing would raise IndexError.
        return None

    # Remove the leading "['" and the trailing "']" of the list text.
    pieces[0] = pieces[0][2:]
    pieces[-1] = pieces[-1][:-2]

    dir_index = -1
    for j, piece in enumerate(pieces):
        # Double-quoted elements (presumably names containing an apostrophe)
        # leave a ', "' fragment behind after the split above.
        piece = piece.replace(', "', '')
        if ':\'' in piece:
            piece = piece.split(':\'')[1]
        if '|' in piece:
            dir_index = j
        pieces[j] = piece

    # Keep what follows the last '|' element and drop blank entries: an empty
    # "[]" cell would otherwise come back as [''] and be treated as a name.
    names = [name for name in pieces[dir_index + 1:] if name.strip()]
    return names if names else None

def format_genres(genre):
    """Split a ``', '``-separated genre string into a list of genre names.

    Args:
        genre: Raw string from the ``genre`` column.

    Returns:
        A list of the non-blank genre names, or None when there are none.
        (``str.split`` never returns an empty list, so blank pieces have to
        be filtered explicitly for the None case to be reachable.)
    """
    genres = [name for name in genre.split(', ') if name.strip()]
    return genres if genres else None

def format_years(year):
    """Extract a four-digit year from a raw ``year`` string.

    The year is read from characters 1-4, so the first character is skipped
    (presumably an opening parenthesis -- confirm against the CSV).

    Args:
        year: Raw string from the ``year`` column.

    Returns:
        The year as an int, or None when the string is shorter than five
        characters or its second character is not '1' or '2'.
    """
    starts_like_year = len(year) >= 5 and year[1] in ('1', '2')
    if not starts_like_year:
        return None
    return int(year[1:5])

def format_votes(votes):
    """Convert a vote count such as ``"1,234"`` into a float.

    Args:
        votes: Vote count; only exact ``str`` values are converted.

    Returns:
        The float value with thousands separators removed when ``votes`` is
        a str; otherwise ``votes`` unchanged.
    """
    if type(votes) != str:
        return votes
    return float(votes.replace(",", ""))

def set_is_tv(duration):
    """Flag an entry as TV when its running time is under 70.

    Args:
        duration: String whose last four characters are dropped before the
            rest is parsed as an int (presumably a " min" suffix -- confirm
            against the CSV).

    Returns:
        True when the parsed duration is below 70, otherwise False.
    """
    minutes = int(duration[:-4])
    return minutes < 70

def get_cleaned_dataframe(path='./n_movies.csv'):
    """Load the media CSV and return a cleaned DataFrame.

    Steps: drop the 'description' and 'certificate' columns, drop rows that
    lack stars/duration/year/genre, keep the first row per title, parse the
    'stars', 'genre', 'year' and 'votes' columns, derive 'is_tv' from
    'duration', then drop every row that still contains a missing value.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing calls are unaffected.

    Returns:
        The cleaned DataFrame with a fresh 0..n-1 index. The row labels from
        before the final reset are kept in an 'index' column.
    """
    df = pd.read_csv(path)
    df = df.drop(columns=['description', 'certificate'])
    df = df.dropna(subset=['stars', 'duration', 'year', 'genre'])
    df = df.drop_duplicates(subset=['title'])

    df['stars'] = df['stars'].apply(format_stars)
    df['genre'] = df['genre'].apply(format_genres)
    df['year'] = df['year'].apply(format_years)
    # 'duration' is left as the raw string; only the derived flag is stored.
    df['is_tv'] = df['duration'].apply(set_is_tv)
    df['votes'] = df['votes'].apply(format_votes)

    # The formatters return None for values they cannot parse; those rows,
    # and any row with another missing value, are removed here.
    df = df.dropna()
    # reset_index() without drop=True keeps the old labels as an 'index' column.
    df = df.reset_index()

    return df


In [52]:
# Load and clean the dataset once; the frames built below are derived from it.
df = get_cleaned_dataframe()

def get_top_media(df, n):
    """Return the ``n`` rows of ``df`` with the highest vote counts.

    Args:
        df: DataFrame with a 'votes' column.
        n: Number of rows to keep.

    Returns:
        A DataFrame holding the first ``n`` rows of ``df`` after sorting by
        'votes' in descending order (fewer if ``df`` has fewer rows).
    """
    # Sort the frame that was passed in, not a module-level one.
    return df.sort_values(by='votes', ascending=False).iloc[:n]

def get_unique_actors(df):
    """Collect every distinct name in the 'stars' column, in first-seen order.

    Args:
        df: DataFrame whose 'stars' column holds an iterable of names per row.

    Returns:
        A list of the unique names, ordered by first appearance.
    """
    # A dict keeps insertion order and gives constant-time duplicate checks,
    # avoiding a scan of the result list for every name.
    return list(dict.fromkeys(
        star for stars in df['stars'] for star in stars
    ))


# Making separate dataframes for movies, tv shows, and combined.
# A boolean mask selects the rows directly; where(...) would first turn the
# unwanted rows into NaN, which upcasts the columns before dropna() runs.
movies = df[df['is_tv'] == False].dropna().reset_index(drop=True)
tv_shows = df[df['is_tv'] == True].dropna().reset_index(drop=True)
media = df.dropna().reset_index(drop=True)


# Creating lists of the actors found in the top TOP_N movies/tv shows/combined media
TOP_N = 500

tv_top = get_top_media(tv_shows, TOP_N)
tv_actors = get_unique_actors(tv_top)

movie_top = get_top_media(movies, TOP_N)
movie_actors = get_unique_actors(movie_top)

media_top = get_top_media(media, TOP_N)
media_actors = get_unique_actors(media_top)

all_actors = get_unique_actors(df)


Peter O'Toole
Manti Te'o
Dylan O'Brien
Cristiana Dell'Anna




Jermaine O'Neal
Junming 'Jimmy' Wang



Benjamin O'Keefe
Auli'i Cravalho
Jun'ichi Suwabe




























Nigel O'Neill











Lucas 'Koka' Penteado










Cam'ron






Maureen O'Hara


Tatum O'Neal












Anya O'Connor





















Chris D'Elia


Matt O'Leary





















Emilio D'Alessandro






Tybie O'Bard



































Chris D'Elia

























Chris D'Elia































































Rodrigo Sant'anna










Soledad O'Brien
























































































In [31]:
import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm
from pyvis.network import Network

# Node colours keyed by the first three digits of a release year
# (196 -> 196x, ..., 202 -> 202x); key 0 is the fallback colour.
colors = dict(zip(
    (0, *range(196, 203)),
    (
        "#FF8AA7",
        "#FFBB8A",
        "#F5FF8A",
        "#ACFF8A",
        "#8AF0FF",
        "#8A99FF",
        "#D18AFF",
        "#FF8ADF",
    ),
))

# Network Visualization of Media with shared actors
def create_media_network(df, network, num_nodes):
    """Add the first ``num_nodes`` titles of ``df`` to ``network`` and link
    every pair of titles that shares at least one name in 'stars'.

    Args:
        df: DataFrame with 'title', 'year' and 'stars' columns, addressed by
            the labels 0..num_nodes-1.
        network: Graph object providing ``add_node`` and ``add_edge``.
        num_nodes: Number of rows to turn into nodes.

    Returns:
        The same ``network`` object, with nodes coloured by the first three
        digits of the year and black edges weighted by shared names.
    """
    # Creating Nodes
    for row in tqdm(range(num_nodes)):
        decade_key = int(str(df.loc[row, 'year'])[:3])
        # Years outside the palette use the fallback entry stored under 0.
        node_color = colors[decade_key] if decade_key in colors else colors[0]
        network.add_node(df.loc[row, 'title'], color=node_color)

    # Creating Edges
    for first in tqdm(range(num_nodes)):
        for second in range(first + 1, num_nodes):
            shared = sum(
                1 for actor in df.loc[first, 'stars']
                if actor in df.loc[second, 'stars']
            )
            if shared == 0:
                continue
            # The first shared name contributes 2, each further one adds 5.
            weight = 2 + 5 * (shared - 1)
            network.add_edge(
                df.loc[first, 'title'],
                df.loc[second, 'title'],
                weight=weight,
                color='black',
            )
    return network

# Creating the networks (one fresh multigraph per dataset)
movie_network = create_media_network(movies, nx.MultiGraph(), 100)
tv_show_network = create_media_network(tv_shows, nx.MultiGraph(), 50)
combine_network = create_media_network(media, nx.MultiGraph(), 50)

# Scaling the nodes based on their degree
scale = 4
d_movie = {node: scale * degree for node, degree in movie_network.degree}
d_tv_show = {node: scale * degree for node, degree in tv_show_network.degree}
d_combine = {node: scale * degree for node, degree in combine_network.degree}

# Setting up size attribute
nx.set_node_attributes(movie_network, d_movie, name='size')
nx.set_node_attributes(tv_show_network, d_tv_show, name='size')
nx.set_node_attributes(combine_network, d_combine, name='size')

# Creating the Pyvis Networks
nt_movie = Network(height="1000px", width="1000px")
nt_tv_show = Network(height="1000px", width="1000px")
nt_combine = Network(height="1000px", width="1000px")

_media_exports = (
    (nt_movie, movie_network, "nx_movie.html"),
    (nt_tv_show, tv_show_network, "nx_tv_show.html"),
    (nt_combine, combine_network, "nx_combine.html"),
)

# Copy each networkx graph into its pyvis counterpart.
for _pyvis_net, _graph, _ in _media_exports:
    _pyvis_net.from_nx(_graph)

# Expose the physics controls in the rendered page.
for _pyvis_net, _, _ in _media_exports:
    _pyvis_net.show_buttons(filter_='physics')

# Write the HTML files.
for _pyvis_net, _, _filename in _media_exports:
    _pyvis_net.show(_filename)

100%|███████████████████████████████████████| 100/100 [00:00<00:00, 3512.46it/s]
100%|███████████████████████████████████████| 100/100 [00:00<00:00, 1305.07it/s]
100%|████████████████████████████████████████| 50/50 [00:00<00:00, 30808.76it/s]
100%|█████████████████████████████████████████| 50/50 [00:00<00:00, 2890.15it/s]
100%|████████████████████████████████████████| 50/50 [00:00<00:00, 78486.23it/s]
100%|█████████████████████████████████████████| 50/50 [00:00<00:00, 2790.81it/s]


In [30]:
# Network Visualization of Actors with Shared Media

def create_actor_network(actors, df, network, num_nodes):
    """Add actors to ``network`` and link those who appear together.

    Args:
        actors: List of actor names.
        df: DataFrame whose 'stars' column holds a collection of names per
            row; rows are visited in positional order.
        network: Multigraph-like object providing ``add_node``, ``has_edge``,
            ``get_edge_data`` (edge data keyed by 0), ``remove_edge`` and
            ``add_edge``.
        num_nodes: Number of leading entries of ``actors`` to process.

    Returns:
        The same ``network`` object. A new edge starts at weight 1; each
        further time the pair is encountered, the edge is replaced by one
        whose weight is the previous weight plus 3.
    """
    # Creating Nodes
    for idx in range(num_nodes):
        network.add_node(actors[idx])

    # Set membership replaces a list scan for every co-star, and the stars
    # column is read once instead of once per (actor, row) pair.
    known_actors = set(actors)
    casts = df['stars'].tolist()

    # Creating Edges
    for idx in range(num_nodes):
        actor = actors[idx]
        for cast in casts:
            if actor not in cast:
                continue
            for co_star in cast:
                if co_star == actor or co_star not in known_actors:
                    continue
                w = 1
                if network.has_edge(actor, co_star):
                    e = network.get_edge_data(actor, co_star)
                    w += e[0]["weight"] + 2
                    # Replace the edge so only one exists per pair.
                    network.remove_edge(actor, co_star)
                network.add_edge(actor, co_star, weight=w)
    return network

# Build one actor co-appearance graph per dataset.
movie_actor_network = nx.MultiGraph()
movie_actor_network = create_actor_network(movie_actors, movies, movie_actor_network, len(movie_actors))

tv_actor_network = nx.MultiGraph()
tv_actor_network = create_actor_network(tv_actors, tv_shows, tv_actor_network, len(tv_actors))

media_actor_network = nx.MultiGraph()
media_actor_network = create_actor_network(media_actors, media, media_actor_network, len(media_actors))

# Scaling the nodes based on their degree
scale = 4
d_movie_actor = {node: scale * degree for node, degree in movie_actor_network.degree}
d_tv_actor = {node: scale * degree for node, degree in tv_actor_network.degree}
d_media_actor = {node: scale * degree for node, degree in media_actor_network.degree}

# Setting up size attribute (once per network)
nx.set_node_attributes(movie_actor_network, d_movie_actor, 'size')
nx.set_node_attributes(tv_actor_network, d_tv_actor, 'size')
nx.set_node_attributes(media_actor_network, d_media_actor, 'size')

# Converting to Pyvis Network
nt_movie_actor = Network("1000px", "1000px")
nt_movie_actor.from_nx(movie_actor_network)
nt_movie_actor.show_buttons(filter_='physics')
nt_movie_actor.show("nx_movie_actor.html")

nt_tv_actor = Network("1000px", "1000px")
nt_tv_actor.from_nx(tv_actor_network)
nt_tv_actor.show_buttons(filter_='physics')
nt_tv_actor.show("nx_tv_actor.html")

nt_media_actor = Network("1000px", "1000px")
nt_media_actor.from_nx(media_actor_network)
nt_media_actor.show_buttons(filter_='physics')
nt_media_actor.show("nx_media_actor.html")

In [None]:
def plot_genre_v_rating(df, title):
    """Plot the mean rating per genre with standard-deviation error bars.

    Args:
        df: DataFrame with a 'genre' column (an iterable of genre names per
            row) and a 'rating' column.
        title: Title shown above the figure.

    Returns:
        A DataFrame indexed by genre, in first-seen order, with float columns
        'mean', 'std' (``np.std`` with its default settings) and 'count'.
        The columns are present even when ``df`` has no rows.
    """
    # Group ratings by genre; a row contributes to each of its genres.
    ratings_by_genre = {}
    for genres, rating in zip(df['genre'], df['rating']):
        for genre in genres:
            ratings_by_genre.setdefault(genre, []).append(rating)

    # Build the summary in one shot instead of growing it cell by cell, so
    # the three columns always exist.
    grouped = list(ratings_by_genre.values())
    genre_df = pd.DataFrame(
        {
            'mean': [np.mean(ratings) for ratings in grouped],
            'std': [np.std(ratings) for ratings in grouped],
            'count': [len(ratings) for ratings in grouped],
        },
        index=list(ratings_by_genre),
        columns=['mean', 'std', 'count'],
        dtype=float,
    )

    plt.figure(figsize=(30, 10))
    plt.bar(x=genre_df.index, height=genre_df['mean'])
    # Error bars are drawn at the bar positions 0..n-1.
    plt.errorbar(range(len(genre_df)), genre_df['mean'], yerr=genre_df['std'], fmt='or')
    plt.xlim(-0.5, len(genre_df) - .5)
    plt.title(title)
    plt.xlabel('genre')
    plt.ylabel('rating')
    plt.show()

    return genre_df

# Genre/rating summaries for the whole dataset and for each media type.
combined_genre_df = plot_genre_v_rating(df, 'Combined')
tv_genre_df = plot_genre_v_rating(df[df['is_tv'] == True], 'TV Shows')
movie_genre_df = plot_genre_v_rating(df[df['is_tv'] == False], 'Movies')

In [None]:
# One row per actor, one column per genre, counting appearances.
# Start from integer zeros rather than an all-NaN object frame that is
# filled afterwards.
actor_df = pd.DataFrame(0, index=all_actors, columns=combined_genre_df.index)

for _, row in tqdm(df.iterrows(), total=len(df)):
    for actor in row['stars']:
        for genre in row['genre']:
            actor_df.at[actor, genre] += 1

for genre in actor_df.columns:
    # Only actors with at least one appearance in this genre are averaged.
    counts = [count for count in actor_df[genre] if count != 0]
    print(f'Mean of {genre.lower()} movies for actors with at least 1: {np.mean(counts)}')
    print(f'Std dev of {genre.lower()} movies for actors with at least 1: {np.std(counts)}\n')
    