In [1]:
import pandas as pd
from d3blocks import D3Blocks

# Load - Data

In [99]:
def read_data(path):
    """Load the pickled object stored at ``path`` and return it.

    Thin wrapper around ``pd.read_pickle``. Unpickling can execute arbitrary
    code, so only use this on files you created yourself.
    """
    loaded = pd.read_pickle(path)
    return loaded

In [100]:
# Cast table, loaded from a local pickle cache.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
df_cast = read_data('/Users/saho/Documents/sam/imdb_actor_graph/cached_data/cast.pickle')


In [101]:
# Movie table, loaded from a local pickle cache.
# NOTE(review): absolute, machine-specific path; this cell only runs where that file exists.
df_movies = read_data('/Users/saho/Documents/sam/imdb_actor_graph/cached_data/movies.pickle')

# Transform - Select Data

In [102]:
def _mask_range(df,col,start,end):
    return df[(df[col] >= start) & (df[col] <= end)]

In [103]:
def _mask_value(df,col,value):
    return df[(df[col] == value)]

In [104]:
# Inclusive release-year window. `_select_movie_data` reads these two names as
# module-level globals, so they must be defined before that function is called.
year_start, year_end = 1990,2023

In [105]:
# Restrict movies to the inclusive [year_start, year_end] release-year window.
# This rebinds df_movies; the unfiltered frame is only recoverable by re-running the load cell.
df_movies = _mask_range(df_movies,'m_release_year',year_start,year_end)

In [106]:
df_movies

Unnamed: 0,m_tmdb_id,m_movie,m_overview,m_popularity,m_release_date,m_vote_average,m_poster_path,m_release_year,m_keywords,m_results,m_budget,m_genres,m_production_countries,m_revenue,m_tagline
0,12445,Harry Potter and the Deathly Hallows: Part 2,"Harry, Ron and Hermione continue their quest t...",182.818,2011-07-07,8.1,/c54HpQmuwXjHq2C9wmoACjxoom3.jpg,2011,"['saving the world', 'witch', 'self sacrifice'...","[""It is the quality of one's convictions that ...",125000000.0,"['Fantasy', 'Adventure']","['United Kingdom', 'United States of America']",1.341511e+09,It all ends here.
1,49013,Cars 2,Star race car Lightning McQueen and his pal Ma...,169.573,2011-06-11,6.1,/okIz1HyxeVOMzYwwHUjH2pHi74I.jpg,2011,"['car race', 'sequel', 'anthropomorphism', 'be...",['Lasseter is smart enough to tell us a comple...,200000000.0,"['Animation', 'Family', 'Adventure', 'Comedy']",['United States of America'],5.598524e+08,Ka-ciao!
2,50014,The Help,Aibileen Clark is a middle-aged African-Americ...,168.317,2011-08-09,8.2,/3kmfoWWEc9Vtyuaf9v5VipRgdjx.jpg,2011,"['mississippi river', 'based on novel or book'...","['Great setting, cast, story and performances....",25000000.0,['Drama'],['United States of America'],2.166391e+08,Change begins with a whisper.
3,1865,Pirates of the Caribbean: On Stranger Tides,Captain Jack Sparrow crosses paths with a woma...,160.529,2011-05-14,6.5,/keGfSvCmYj7CvdRx36OdVrAEibE.jpg,2011,"['england', 'spain', 'sea', 'captain', 'mutiny...",['More of the same ... but it is not funny any...,380000000.0,"['Adventure', 'Action', 'Fantasy']","['United Kingdom', 'United States of America']",1.045714e+09,Live Forever Or Die Trying.
4,39254,Real Steel,Charlie Kenton is a washed-up fighter who reti...,130.738,2011-09-28,7.0,/4GIeI5K5YdDUkR3mNQBoScpSFEf.jpg,2011,"['parent child relationship', 'fight', 'future...","['No splitting this Atom, it has got a rock so...",110000000.0,"['Action', 'Science Fiction', 'Drama']","['United States of America', 'India']",2.992685e+08,"If you get one shot, make it real."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9808,818612,Wedding Season,Pressured by their immigrant parents to find s...,11.387,2022-08-04,6.3,/mFeXAZ1oOECPqEu8c2i4L5LmNyY.jpg,2022,[],[],0.0,"['Romance', 'Comedy']","['Canada', 'United States of America']",0.000000e+00,
9809,973164,Ricky Gervais: SuperNature,"With his signature pitch-black sense of humor,...",10.436,2022-05-24,7.3,/ry2HDP5KyEV3R7htXlFAeQbbXgf.jpg,2022,['stand-up comedy'],[],0.0,['Comedy'],['United Kingdom'],0.000000e+00,
9810,957258,What is a Woman?,Political commentator Matt Walsh explores the ...,10.299,2022-06-01,6.9,/iiP8Sq7yWoYsKx9EVfyRPno8Un9.jpg,2022,"['womanhood', 'gender', 'social documentary', ...",[],0.0,['Documentary'],['United States of America'],0.000000e+00,One man's journey to answer the question of a ...
9811,971961,Gabriel's Rapture: Part III,In the sixth installment of the Gabriel's Infe...,9.950,2022-08-12,8.1,/vyLRyHRrPy9zTf6t0sS4aTbF9gl.jpg,2022,[],[],0.0,['Romance'],[],0.000000e+00,


In [107]:
def _select_movie_data(df_movies,
                          revenue_low,revenue_high,
                          budget_low,budget_high,
                         vote_low, vote_high):
    """Filter movies by release year, revenue, budget and vote average.

    Every filter is an inclusive range applied with ``_mask_range``. A filter
    is skipped only when its lower bound is ``None``. A lower bound of ``0``
    is a real bound and is applied; a plain truthiness test would silently
    skip it, and the column minimum is often exactly ``0``.

    The release-year bounds are not parameters: they are read from the
    module-level names ``year_start`` and ``year_end``, which must exist
    before this function is called.

    Parameters
    ----------
    df_movies : pandas.DataFrame
        Must contain the columns ``m_release_year``, ``m_revenue``,
        ``m_budget`` and ``m_vote_average``.
    revenue_low, revenue_high : number or None
        Inclusive bounds on ``m_revenue``.
    budget_low, budget_high : number or None
        Inclusive bounds on ``m_budget``.
    vote_low, vote_high : number or None
        Inclusive bounds on ``m_vote_average``.

    Returns
    -------
    pandas.DataFrame
        The rows that pass every applied filter. Rows whose value is missing
        in a filtered column do not pass that filter.
    """
    # `is not None` (rather than truthiness) so that a bound of 0 still filters.
    if year_start is not None:
        df_movies = _mask_range(df_movies, 'm_release_year', year_start, year_end)
    if revenue_low is not None:
        df_movies = _mask_range(df_movies, 'm_revenue', revenue_low, revenue_high)
    if budget_low is not None:
        df_movies = _mask_range(df_movies, 'm_budget', budget_low, budget_high)
    if vote_low is not None:
        df_movies = _mask_range(df_movies, 'm_vote_average', vote_low, vote_high)

    return df_movies

In [108]:
# Use the observed min/max of each column as the filter bounds, i.e. the widest
# range present in the current df_movies.
revenue_low = df_movies.m_revenue.min()
revenue_high = df_movies.m_revenue.max()
budget_low = df_movies.m_budget.min()
budget_high = df_movies.m_budget.max()
vote_low = df_movies.m_vote_average.min()
vote_high = df_movies.m_vote_average.max()

In [109]:
# Apply the movie filters. Positional order after the frame is:
# revenue (low, high), budget (low, high), vote average (low, high).
df_masked_movies = _select_movie_data(df_movies,
                                      revenue_low,
                                      revenue_high,
                                      budget_low,
                                      budget_high,
                                      vote_low,
                                      vote_high)

In [110]:
def _select_cast_data(df_cast,popularity_low=None,popularity_high=None):
    if popularity_low:
        df_cast = _mask_range(df_cast,'c_popularity',popularity_low, popularity_high)
    return df_cast

In [111]:
# Popularity window for cast rows (both bounds inclusive); the shape is shown as a size check.
popularity_low=5
popularity_high=100

df_masked_cast = _select_cast_data(df_cast,popularity_low,popularity_high)
df_masked_cast.shape

(11373, 13)

# Transform - Join Movie and Cast Data

In [112]:
def join_movies_cast(df_masked_cast, df_masked_movies):
    """Inner-join cast rows to movie rows.

    Rows are matched where the cast frame's ``c_tmdb_id`` equals the movie
    frame's ``m_tmdb_id``; cast rows without a matching movie (and movies
    without cast rows) are dropped. Returns the merged frame.
    """
    return df_masked_cast.merge(
        df_masked_movies,
        how='inner',
        left_on='c_tmdb_id',
        right_on='m_tmdb_id',
    )
    

In [113]:
# Attach movie columns to the filtered cast rows; the shape is shown as a size check.
df_cast_movies = join_movies_cast(df_masked_cast, df_masked_movies)
df_cast_movies.shape

(11373, 28)

In [17]:
df_cast_movies.head()

Unnamed: 0,c_id,c_known_for_department,c_name,c_original_name,c_popularity,c_profile_path,c_cast_id,c_character,c_credit_id,c_order,...,m_vote_average,m_poster_path,m_release_year,m_keywords,m_results,m_budget,m_genres,m_production_countries,m_revenue,m_tagline
0,6384,Acting,Keanu Reeves,Keanu Reeves,78.87,/4D0PpNI0kmP58hgrwGC3wCjxhnm.jpg,6,John Wick,52fe4f0cc3a36847f82b9c41,0,...,7.4,/nCzzQKGijVzfGFg42id7uDLOjY5.jpg,2014,"['hitman', 'gangster', 'secret organization', ...",['This is very much my kind of movie. Straight...,20000000.0,"['Action', 'Thriller']",['United States of America'],88761661.0,Don't set him off.
1,6972,Acting,Ian McShane,Ian McShane,40.512,/q9qKbux5Jo76Sj8g3luxBt6rYtz.jpg,24,Winston,544b9592c3a3686ac1000876,2,...,7.4,/nCzzQKGijVzfGFg42id7uDLOjY5.jpg,2014,"['hitman', 'gangster', 'secret organization', ...",['This is very much my kind of movie. Straight...,20000000.0,"['Action', 'Thriller']",['United States of America'],88761661.0,Don't set him off.
2,5723,Acting,John Leguizamo,John Leguizamo,43.854,/kEbXDtcPJ46GEhj5QlCTLZS6PWE.jpg,19,Aurelio,544ab810c3a3680fbb004280,3,...,7.4,/nCzzQKGijVzfGFg42id7uDLOjY5.jpg,2014,"['hitman', 'gangster', 'secret organization', ...",['This is very much my kind of movie. Straight...,20000000.0,"['Action', 'Thriller']",['United States of America'],88761661.0,Don't set him off.
3,5293,Acting,Willem Dafoe,Willem Dafoe,40.503,/ui8e4sgZAwMPi3hzEO53jyBJF9B.jpg,9,Marcus,52fe4f0cc3a36847f82b9c4d,4,...,7.4,/nCzzQKGijVzfGFg42id7uDLOjY5.jpg,2014,"['hitman', 'gangster', 'secret organization', ...",['This is very much my kind of movie. Straight...,20000000.0,"['Action', 'Thriller']",['United States of America'],88761661.0,Don't set him off.
4,129101,Acting,Lance Reddick,Lance Reddick,33.201,/22mVtEXZbpt0J7S0LhIhdkfRrZV.jpg,26,Hotel Manager / Charon,544b95bac3a3680fb1006fd5,8,...,7.4,/nCzzQKGijVzfGFg42id7uDLOjY5.jpg,2014,"['hitman', 'gangster', 'secret organization', ...",['This is very much my kind of movie. Straight...,20000000.0,"['Action', 'Thriller']",['United States of America'],88761661.0,Don't set him off.


# Transform Cast Data

In [18]:
import itertools

In [19]:
def _get_nested_cast(df_cast_movies):
    df_nested = pd.DataFrame(df_cast_movies.groupby('m_movie')['c_name'].agg(lambda x: list(x)))
    solo_cast_mask = df_nested.c_name.apply(lambda x: len(x)>=2)
    df_nested = df_nested[solo_cast_mask]
    return df_nested

In [20]:
# One row per movie title holding its list of cast names (movies with at least two names only).
df_nested = _get_nested_cast(df_cast_movies)

In [21]:
df_nested

Unnamed: 0_level_0,c_name
m_movie,Unnamed: 1_level_1
10 Cloverfield Lane,"[Mary Elizabeth Winstead, John Goodman, Bradle..."
"10,000 Saints","[Hailee Steinfeld, Emile Hirsch]"
100 Streets,"[Idris Elba, Gemma Arterton]"
10x10,"[Luke Evans, Kelly Reilly]"
12 Strong,"[Chris Hemsworth, Elsa Pataky, Taylor Sheridan..."
...,...
Zoe,"[Ewan McGregor, Theo James]"
Zombieland: Double Tap,"[Woody Harrelson, Emma Stone, Rosario Dawson, ..."
Zoolander 2,"[Owen Wilson, Penélope Cruz, Milla Jovovich, B..."
Zootopia,"[Jason Bateman, Idris Elba, Jenny Slate, Bonni..."


In [22]:
def _get_nested_cast_combinations(cast):
    if len(cast) > 1:
        actor_combinations = itertools.combinations(cast, 2)
        actor_combinations_sorted = [tuple(sorted(combination)) for combination in actor_combinations]
        return actor_combinations_sorted
    else:
        return cast


In [23]:
# Per movie, compute the list of cast-member pairs and store it in a new column.
# Note this adds a column to df_nested in place.
df_nested['all_combinations'] = df_nested.c_name.apply(_get_nested_cast_combinations)

In [24]:
df_nested.loc['Wonder Woman 1984']['all_combinations']

[('Chris Pine', 'Gal Gadot'),
 ('Connie Nielsen', 'Gal Gadot'),
 ('Chris Pine', 'Connie Nielsen')]

In [25]:
def _flatten_nested_cast_combinations(df_nested):
    flattened_combination_tuples = []
    for combinations_in_film in df_nested.all_combinations.values:
        for combination in combinations_in_film:
            flattened_combination_tuples.append(combination)
    return flattened_combination_tuples

In [26]:
# One flat list of pairs across all movies (a pair repeats once per movie it occurs in).
flattened_combination_tuples = _flatten_nested_cast_combinations(df_nested)

In [27]:
flattened_combination_tuples

[('John Goodman', 'Mary Elizabeth Winstead'),
 ('Bradley Cooper', 'Mary Elizabeth Winstead'),
 ('Bradley Cooper', 'John Goodman'),
 ('Emile Hirsch', 'Hailee Steinfeld'),
 ('Gemma Arterton', 'Idris Elba'),
 ('Kelly Reilly', 'Luke Evans'),
 ('Chris Hemsworth', 'Elsa Pataky'),
 ('Chris Hemsworth', 'Taylor Sheridan'),
 ('Chris Hemsworth', 'William Fichtner'),
 ('Elsa Pataky', 'Taylor Sheridan'),
 ('Elsa Pataky', 'William Fichtner'),
 ('Taylor Sheridan', 'William Fichtner'),
 ('Mark Strong', 'Richard Madden'),
 ('Benedict Cumberbatch', 'Mark Strong'),
 ('Benedict Cumberbatch', 'Richard Madden'),
 ('J.K. Simmons', 'Taylor Kitsch'),
 ('J.K. Simmons', 'Keith David'),
 ('Keith David', 'Taylor Kitsch'),
 ('Channing Tatum', 'Peter Stormare'),
 ('Hailee Steinfeld', 'Kevin Costner'),
 ('Connie Nielsen', 'Kevin Costner'),
 ('Amber Heard', 'Kevin Costner'),
 ('Connie Nielsen', 'Hailee Steinfeld'),
 ('Amber Heard', 'Hailee Steinfeld'),
 ('Amber Heard', 'Connie Nielsen'),
 ('Elle Fanning', 'Susan Saran

In [28]:
def _get_combinations_dict(flattened_combination_tuples):
    combinations_dict = {}
    for flattened_combination_tuple in flattened_combination_tuples:
        if flattened_combination_tuple in combinations_dict:
            combinations_dict[flattened_combination_tuple] +=1
        else:
            combinations_dict[flattened_combination_tuple] =1
    return combinations_dict
    

In [29]:
# Pair -> number of occurrences in the flattened list.
combinations_dict = _get_combinations_dict(flattened_combination_tuples)

In [30]:
combinations_dict


{('John Goodman', 'Mary Elizabeth Winstead'): 1,
 ('Bradley Cooper', 'Mary Elizabeth Winstead'): 1,
 ('Bradley Cooper', 'John Goodman'): 1,
 ('Emile Hirsch', 'Hailee Steinfeld'): 1,
 ('Gemma Arterton', 'Idris Elba'): 1,
 ('Kelly Reilly', 'Luke Evans'): 1,
 ('Chris Hemsworth', 'Elsa Pataky'): 3,
 ('Chris Hemsworth', 'Taylor Sheridan'): 1,
 ('Chris Hemsworth', 'William Fichtner'): 1,
 ('Elsa Pataky', 'Taylor Sheridan'): 1,
 ('Elsa Pataky', 'William Fichtner'): 1,
 ('Taylor Sheridan', 'William Fichtner'): 1,
 ('Mark Strong', 'Richard Madden'): 1,
 ('Benedict Cumberbatch', 'Mark Strong'): 2,
 ('Benedict Cumberbatch', 'Richard Madden'): 1,
 ('J.K. Simmons', 'Taylor Kitsch'): 1,
 ('J.K. Simmons', 'Keith David'): 1,
 ('Keith David', 'Taylor Kitsch'): 1,
 ('Channing Tatum', 'Peter Stormare'): 1,
 ('Hailee Steinfeld', 'Kevin Costner'): 1,
 ('Connie Nielsen', 'Kevin Costner'): 2,
 ('Amber Heard', 'Kevin Costner'): 2,
 ('Connie Nielsen', 'Hailee Steinfeld'): 1,
 ('Amber Heard', 'Hailee Steinfeld'

In [31]:
def _get_d3_dataframe(combinations_dict):
    df_d3 = pd.DataFrame(combinations_dict.items())
    df_d3 = pd.concat([df_d3[0].apply(pd.Series),df_d3[1]],axis=1)
    df_d3.columns =['source','target','weight']
    return df_d3
    

In [32]:
# Edge list with columns source, target, weight (weight = pair count).
df_d3 = _get_d3_dataframe(combinations_dict)

In [33]:
def _get_edge_frequency_dict(df_d3):
    x = df_d3.source.value_counts().to_dict()
    y = df_d3.target.value_counts().to_dict()
    result = {key: x.get(key, 0) + y.get(key, 0)for key in x}
    return result

In [34]:
# Edge-list row counts per actor, used below to threshold the graph.
edge_frequency_dict = _get_edge_frequency_dict(df_d3)

In [35]:
def _mask_on_actor_edge_frequency(df_d3, edge_frequency_dict, min_threshold):
    actors_to_mask = [k for k, v in edge_frequency_dict.items() if v > min_threshold]
    mask = df_d3['source'].isin(actors_to_mask) & df_d3['target'].isin(actors_to_mask)
    return df_d3[mask].reset_index(drop=True)

    
    

In [36]:
df_d3

Unnamed: 0,source,target,weight
0,John Goodman,Mary Elizabeth Winstead,1
1,Bradley Cooper,Mary Elizabeth Winstead,1
2,Bradley Cooper,John Goodman,1
3,Emile Hirsch,Hailee Steinfeld,1
4,Gemma Arterton,Idris Elba,1
...,...,...,...
4320,Donnie Yen,Vin Diesel,1
4321,Nina Dobrev,Vin Diesel,1
4322,Donnie Yen,Nina Dobrev,1
4323,Donnie Yen,Samuel L. Jackson,1


In [37]:
# Keep only edges whose two actors each have an edge count strictly greater than 6.
df_d3_masked = _mask_on_actor_edge_frequency(df_d3, edge_frequency_dict,6)

In [38]:
df_d3_masked

Unnamed: 0,source,target,weight
0,John Goodman,Mary Elizabeth Winstead,1
1,Bradley Cooper,Mary Elizabeth Winstead,1
2,Bradley Cooper,John Goodman,1
3,Emile Hirsch,Hailee Steinfeld,1
4,Gemma Arterton,Idris Elba,1
...,...,...,...
3605,Bonnie Hunt,Jenny Slate,1
3606,J.K. Simmons,Jenny Slate,1
3607,Bonnie Hunt,J.K. Simmons,1
3608,Nina Dobrev,Vin Diesel,1


# Graph Metrics

In [89]:
import networkx as nx
class ActorGraphMetrics:
    """Build a graph from an edge list and tabulate per-node metrics.

    The constructor does all the work: it stores the edge list, builds the
    graph, and computes the metrics table, which is then available through
    ``get_actor_attributes``.

    Attributes
    ----------
    df : pandas.DataFrame
        The edge list passed in; must have ``source``, ``target`` and
        ``weight`` columns.
    graph : networkx graph
        Graph built from ``df`` with ``weight`` stored as an edge attribute.
    df_actor_attributes : pandas.DataFrame
        One row per node with an ``Actor`` column plus the four metrics,
        rounded to 3 decimals.
    """

    def __init__(self, df):
        self.df = df
        self.graph = self._create_graph()
        self.df_actor_attributes = self._calculate_actor_attributes()

    def _create_graph(self):
        """Return the graph built from ``self.df``'s source/target/weight columns."""
        return nx.from_pandas_edgelist(self.df, 'source', 'target', 'weight')

    def _calculate_actor_attributes(self):
        """Compute the per-node metrics table.

        No ``weight=`` argument is passed to any metric function, so each one
        runs with networkx's defaults. ``nx.eigenvector_centrality`` uses
        power iteration and can raise if it does not converge.
        """
        # (output column, networkx function) in the column order of the result.
        metric_functions = (
            ('DegreeCentrality', nx.degree_centrality),
            ('BetweennessCentrality', nx.betweenness_centrality),
            ('EigenvectorCentrality', nx.eigenvector_centrality),
            ('ClusteringCoefficient', nx.clustering),
        )

        metrics_table = pd.DataFrame(index=self.graph.nodes)
        for column_name, compute_metric in metric_functions:
            # Each function returns {node: value}; the Series aligns on the node index.
            metrics_table[column_name] = pd.Series(compute_metric(self.graph))

        rounded = metrics_table.round(3)
        return rounded.reset_index().rename(columns={'index': 'Actor'})

    def get_actor_attributes(self):
        """Return the metrics table computed at construction time."""
        return self.df_actor_attributes


In [90]:
# Build the graph from the thresholded edge list and fetch the per-actor metrics table.
actor_metrics = ActorGraphMetrics(df_d3_masked)
df_actor_attributes = actor_metrics.get_actor_attributes()


def create_actor_graph_metrics_dict(df):
    """Convert a metrics table into ``{actor_key: {metric: value}}``.

    Parameters
    ----------
    df : pandas.DataFrame
        The first column holds the actor name (a string); every remaining
        column is treated as a metric.

    Returns
    -------
    dict
        Keys are the actor names with spaces replaced by underscores; values
        map each remaining column name to that row's value. If two rows
        produce the same key, the later row wins.
    """
    actor_dict = {}
    for _, row in df.iterrows():
        # .iloc makes the positional access explicit: `row[0]` on a Series with
        # string labels relies on a deprecated positional fallback.
        actor = row.iloc[0].replace(" ", "_")
        actor_dict[actor] = dict(row.iloc[1:].items())
    return actor_dict

# Nested dict: actor name (spaces replaced by underscores) -> {metric name: value}.
actor_graph_metrics_dict = create_actor_graph_metrics_dict(df_actor_attributes)




In [None]:
def append_actor_url(df_actor_attributes,df_cast_movies, imdb_image_path):
    """Add each actor's profile-image URL to the metrics table.

    Parameters
    ----------
    df_actor_attributes : pandas.DataFrame
        Must contain an ``Actor`` column.
    df_cast_movies : pandas.DataFrame
        Must contain ``c_name`` and ``c_profile_path`` columns. When a name
        occurs several times, its first ``c_profile_path`` is used.
    imdb_image_path : str
        Base URL that is placed in front of each profile path.

    Returns
    -------
    pandas.DataFrame
        ``df_actor_attributes`` plus a ``c_profile_path`` column holding
        ``imdb_image_path + <profile path>``. Missing profile paths are left
        missing. The merge is an inner join, so actors with no matching
        ``c_name`` are dropped from the result.
    """
    profile_paths = (
        df_cast_movies[['c_name', 'c_profile_path']]
        .drop_duplicates(subset='c_name', keep='first')
    )
    with_paths = (
        df_actor_attributes
        .merge(profile_paths, left_on='Actor', right_on='c_name')
        .drop(columns='c_name')
    )
    # The base URL goes first, then the file path. Non-string entries (missing
    # paths) are passed through untouched instead of raising on concatenation.
    with_paths['c_profile_path'] = with_paths['c_profile_path'].map(
        lambda path: imdb_image_path + path if isinstance(path, str) else path
    )
    return with_paths
    

In [92]:
df_actor_attributes

Unnamed: 0,Actor,DegreeCentrality,BetweennessCentrality,EigenvectorCentrality,ClusteringCoefficient
0,John Goodman,0.088,0.004,0.046,0.185
1,Mary Elizabeth Winstead,0.050,0.002,0.020,0.133
2,Bradley Cooper,0.185,0.011,0.160,0.369
3,Emile Hirsch,0.060,0.003,0.023,0.292
4,Hailee Steinfeld,0.100,0.005,0.058,0.234
...,...,...,...,...,...
315,Aimee Garcia,0.022,0.000,0.012,0.476
316,Denzel Washington,0.025,0.001,0.008,0.143
317,Asher Angel,0.019,0.000,0.004,1.000
318,Kate Beckinsale,0.009,0.000,0.002,0.000


Unnamed: 0,Actor,DegreeCentrality,BetweennessCentrality,EigenvectorCentrality,ClusteringCoefficient,c_profile_path
0,John Goodman,0.088,0.004,0.046,0.185,/yyYqoyKHO7hE1zpgEV2XlqYWcNV.jpg
1,Mary Elizabeth Winstead,0.050,0.002,0.020,0.133,/vQn6IGsClpyhV6KTba9EDqSK7e2.jpg
2,Bradley Cooper,0.185,0.011,0.160,0.369,/DPnessSsWqVXRbKm93PtMjB4Us.jpg
3,Emile Hirsch,0.060,0.003,0.023,0.292,/Ah8BYOB7P8tX9jnQVrnAbL2kxYJ.jpg
4,Hailee Steinfeld,0.100,0.005,0.058,0.234,/q4UpZMEuvNCN5lL5L6xa3ICpheJ.jpg
...,...,...,...,...,...,...
315,Aimee Garcia,0.022,0.000,0.012,0.476,/4z3mkoa8jHpqjzfbhzYMeKe2Mf7.jpg
316,Denzel Washington,0.025,0.001,0.008,0.143,/cEU2Vrdo83izpGmOvbVAOz5jCof.jpg
317,Asher Angel,0.019,0.000,0.004,1.000,/lgBt67iggDs0d8QBSyjdk2ytHtK.jpg
318,Kate Beckinsale,0.009,0.000,0.002,0.000,/CzTuSQ7jfqBF3x4SX7NQH79BKf.jpg


In [41]:
actor_graph_metrics_dict

{'John_Goodman': {'DegreeCentrality': 0.088,
  'BetweennessCentrality': 0.004,
  'EigenvectorCentrality': 0.046,
  'ClusteringCoefficient': 0.185},
 'Mary_Elizabeth_Winstead': {'DegreeCentrality': 0.05,
  'BetweennessCentrality': 0.002,
  'EigenvectorCentrality': 0.02,
  'ClusteringCoefficient': 0.133},
 'Bradley_Cooper': {'DegreeCentrality': 0.185,
  'BetweennessCentrality': 0.011,
  'EigenvectorCentrality': 0.16,
  'ClusteringCoefficient': 0.369},
 'Emile_Hirsch': {'DegreeCentrality': 0.06,
  'BetweennessCentrality': 0.003,
  'EigenvectorCentrality': 0.023,
  'ClusteringCoefficient': 0.292},
 'Hailee_Steinfeld': {'DegreeCentrality': 0.1,
  'BetweennessCentrality': 0.005,
  'EigenvectorCentrality': 0.058,
  'ClusteringCoefficient': 0.234},
 'Gemma_Arterton': {'DegreeCentrality': 0.038,
  'BetweennessCentrality': 0.001,
  'EigenvectorCentrality': 0.017,
  'ClusteringCoefficient': 0.333},
 'Idris_Elba': {'DegreeCentrality': 0.185,
  'BetweennessCentrality': 0.016,
  'EigenvectorCentrali

## Show

In [42]:
def _cache_d3_network_plot(df_d3_masked,
                           actor_graph_metrics_dict,
                           edge_distance=100,
                           node_size=10,
                           fontsize=8,
                           graph_metric=None,
                           filepath='/Users/saho/Documents/sam/imdb_actor_graph/experimentl_html/test.html',
                           title='dsfsdf'):
    """Render the edge list as a D3 network graph and write it to an HTML file.

    Parameters
    ----------
    df_d3_masked : pandas.DataFrame
        Edge list handed to ``D3Blocks.d3graph``.
    actor_graph_metrics_dict : dict
        ``{node_id: {metric_name: value}}``; only used when ``graph_metric``
        is given.
    edge_distance : int
        Passed to ``set_edge_properties``.
    node_size : int
        Default node size passed to ``set_node_properties``.
    fontsize : int
        Passed to ``set_node_properties``.
    graph_metric : str or None
        When given, each node's size is overwritten with that metric's value
        from ``actor_graph_metrics_dict``.
    filepath : str
        Where the HTML is written. The default is the path this notebook has
        always used (absolute and machine-specific); pass your own to make
        the call portable.
    title : str
        Title passed to ``d3graph``; the default is the original placeholder.

    Returns
    -------
    None
    """
    d3 = D3Blocks()
    d3.d3graph(df_d3_masked, title=title)
    d3.D3graph.set_edge_properties(directed=False, edge_distance=edge_distance)

    d3.D3graph.set_node_properties(color='cluster', size=node_size, edge_size=0.2, fontsize=fontsize)
    if graph_metric:
        # NOTE(review): assumes the node ids in node_properties match the keys
        # of actor_graph_metrics_dict (spaces replaced by underscores); confirm.
        for node_id, metrics in actor_graph_metrics_dict.items():
            try:
                d3.D3graph.node_properties[node_id]['size'] = metrics[graph_metric]
            except KeyError:
                # Best effort: skip actors that are not nodes in the graph or
                # that lack the requested metric, without hiding other errors.
                continue

    d3.D3graph.show(filepath=filepath)
    return

In [43]:
# Render the graph with node sizes taken from each actor's ClusteringCoefficient.
# The function has a bare `return`, so x is None.
x = _cache_d3_network_plot(df_d3_masked,actor_graph_metrics_dict,graph_metric='ClusteringCoefficient')

[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Set directed=True to see the markers!
[d3blocks] >INFO> Keep only edges with weight>0
[d3blocks] >INFO> Number of unique nodes: 320
[d3blocks] >INFO> Slider range is set to [0, 8]
[d3blocks] >INFO> Write to path: [/var/folders/l9/6wvd07bx0t59jfdht0vhh5h40000gq/T/tmpbx57m6ma/d3graph.html]
[d3blocks] >INFO> File already exists and will be overwritten: [/var/folders/l9/6wvd07bx0t59jfdht0vhh5h40000gq/T/tmpbx57m6ma/d3graph.html]
[d3blocks] >INFO> Set directed=True to see the markers!
[d3blocks] >INFO> Keep only edges with weight>0
[d3blocks] >INFO> Number of unique nodes: 320
[d3blocks] >INFO> Slider range is set to [0, 8]
[d3blocks] >INFO> Write to path: [/Users/saho/Documents/sam/imdb_actor_graph/experimentl_html/test.html]
[d3blocks] >INFO> File already exists and will be overwritten: [/Users/saho/Documents/sam/imdb_actor_graph/experimentl_html/test.html]


In [44]:
def _load_cached_file(save_path):
    with open(save_path, "r") as f:
        file = f.read()
    return file

# Show Common Movies

In [59]:
def _get_actor_co_star_dict(df_d3_masked):
    dic_source_target = dict(df_d3_masked.groupby('source')['target'].agg(lambda x: x.to_list()))
    dic_target_source = dict(df_d3_masked.groupby('target')['source'].agg(lambda x: x.to_list()))
    dic_source_target.update(dic_target_source)
    return dic_source_target

In [90]:
from collections import defaultdict

def _get_actor_co_star_dict(df_d3_masked):
    dic_merged = defaultdict(list)
    
    for source, target in zip(df_d3_masked['source'], df_d3_masked['target']):
        dic_merged[source].append(target)
        dic_merged[target].append(source)
    
    dic_source_target = dict(dic_merged)
    return dic_source_target


In [91]:
# Actor -> list of actors they share an edge with in the thresholded edge list.
actor_co_star_dict = _get_actor_co_star_dict(df_d3_masked)

In [93]:
def _find_common_movies(df, actor1, actor2):
    actor1_movies = set(df[df['c_name'] == actor1]['m_movie'])
    actor2_movies = set(df[df['c_name'] == actor2]['m_movie'])
    common_movies = actor1_movies.intersection(actor2_movies)
    return list(common_movies)

In [94]:
# Example lookup: titles in which both named actors have a cast row.
common_movies = _find_common_movies(df_cast_movies,'Chris Evans','Chris Hemsworth')

In [95]:
def _get_poster_paths(df, common_movies):
    res = []
    for movie in common_movies:
        poster_path = df[df['m_movie'] == movie]['m_poster_path'].unique()[0]
        res.append((movie,poster_path))
    return res
    
    

In [96]:
_get_poster_paths(df_cast_movies,common_movies)

[('Avengers: Infinity War', '/7WsyChQLEftFiDOVTGkv3hFpyyt.jpg'),
 ('Avengers: Endgame', '/or06FN3Dka5tukK1e9sl16pB3iy.jpg')]


1. Degree Centrality: Degree centrality measures how connected an actor is by looking at the number of co-stars they have worked with. Actors with a higher degree centrality value have worked with more co-stars.

2. Betweenness Centrality: Betweenness centrality quantifies the extent to which an actor acts as a bridge or intermediary between other actors in the graph. In real terms, an actor who has starred in a number of different franchises is likely to have a high betweenness centrality.

3. Eigenvector Centrality: Eigenvector centrality measures the influence of an actor based on both their direct connections and the connections of their co-stars. Actors with high eigenvector centrality are not only connected to many other actors but are also connected to other influential actors.

4. Clustering Coefficient: The clustering coefficient measures the tendency of actors to form tightly-knit groups or clusters. It quantifies how likely an actor's co-stars are also connected to each other. Actors with high clustering coefficients are part of cohesive groups within the network.

These metrics provide insights into various aspects of an actor's centrality, influence, connectedness, and clustering behavior within the actor graph. By analyzing these metrics, you can gain a better understanding of the importance and role of each actor in the network of collaborations.

# Gender Analysis

In [118]:
df_cast_movies[['c_gender','c_name','m_revenue']]

Unnamed: 0,c_gender,c_name,m_revenue
0,2.0,Johnny Depp,1.045714e+09
1,1.0,Penélope Cruz,1.045714e+09
2,2.0,Ian McShane,1.045714e+09
3,2.0,Gary Oldman,1.004558e+09
4,2.0,Aaron Eckhart,1.004558e+09
...,...,...,...
11368,2.0,Charlie Day,0.000000e+00
11369,,Jenny Slate,0.000000e+00
11370,,Isabel May,0.000000e+00
11371,2.0,Keanu Reeves,1.068063e+07
