In [1]:
import pandas as pd
from d3blocks import D3Blocks

# Load Data

In [2]:
def read_data(path):
    """Load a pickled pandas object from ``path`` and return it.

    NOTE: unpickling can execute arbitrary code, so only point this at
    files you trust.
    """
    loaded = pd.read_pickle(path)
    return loaded

In [3]:
# Cast table, read from the local pickle cache.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
df_cast = read_data('/Users/saho/Documents/sam/imdb_actor_graph/cached_data/cast.pickle')

In [4]:
# Movie table, read from the local pickle cache.
# NOTE(review): absolute, user-specific path — consider a configurable data directory.
df_movies = read_data('/Users/saho/Documents/sam/imdb_actor_graph/cached_data/movies.pickle')

# Select Data

In [5]:
def _mask_dataframe_time_span(df,year_start,year_end):
    """Keep the rows of ``df`` whose ``release_year`` lies in [year_start, year_end].

    Both bounds are inclusive.
    """
    in_span = df['release_year'].between(year_start, year_end)
    return df[in_span]

In [6]:
def _mask_dataframe_financial(df,col,low,high=None):
    """Keep the rows of ``df`` whose ``col`` value is at least ``low``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    col : str
        Name of the column to compare.
    low : number
        Inclusive lower bound.
    high : number, optional
        Inclusive upper bound. ``None`` (the default) applies no upper limit;
        the previous comparison against the column's own maximum never
        excluded any row, so it has been dropped.

    Returns
    -------
    pandas.DataFrame
        The matching rows. Rows whose ``col`` value is missing are excluded,
        because a comparison with NaN is False.
    """
    keep = df[col] >= low
    if high is not None:
        keep = keep & (df[col] <= high)
    return df[keep]

In [7]:
def select_data(df,year_start=None,year_end=None,revenue_low=None,budget_low=None):
    """Filter ``df`` by release-year span and minimum revenue / budget.

    Every filter is optional; a filter is applied whenever its argument is
    not ``None``. An explicit ``0`` is therefore honoured rather than being
    treated as "no filter".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``release_year``, ``revenue`` and ``budget`` columns
        (only the columns of the requested filters are read).
    year_start, year_end : number, optional
        Inclusive release-year bounds. Either may be given on its own; the
        missing side is then unbounded.
    revenue_low : number, optional
        Inclusive lower bound on ``revenue``.
    budget_low : number, optional
        Inclusive lower bound on ``budget``.

    Returns
    -------
    pandas.DataFrame
        The rows that pass all requested filters.
    """
    if year_start is not None or year_end is not None:
        # An infinite sentinel stands in for a missing side of the span.
        # NOTE(review): assumes release_year is numeric — confirm.
        span_start = float('-inf') if year_start is None else year_start
        span_end = float('inf') if year_end is None else year_end
        df = _mask_dataframe_time_span(df,span_start,span_end)
    if revenue_low is not None:
        df = _mask_dataframe_financial(df,'revenue',revenue_low)
    if budget_low is not None:
        df = _mask_dataframe_financial(df,'budget',budget_low)

    return df

In [8]:
def select_cols(df,cols):
    """Return the ``cols`` selection of ``df`` (plain ``df[cols]`` indexing)."""
    subset = df[cols]
    return subset

In [9]:
# Movie columns kept for the analysis; 'id' is the key the cast table is joined on.
movies_cols = ['id','vote_average','budget','revenue','genres','movie','release_year']

In [10]:
# Cast columns kept for the analysis; 'tmdb_id' is matched against the movies' 'id' in the join.
cast_cols = ['tmdb_id','name','popularity']

In [11]:
# Movies released 2010-2020 (inclusive) with a budget of at least 20000000; revenue is not filtered.
df_masked_movies = select_data(df_movies,2010,2020,budget_low=20000000)
# Trim both tables to the columns used downstream; the cast table is not row-filtered here.
df_masked_movies = select_cols(df_masked_movies,movies_cols)
df_masked_cast = select_cols(df_cast,cast_cols)

In [12]:
# Preview of the filtered movies.
# NOTE(review): the saved output lists 2022 releases although the selection ends at 2020 —
# it looks stale; re-run from a fresh kernel to confirm.
df_masked_movies

Unnamed: 0,id,vote_average,budget,revenue,genres,movie,release_year
0,12445,8.1,125000000.0,1.341511e+09,"['Fantasy', 'Adventure']",Harry Potter and the Deathly Hallows: Part 2,2011
1,49013,6.1,200000000.0,5.598524e+08,"['Animation', 'Family', 'Adventure', 'Comedy']",Cars 2,2011
2,50014,8.2,25000000.0,2.166391e+08,['Drama'],The Help,2011
3,1865,6.5,380000000.0,1.045714e+09,"['Adventure', 'Action', 'Fantasy']",Pirates of the Caribbean: On Stranger Tides,2011
4,39254,7.0,110000000.0,2.992685e+08,"['Action', 'Science Fiction', 'Drama']",Real Steel,2011
...,...,...,...,...,...,...,...
9736,744594,5.6,100000000.0,7.147600e+04,"['Comedy', 'Drama', 'Science Fiction']",White Noise,2022
9737,820446,7.2,40000000.0,8.800000e+07,"['Drama', 'Romance']",Downton Abbey: A New Era,2022
9756,245842,7.2,40500000.0,2.182492e+06,"['Fantasy', 'Adventure', 'Family']",The King's Daughter,2022
9764,614939,7.0,22000000.0,1.480000e+07,"['Comedy', 'Romance']",Bros,2022


# Join Movie and Cast Data

In [13]:
def join_movies_cast(df_masked_cast, df_masked_movies):
    """Inner-join cast rows to movie rows and return the combined frame.

    Cast ``tmdb_id`` is matched against movie ``id``; the redundant ``id``
    column is dropped from the result.
    """
    merged = df_masked_cast.merge(
        df_masked_movies,
        how='inner',
        left_on='tmdb_id',
        right_on='id',
    )
    return merged.drop(columns=['id'])
    

In [14]:
# (rows, columns) of the filtered movie table.
df_masked_movies.shape

(2433, 7)

In [15]:
# Attach movie attributes to each cast row; cast rows without a selected movie are dropped by the inner join.
df_cast_movies = join_movies_cast(df_masked_cast, df_masked_movies)

In [16]:
# Preview of the joined cast/movie rows.
# NOTE(review): the saved output contains a 2008 release (The Dark Knight) although the movie
# selection starts at 2010 — it looks stale; re-run from a fresh kernel to confirm.
df_cast_movies

Unnamed: 0,tmdb_id,name,popularity,vote_average,budget,revenue,genres,movie,release_year
0,1865,Johnny Depp,35.463,6.5,380000000.0,1.045714e+09,"['Adventure', 'Action', 'Fantasy']",Pirates of the Caribbean: On Stranger Tides,2011
1,1865,Penélope Cruz,27.880,6.5,380000000.0,1.045714e+09,"['Adventure', 'Action', 'Fantasy']",Pirates of the Caribbean: On Stranger Tides,2011
2,1865,Ian McShane,40.512,6.5,380000000.0,1.045714e+09,"['Adventure', 'Action', 'Fantasy']",Pirates of the Caribbean: On Stranger Tides,2011
3,155,Gary Oldman,46.498,8.5,185000000.0,1.004558e+09,"['Drama', 'Action', 'Crime', 'Thriller']",The Dark Knight,2008
4,155,Aaron Eckhart,26.468,8.5,185000000.0,1.004558e+09,"['Drama', 'Action', 'Crime', 'Thriller']",The Dark Knight,2008
...,...,...,...,...,...,...,...,...,...
5283,766475,Adrien Brody,29.138,6.5,40000000.0,2.192155e+07,"['Mystery', 'Comedy', 'Thriller']",See How They Run,2022
5284,744594,Adam Driver,27.188,5.6,100000000.0,7.147600e+04,"['Comedy', 'Drama', 'Science Fiction']",White Noise,2022
5285,245842,Pierce Brosnan,34.340,7.2,40500000.0,2.182492e+06,"['Fantasy', 'Adventure', 'Family']",The King's Daughter,2022
5286,245842,Kaya Scodelario,36.157,7.2,40500000.0,2.182492e+06,"['Fantasy', 'Adventure', 'Family']",The King's Daughter,2022


# Transform Cast Data

In [17]:
import itertools

In [18]:
# Collect each movie's cast names into one list per movie title.
# NOTE(review): grouping is by title, so distinct films that share a title would be
# merged into one cast list — confirm titles are unique in this selection.
df_nested = pd.DataFrame(df_cast_movies.groupby('movie')['name'].agg(list))
# Despite its name, this mask is True for movies with at least two cast members,
# i.e. the rows that are kept (a single name cannot form a pair).
solo_cast_mask = df_nested['name'].map(len) >= 2
# .copy() detaches the filtered frame so that adding a column to it later does not
# raise SettingWithCopyWarning.
df_nested = df_nested[solo_cast_mask].copy()

In [19]:
def calculate_combinations(cast):
    """Return every unordered pair of cast members.

    Parameters
    ----------
    cast : sequence of str
        Names of the cast members of one movie.

    Returns
    -------
    list of tuple
        One 2-tuple per pair, each tuple sorted alphabetically so the same
        two people always produce the same key regardless of input order.
        A cast of fewer than two names yields an empty list; previously the
        raw ``cast`` list was returned in that case, which mixed bare names
        into what callers treat as a list of pairs.
    """
    return [tuple(sorted(pair)) for pair in itertools.combinations(cast, 2)]


In [20]:
# For each movie, list every pair of its cast members.
df_nested['all_combinations'] = df_nested.name.apply(calculate_combinations)

In [21]:
# Flatten the per-movie pair lists into a single list of pairs.
flattened_combination_tuples = [
    pair
    for pairs_in_film in df_nested.all_combinations.values
    for pair in pairs_in_film
]

In [22]:
# Tally how often each pair occurs across all movies.
combinations_dict = {}
for pair in flattened_combination_tuples:
    combinations_dict[pair] = combinations_dict.get(pair, 0) + 1
    

In [31]:
# Two-column frame: column 0 holds the pair tuple, column 1 its count.
df_combination_counts = pd.DataFrame(combinations_dict.items())

In [32]:
# Edge list for the graph: one row per pair with its co-appearance count.
# Built directly from combinations_dict so the cell can be re-run safely; the
# earlier version read column 0 of the frame it then overwrote, so a second run
# failed once the columns had been renamed. This also avoids the slow
# row-by-row apply(pd.Series).
df_combination_counts = pd.DataFrame(
    [(source, target, weight) for (source, target), weight in combinations_dict.items()],
    columns=['source','target','weight'],
)

In [33]:
# Most frequent pairings first, then report (rows, columns) of the edge list.
df_combination_counts = df_combination_counts.sort_values(by='weight',ascending=False)
df_combination_counts.shape

(5733, 3)

In [34]:
# Preview of the weighted edge list (source, target, weight).
df_combination_counts

Unnamed: 0,source,target,weight
62,Adam Sandler,Jackie Sandler,13
2300,Emma Watson,Rupert Grint,8
2299,Daniel Radcliffe,Emma Watson,8
400,Chris Evans,Scarlett Johansson,8
2298,Daniel Radcliffe,Rupert Grint,8
...,...,...,...
2218,Michelle Yeoh,Zoe Saldaña,1
2217,Sylvester Stallone,Zoe Saldaña,1
2216,Kurt Russell,Zoe Saldaña,1
2214,Chris Pratt,Michelle Yeoh,1


In [35]:
# Spot check: pairings whose `source` name contains 'Margo'.
# NOTE(review): only `source` is searched; pairs are stored alphabetically sorted, so pairings
# where this name sorts second sit in `target` and are not shown here.
df_combination_counts[df_combination_counts.source.str.contains('Margo')]

Unnamed: 0,source,target,weight
3481,Margot Robbie,Sam Neill,2
4157,Margot Robbie,Viola Davis,2
3358,Margot Robbie,Sydney Sweeney,1
3357,Margot Robbie,Victoria Pedretti,1
4765,Margot Robbie,Samuel L. Jackson,1
5140,Margot Robbie,Matthew McConaughey,1
5049,Margot Robbie,Sylvester Stallone,1
5050,Margot Robbie,Nathan Fillion,1
4315,Margot Robbie,Marisa Tomei,1
4312,Margot Robbie,Ryan Gosling,1


In [36]:
# Number of edges (distinct co-stars) per actor, keyed by actor name.
# Both endpoints are counted: pairs are stored alphabetically sorted, so counting
# `source` alone under-counts, or misses entirely, actors whose name sorts second.
edge_frequencies = (
    pd.concat([df_combination_counts['source'], df_combination_counts['target']])
    .value_counts()
    .to_dict()
)

In [37]:
# Build the interactive co-star network from the weighted edge list.
d3 = D3Blocks()
d3.d3graph(df_combination_counts,title='dsfsdf')  # TODO: replace the placeholder title
d3.D3graph.set_edge_properties(directed=False,edge_distance=1500,marker_color ='#f2f2f2')
d3.D3graph.set_node_properties(color='cluster',size=10,edge_size=0.2,fontsize=14)

# Scale each node by its edge count. Node keys are looked up with spaces replaced
# by underscores; names without a matching key keep the default size and are
# reported instead of being silently skipped.
unmatched_names = []
for k,v in edge_frequencies.items():
    m = k.replace(" ","_")
    try:
        d3.D3graph.node_properties[m]['size']=v/1.25
    except KeyError:
        unmatched_names.append(k)
if unmatched_names:
    print(f'{len(unmatched_names)} node(s) kept the default size (no matching node key), e.g. {unmatched_names[:10]}')

# NOTE(review): absolute, user-specific output path — consider a configurable output directory.
d3.D3graph.show(filepath='/Users/saho/Documents/sam/imdb_actor_graph/experimentl_html/test.html')

[d3blocks] >INFO> Cleaning edge_properties and config parameters..
[d3blocks] >INFO> Set directed=True to see the markers!
[d3blocks] >INFO> Keep only edges with weight>0
[d3blocks] >INFO> Number of unique nodes: 441
[d3blocks] >INFO> Slider range is set to [0, 13]
[d3blocks] >INFO> Write to path: [/var/folders/l9/6wvd07bx0t59jfdht0vhh5h40000gq/T/tmpxe2akc6i/d3graph.html]
[d3blocks] >INFO> File already exists and will be overwritten: [/var/folders/l9/6wvd07bx0t59jfdht0vhh5h40000gq/T/tmpxe2akc6i/d3graph.html]
[d3blocks] >INFO> Set directed=True to see the markers!
[d3blocks] >INFO> Keep only edges with weight>0
[d3blocks] >INFO> Number of unique nodes: 441
[d3blocks] >INFO> Slider range is set to [0, 13]
[d3blocks] >INFO> Write to path: [/Users/saho/Documents/sam/imdb_actor_graph/experimentl_html/test.html]
[d3blocks] >INFO> File already exists and will be overwritten: [/Users/saho/Documents/sam/imdb_actor_graph/experimentl_html/test.html]
