In [14]:
import pandas as pd  # data wrangling
from pyvis.network import Network  # drawing graph 

# Avengers Endgame Social Network Analysis

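A notebook that builds and draws a social network graph of the actors in Avengers: Endgame, where each node is an actor and each edge links two actors who have appeared in a film together.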

In [15]:
df = pd.read_csv('data.csv')  # read in data (presumably one row per actor per film -- confirm against data.csv)
# join the data to itself on FilmID to get a table of all actors connected to each other through a film
df = pd.merge(df, df, 'inner', on='FilmID')

# drop rows that pair an actor with themselves, keeping only the columns used downstream
# (vectorised comparison instead of a row-wise apply and a scratch 'duplicate' column)
df = df.loc[df['ActorID_x'] != df['ActorID_y'], ['Actor_x', 'Actor_y', 'Film_x', 'Rating_x']]

# drop Endgame and Infinity War to avoid crowding the network with edges
df = df[~df['Film_x'].isin(['Avengers: Endgame', 'Avengers: Infinity War'])]

# rename columns; rename() returns a new frame, so nothing is modified in place on a filtered slice
df = df.rename(columns={
    'Rating_x': 'Rating',
    'Film_x': 'Film'
})

## Nodes 

The nodes are the unique actors: each actor is drawn once, and the edges are connected to these nodes.

In [3]:
# add features for dimensions
# group_x: non-null row counts per actor (actor on the left side of each pair)
group_x = df.groupby('Actor_x').count().reset_index()
# group_y: per-actor averages of the numeric columns (e.g. Rating).
# numeric_only=True is required because df also holds string columns (Actor_x, Film);
# pandas >= 2.0 raises a TypeError on them instead of silently dropping them.
group_y = df.groupby('Actor_y').mean(numeric_only=True).reset_index()
group_y

Unnamed: 0,Actor_y,index,Rating,appearances
0,Angela Bassett,7335.454545,6.618182,47.272727
1,Anthony Mackie,7165.676056,7.243662,55.591549
2,Benedict Cumberbatch,9096.962963,7.303704,38.925926
3,Benedict Wong,8896.733333,7.053333,33.866667
4,Bradley Cooper,8538.480000,7.320000,35.600000
...,...,...,...,...
59,Vin Diesel,8321.368421,7.489474,43.473684
60,William Hurt,7449.913043,7.508696,64.173913
61,Winston Duke,7742.142857,7.314286,37.857143
62,Yvette Nicole Brown,8697.666667,6.350000,44.333333


In [4]:
# build node store: every actor from the x column first, followed by any
# actor that only shows up in the y column (first-seen order, no repeats)
x_actors = group_x['Actor_x'].tolist()
already_listed = set(x_actors)
y_only_actors = [
    actor
    for actor in dict.fromkeys(group_y['Actor_y'].tolist())  # de-duplicate, keep order
    if actor not in already_listed
]
nodes = x_actors + y_only_actors

nodes[:5]

['Angela Bassett',
 'Anthony Mackie',
 'Benedict Cumberbatch',
 'Benedict Wong',
 'Bradley Cooper',
 'Brie Larson',
 'Callan Mulvey',
 'Carrie Coon',
 'Chadwick Boseman',
 'Chris Evans',
 'Chris Hemsworth',
 'Chris Pratt',
 'Cobie Smulders',
 'Danai Gurira',
 'Dave Bautista',
 'Don Cheadle',
 'Elizabeth Olsen',
 'Evangeline Lilly',
 'Frank Grillo',
 'Gwyneth Paltrow',
 'Hayley Atwell',
 'Hiroyuki Sanada',
 'Jacob Batalon',
 "James D'Arcy",
 'Jeremy Renner',
 'Joe Russo',
 'John Slattery',
 'Jon Favreau',
 'Josh Brolin',
 'Karen Gillan',
 'Ken Jeong',
 'Kerry Condon',
 'Letitia Wright',
 'Linda Cardellini',
 'Marisa Tomei',
 'Mark Ruffalo',
 'Michael Douglas',
 'Michelle Pfeiffer',
 'Monique Ganderton',
 'Natalie Portman',
 'Patrick Gorman',
 'Paul Rudd',
 'Pom Klementieff',
 'Rene Russo',
 'Robert Downey Jr.',
 'Robert Redford',
 'Sam Hargrave',
 'Samuel L. Jackson',
 'Scarlett Johansson',
 'Sean Gunn',
 'Sebastian Stan',
 'Stan Lee',
 'Taika Waititi',
 'Terry Notary',
 'Tessa Thompson

In [5]:
# build node dict: one record per actor, tagged with its 1-based position
nodes = [
    {'Actor': actor, 'Index': position}
    for position, actor in enumerate(nodes, start=1)
]
nodes[:5]

[{'Actor': 'Angela Bassett', 'Index': 1},
 {'Actor': 'Anthony Mackie', 'Index': 2},
 {'Actor': 'Benedict Cumberbatch', 'Index': 3},
 {'Actor': 'Benedict Wong', 'Index': 4},
 {'Actor': 'Bradley Cooper', 'Index': 5}]

In [6]:
# convert to df and add measure fields, selecting only the fields needed
node_store = pd.DataFrame(nodes)

# Name the measures explicitly instead of relying on merge suffixes.
# The previous selection of 'appearances_x' only worked when df carried an
# 'appearances' column, which the load cell does not produce (KeyError on a fresh run).
# appearances_x: rows per actor in the x column (count of Film; assumes Film is never null -- confirm)
appearances = group_x[['Actor_x', 'Film']].rename(
    columns={'Actor_x': 'Actor', 'Film': 'appearances_x'})
# Rating_y: average rating over the rows where the actor is in the y column
avg_rating = group_y[['Actor_y', 'Rating']].rename(
    columns={'Actor_y': 'Actor', 'Rating': 'Rating_y'})

node_store = (
    node_store
    .merge(avg_rating, how='left', on='Actor')
    .merge(appearances, how='left', on='Actor')
)
node_store = node_store[['Actor', 'Index', 'Rating_y', 'appearances_x']]
node_store

Unnamed: 0,Actor,Index,Rating_y,appearances_x
0,Angela Bassett,1,6.618182,11
1,Anthony Mackie,2,7.243662,71
2,Benedict Cumberbatch,3,7.303704,27
3,Benedict Wong,4,7.053333,15
4,Bradley Cooper,5,7.320000,25
...,...,...,...,...
59,Vin Diesel,60,7.489474,19
60,William Hurt,61,7.508696,23
61,Winston Duke,62,7.314286,7
62,Yvette Nicole Brown,63,6.350000,6


In [7]:
def rating_grouper(rat: float) -> str:
    """
    Map an average rating to a hex code for a shade of orange.

    Ratings above 7.5 get the darkest shade, ratings above 5.0 (up to and
    including 7.5) get the middle shade, and anything else -- including NaN,
    which fails both comparisons -- gets the lightest shade.
    """
    if rat > 7.5:
        return '#FF6200'
    # reaching here means rat is not above 7.5, so only the lower bound needs checking
    if rat > 5.0:
        return '#FD9346'
    return '#FDB777'

# convert each actor's average rating to a hex color
node_store['color'] = node_store['Rating_y'].map(rating_grouper)

net = Network(notebook=True, bgcolor='#3A3B3C', font_color='white')  # set up a Network with a font color of white and gray background

# min-max scale the appearance counts: (x - min) / (max - min).
# The earlier expression lacked parentheses and evaluated x - (min / max(x - min)),
# which is just x shifted by a constant rather than a scaled value.
appearances = node_store['appearances_x']
fewest_appearances = appearances.min()
appearance_range = appearances.max() - fewest_appearances
if appearance_range:
    node_values = (appearances - fewest_appearances) / appearance_range
else:
    # every actor has the same count: give all nodes one size instead of dividing by zero
    node_values = pd.Series(1.0, index=appearances.index)

# add nodes by walking the columns in parallel (positional, so it does not depend on index labels)
for actor, node_value, shade in zip(node_store['Actor'], node_values, node_store['color']):
    net.add_node(actor,                    # node id is the actor name
                 label=actor,              # label is also the actor name
                 value=float(node_value),  # plain float so the value serialises cleanly
                 color=shade)              # hex color from the data frame

## Edges 

The logic to collect edge measures and draw the edges.

In [8]:
# zip all actors together as relationships
all_edges = list(zip(df['Actor_x'].tolist(), df['Actor_y'].tolist(), df['Film'].tolist()))

# Keep one edge per unordered actor pair and film. The self-join produces both
# (Actor_X, Actor_Y) and (Actor_Y, Actor_X) for every shared film; tracking seen keys
# in a set makes each lookup O(1) and removes the need for a hard-coded seed edge,
# which would inject a spurious edge for any data set that does not contain it.
seen_keys = set()
pair_direction = {}  # unordered pair -> (x, y) order of its first occurrence
edges = []
for actor_x, actor_y, film in all_edges:
    pair = frozenset((actor_x, actor_y))
    if (pair, film) in seen_keys:
        continue
    seen_keys.add((pair, film))
    # reuse the first-seen direction so the same pair always lands under one
    # (Actor_x, Actor_y) key when the edges are later grouped and counted
    first_x, first_y = pair_direction.setdefault(pair, (actor_x, actor_y))
    edges.append([first_x, first_y, film])

edges[:5]

[['Michael Douglas', 'Michelle Pfeiffer', 'Ant-Man and the Wasp'],
 ['Michael Douglas', 'Stan Lee', 'Ant-Man and the Wasp'],
 ['Michael Douglas', 'Paul Rudd', 'Ant-Man and the Wasp'],
 ['Michael Douglas', 'Evangeline Lilly', 'Ant-Man and the Wasp'],
 ['Michelle Pfeiffer', 'Stan Lee', 'Ant-Man and the Wasp']]

In [9]:
# turn each [actor_x, actor_y, film] triple into a record keyed by column name
edges = [
    {'Actor_x': actor_x, 'Actor_y': actor_y, 'Film': film}
    for actor_x, actor_y, film in edges
]

In [10]:
# convert edge store to dataframe and merge with measures
edge_store = pd.DataFrame(edges)
# after the count, 'Film' holds the number of films recorded for each (Actor_x, Actor_y) pair
edge_store = edge_store.groupby(['Actor_x', 'Actor_y']).count().reset_index()
# numeric_only=True: df also holds the string column Film, and pandas >= 2.0 raises a
# TypeError on non-numeric columns in mean() instead of silently dropping them
av_ratings = df.groupby(['Actor_x', 'Actor_y']).mean(numeric_only=True).reset_index()

edge_store = edge_store.merge(av_ratings, how='left', on=['Actor_x', 'Actor_y'])
edge_store = edge_store[['Actor_x', 'Actor_y', 'Rating', 'Film']]

# only the actor name and its index are needed from node_store; restricting the columns
# keeps other node fields (e.g. a color column) from leaking into the edge table
node_index = node_store[['Actor', 'Index']]

# index of the actor on the x side
edge_store = edge_store.merge(node_index, how='inner', left_on='Actor_x', right_on='Actor')
edge_store = edge_store[['Actor_x', 'Actor_y', 'Rating', 'Film', 'Index']]

# index of the actor on the y side; the clashing 'Index' columns become Index_x / Index_y
edge_store = edge_store.merge(node_index, how='inner', left_on='Actor_y', right_on='Actor')
edge_store = edge_store[['Actor_x', 'Actor_y', 'Rating', 'Film', 'Index_x', 'Index_y']]
edge_store

Unnamed: 0,Actor_x,Actor_y,Rating,Film,Index_x,Index_y
0,Angela Bassett,Anthony Mackie,6.700000,1,1,2
1,Callan Mulvey,Anthony Mackie,7.700000,1,7,2
2,Chris Evans,Anthony Mackie,7.033333,6,10,2
3,Chris Pratt,Anthony Mackie,6.100000,2,12,2
4,Don Cheadle,Anthony Mackie,7.550000,2,16,2
...,...,...,...,...,...,...
566,Natalie Portman,Rene Russo,6.950000,2,40,44
567,Samuel L. Jackson,Rene Russo,7.000000,1,48,44
568,Robert Redford,Patrick Gorman,7.400000,1,46,41
569,Samuel L. Jackson,Angela Bassett,5.600000,2,48,1


In [11]:
# apply the rating-to-color mapping again, this time to the edge ratings
edge_store['color'] = edge_store['Rating'].map(rating_grouper)

# min-max scale the film counts: (x - min) / (max - min).
# The earlier expression lacked parentheses and evaluated x - (min / max(x - min)),
# which is just x shifted by a constant rather than a scaled weight.
film_counts = edge_store['Film']
fewest_films = film_counts.min()
film_range = film_counts.max() - fewest_films
if film_range:
    edge_values = (film_counts - fewest_films) / film_range
else:
    # every pair has the same count: give all edges one weight instead of dividing by zero
    edge_values = pd.Series(1.0, index=film_counts.index)

# add edges by walking the columns in parallel (positional, so it does not depend on index labels)
for actor_from, actor_to, edge_value, shade in zip(
        edge_store['Actor_x'], edge_store['Actor_y'], edge_values, edge_store['color']):
    net.add_edge(actor_from,               # actor to draw from
                 actor_to,                 # actor to draw to
                 value=float(edge_value),  # plain float so the value serialises cleanly
                 color=shade)              # hex code color from the data frame

## Network Graph

In [12]:
# Configure the layout and render the graph built in the cells above.
net.toggle_physics(True) # turn the physics option on: edges spring back and the network moves when nodes are dragged
net.force_atlas_2based() # lay the network out with the force atlas 2 based algorithm
net.show('net.html') # display the network and save it as net.html

And that's all there is to it! For a full interpretation of the network visual produced please check out my Medium page.