In [6]:
import numpy as np
import pandas as pd
 

from IPython.display import Image
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from PIL import Image

import networkx as nx

from sklearn.preprocessing import MinMaxScaler
import random
import pickle

TEMPLATE = 'simple_white'

node_df = pd.read_csv("edges.csv")
edge_df = pd.read_csv("nodes.csv")
hero_net_df = pd.read_csv("hero-network.csv")


In [7]:
temp = pd.DataFrame({'edges.csv':sorted([h for h in edge_df['hero'].unique() if 'SPIDER' in h]),
                     'hero-network.csv':sorted([h for h in hero_net_df['hero1'].unique() if 'SPIDER' in h])})

display(temp)

for c in ['hero1', 'hero2']:
    print(f"{c} name max in hero-network.csv : {max(hero_net_df[c].apply(lambda x : len(x)))}")
    

# Name Preprocessing
## only use len 20 & only use left string bas on "/"
for c in ['hero1', 'hero2']:
    hero_net_df[c] = hero_net_df[c].apply(lambda x : x[:20].split("/")[0])
edge_df['hero'] = edge_df['hero'].apply(lambda x : x[:20].split("/")[0])

KeyError: 'hero'

In [None]:
print("SPIDER & HULK in hero-network.csv")
print(f"hero1=SPIDER-MAN, hero2=HULK : {len(hero_net_df[(hero_net_df['hero1']=='SPIDER-MAN')&(hero_net_df['hero2']=='HULK')])}")
print(f"hero1=HULK, hero2=SPIDER-MAN : {len(hero_net_df[(hero_net_df['hero2']=='SPIDER-MAN')&(hero_net_df['hero1']=='HULK')])}")

temp1 = set(edge_df[edge_df['hero']=='SPIDER-MAN']['comic'])
temp2 = set(edge_df[edge_df['hero']=='HULK']['comic'])
print(f"Intersection in edges.csv : {len(temp1.intersection(temp2))}")

In [None]:
topn = 25
topn_hero = edge_df.groupby(['hero'])[['comic']].count().sort_values(by=['comic'], ascending=False).head(topn).index

h1_ = []; h2_ = []; cnt_ = [];
for comb in list(combinations(topn_hero, 2)):    
    temp1 = set(edge_df[edge_df['hero']==comb[0]]['comic'])
    temp2 = set(edge_df[edge_df['hero']==comb[1]]['comic'])
    cnt = len(temp1.intersection(temp2)) # Appear Together    
    h1_.append(comb[0]); h2_.append(comb[1]); cnt_.append(cnt);
appto_df = pd.DataFrame({'H1':h1_, 'H2':h2_, 'CNT':cnt_})

display(appto_df.head())

In [None]:
HERO_COLOR = {
    'CAPTAIN AMERICA':'darkblue',
    'IRON MAN':'gold',
    'SPIDER-MAN':'darkred',
    'HULK':'forestgreen',
    'THOR':'lightblue',
    'DR. STRANGE':'purple'
}

# Make network
## Initialize graph
## - https://towardsdatascience.com/tutorial-network-visualization-basics-with-networkx-and-plotly-and-a-little-nlp-57c9bbb55bb9
marvel_net = nx.Graph() 
for i, row in appto_df.iterrows():
    marvel_net.add_edge(row['H1'], row['H2'], weight=row['CNT'])  # specify edge data

## - Inference
# aspl = nx.average_shortest_path_length(marvel_net) # no weight
# adgr = sum(dict(marvel_net.degree()).values())/float(len(marvel_net)) # no weight
        
# Visualization
## Get positions for the nodes in network
# pos_ = nx.kamada_kawai_layout(marvel_net) # sample layout : spring_layout ...
pos_ = nx.spring_layout(marvel_net, seed=11)
cent_ = nx.pagerank(marvel_net, weight='weight') # page rank
cent_top = sorted(cent_.items(), key=lambda item: item[1], reverse=True)[:1] # page rank top 1

## Custom function to create an edge between node x and node y, with a given text and width
def make_edge(x, y, text, width):
    return  go.Scatter(x=x, y=y, line=dict(width=width, color='lightgray'), hoverinfo='text', text=([text]), mode='lines')

## For each edge, make an edge_trace, append to list
edge_trace = []
for edge in marvel_net.edges():    
    if marvel_net.edges()[edge]['weight'] > 0:
        char_1 = edge[0]
        char_2 = edge[1]
        x0, y0 = pos_[char_1]
        x1, y1 = pos_[char_2]
        trace  = make_edge([x0, x1, None], [y0, y1, None], None, width=5*(marvel_net.edges()[edge]['weight']/appto_df['CNT'].max()))
        edge_trace.append(trace)
                
## Make a node trace
node_trace = go.Scatter(x=[], y=[], text=[], textposition="top center", textfont_size=10, mode='markers+text', hoverinfo='none',
                        marker=dict(color=[], size=[], line_width=[], line_color=[]))

## For each node in network, get the position and size and add to the node_trace
for node in marvel_net.nodes():
    x, y = pos_[node]
    node_trace['x'] += tuple([x])
    node_trace['y'] += tuple([y])
    color = 'gray'
    line_width = 2
    line_color = 'darkgray'
    name_text = node
    
    if node in HERO_COLOR:
        color = HERO_COLOR[node]; line_color='black';
        
    if node in [v[0] for v in cent_top]:
        name_text = '<b>' + node + '</b>'
        
    node_trace['marker']['color'] += tuple([color])
    node_trace['marker']['size'] += tuple([int(400*cent_[node])]) # node size is proportional to page rank
    node_trace['marker']['line_width'] += tuple([line_width])
    node_trace['marker']['line_color'] += tuple([line_color])
    node_trace['text'] += tuple([name_text])
    
    
## Customize layout
layout = go.Layout(
    paper_bgcolor='rgba(0,0,0,0)', # transparent background
    plot_bgcolor='rgba(0,0,0,0)', # transparent 2nd background
    xaxis =  {'showgrid': False, 'zeroline': False}, # no gridlines
    yaxis = {'showgrid': False, 'zeroline': False}, # no gridlines
)

## Create figure
fig = go.Figure(layout = layout)
## Add all edge traces
for trace in edge_trace:
    fig.add_trace(trace)
fig.add_trace(node_trace)
fig.update_layout(showlegend = False)
fig.update_xaxes(showticklabels = False)
fig.update_yaxes(showticklabels = False)
fig.update_layout(title=f"<b>Top {topn} Heroes Network</b>")
fig.show()

In [None]:
print(f"The number of hero pairs that never came out together : {len(appto_df[appto_df['CNT']==0])}")

cent_df = pd.DataFrame(index=list(marvel_net.nodes()))

# pagerank
cent_ = nx.pagerank(marvel_net, weight='weight')
cent_df['w_pagerank_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

# eigenvalue centrality
cent_ = nx.eigenvector_centrality(marvel_net, weight='weight')
cent_df['w_eigenvector_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

# degree centrality
cent_ = {h:0.0 for h in marvel_net.nodes()}
for u, v, d in marvel_net.edges(data=True):
    cent_[u]+=d['weight']; cent_[v]+=d['weight'];
cent_df['w_degree_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

# closeness centrality
temp_net = marvel_net.copy()
for u,v,d in temp_net.edges(data=True):
    if 'distance' not in d:
        d['distance'] = 1.0/d['weight']
cent_ = nx.closeness_centrality(temp_net, distance='distance')
cent_df['w_closeness_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

# betweenness centrality
cent_ = nx.betweenness_centrality(marvel_net, weight='weight')
cent_df['w_betweenness_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

display(cent_df)
cent_df = cent_df.drop(columns=['w_betweenness_cent'])

In [None]:
# Scaling
for c in cent_df.columns:
    s = MinMaxScaler()
    cent_df[[c]] = s.fit_transform(cent_df[[c]])  
cent_df['mean_cent'] = cent_df.mean(axis=1)
cent_df = cent_df.sort_values(by=['mean_cent'], ascending=False)

# Visualization
fig = go.Figure(data=[go.Bar(
    x=cent_df.index,
    y=cent_df['mean_cent'],
    marker_color=[HERO_COLOR['CAPTAIN AMERICA']]+['lightgray']*2+\
                 [HERO_COLOR['IRON MAN']]+['lightgray']*5+\
                 [HERO_COLOR['THOR']]+['lightgray']*2+\
                 [HERO_COLOR['SPIDER-MAN']]+['lightgray']*3+\
                 [HERO_COLOR['HULK']]+['lightgray']*5+\
                 [HERO_COLOR['DR. STRANGE']]+['lightgray']*2
)])
fig.update_layout(title_text='<b>Mean Centrality of Heros</b>', template=TEMPLATE)

In [None]:
def show_network(input_, title=""):
    input_net = input_.copy()
    
    HERO_COLOR = {
        'CAPTAIN AMERICA':'darkblue',
        'IRON MAN':'gold',
        'SPIDER-MAN':'darkred',
        'HULK':'forestgreen',
        'THOR':'lightblue',
        'DR. STRANGE':'purple'
    }

    # Visualization
    ## Get positions for the nodes in network
    pos_ = nx.spring_layout(input_net, seed=11)
    cent_ = nx.pagerank(input_net, weight='weight') # page rank
    cent_top = sorted(cent_.items(), key=lambda item: item[1], reverse=True)[:1] # page rank top 1

    ## Custom function to create an edge between node x and node y, with a given text and width
    def make_edge(x, y, text, width):
        return  go.Scatter(x=x, y=y, line=dict(width=width, color='lightgray'), hoverinfo='text', text=([text]), mode='lines')

    ## For each edge, make an edge_trace, append to list
    edge_trace = []
    for edge in input_net.edges():    
        if input_net.edges()[edge]['weight'] > 0:
            char_1 = edge[0]
            char_2 = edge[1]
            x0, y0 = pos_[char_1]
            x1, y1 = pos_[char_2]
            trace  = make_edge([x0, x1, None], [y0, y1, None], None, width=5*(input_net.edges()[edge]['weight']/appto_df['CNT'].max()))
            edge_trace.append(trace)

    ## Make a node trace
    node_trace = go.Scatter(x=[], y=[], text=[], textposition="top center", textfont_size=10, mode='markers+text', hoverinfo='none',
                            marker=dict(color=[], size=[], line_width=[], line_color=[]))

    ## For each node in network, get the position and size and add to the node_trace
    for node in input_net.nodes():
        x, y = pos_[node]
        node_trace['x'] += tuple([x])
        node_trace['y'] += tuple([y])
        color = 'gray'
        line_width = 2
        line_color = 'darkgray'
        name_text = ''

        if node in HERO_COLOR:
            color = HERO_COLOR[node]; line_color='black';
            name_text = node

        node_trace['marker']['color'] += tuple([color])
        node_trace['marker']['size'] += tuple([int(400*cent_[node])]) # node size is proportional to page rank
        node_trace['marker']['line_width'] += tuple([line_width])
        node_trace['marker']['line_color'] += tuple([line_color])
        node_trace['text'] += tuple([name_text])


    ## Customize layout
    layout = go.Layout(
        paper_bgcolor='rgba(0,0,0,0)', # transparent background
        plot_bgcolor='rgba(0,0,0,0)', # transparent 2nd background
        xaxis =  {'showgrid': False, 'zeroline': False}, # no gridlines
        yaxis = {'showgrid': False, 'zeroline': False}, # no gridlines
    )

    ## Create figure
    fig = go.Figure(layout = layout)
    ## Add all edge traces
    for trace in edge_trace:
        fig.add_trace(trace)
    fig.add_trace(node_trace)
    fig.update_layout(showlegend = False)
    fig.update_xaxes(showticklabels = False)
    fig.update_yaxes(showticklabels = False)
    fig.update_layout(title=title)
    fig.show()

In [None]:
# prepare test data
with open("/kaggle/input/marvel-net/all_appto.pickle", "rb") as fr:
    all_appto = pickle.load(fr)
with open("/kaggle/input/marvel-net/all_net.pickle", "rb") as fr:
    all_net = pickle.load(fr)

# # long time
# all_appto = {}
# all_net = {}
# for topn in [25, 50, 100, 200, 500]:
#     top_hero = edge_df.groupby(['hero'])[['comic']].count().sort_values(by=['comic'], ascending=False).head(topn).index

#     h1_ = []; h2_ = []; cnt_ = [];
#     for comb in list(combinations(top_hero, 2)):    
#         temp1 = set(edge_df[edge_df['hero']==comb[0]]['comic'])
#         temp2 = set(edge_df[edge_df['hero']==comb[1]]['comic'])
#         cnt = len(temp1.intersection(temp2)) # Appear Together    
#         h1_.append(comb[0]); h2_.append(comb[1]); cnt_.append(cnt);
#     appto_df = pd.DataFrame({'H1':h1_, 'H2':h2_, 'CNT':cnt_})
#     all_appto[topn] = appto_df.copy()

    
#     marvel_net = nx.Graph() 
#     for i, row in appto_df.iterrows():
#         if row['CNT'] > 0:
#             marvel_net.add_edge(row['H1'], row['H2'], weight=row['CNT'])
#     all_net[topn] = marvel_net.copy()
    
# with open("all_appto.pickle", "wb") as fw:
#     pickle.dump(all_appto, fw)
# with open("all_net.pickle", "wb") as fw:
#     pickle.dump(all_net, fw)

In [None]:
# Too Slow
# for topn in [25, 50, 100, 200, 500]:
#     show_network(all_net[topn], title=f"<b>Top {topn} Heroes Network</b>")
    
for topn in [25, 50, 100]:
    show_network(all_net[topn], title=f"<b>Top {topn} Heroes Network</b>")

In [None]:
def get_cent(input_net):
    marvel_net = input_net.copy()
    cent_df = pd.DataFrame(index=list(marvel_net.nodes()))

    # pagerank
    cent_ = nx.pagerank(marvel_net, weight='weight')
    cent_df['w_pagerank_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

    # eigenvalue centrality
    cent_ = nx.eigenvector_centrality(marvel_net, weight='weight')
    cent_df['w_eigenvector_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

    # degree centrality
    cent_ = {h:0.0 for h in marvel_net.nodes()}
    for u, v, d in marvel_net.edges(data=True):
        cent_[u]+=d['weight']; cent_[v]+=d['weight'];
    cent_df['w_degree_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

    # closeness centrality
    temp_net = marvel_net.copy()
    for u,v,d in temp_net.edges(data=True):
        if 'distance' not in d:
            if d['weight'] != 0:
                d['distance'] = 1.0/d['weight']
            else:
                d['distance'] = 2
    cent_ = nx.closeness_centrality(temp_net, distance='distance')
    cent_df['w_closeness_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])

#     # betweenness centrality
#     cent_ = nx.betweenness_centrality(marvel_net, weight='weight')
#     cent_df['w_betweenness_cent'] = pd.Series(index=[k for k, v in cent_.items()], data=[float(v) for k, v in cent_.items()])
    # Scaling
    for c in cent_df.columns:
        s = MinMaxScaler()
        cent_df[[c]] = s.fit_transform(cent_df[[c]])  
    cent_df['mean_cent'] = cent_df.mean(axis=1)
    cent_df = cent_df.sort_values(by=['mean_cent'], ascending=False)

    return cent_df

# prepare Data
mean_cent_df = pd.DataFrame(index=HERO_COLOR.keys())
for topn in [25, 50, 100, 200, 500]:
    mean_cent_df[f"mean_cent_{topn}"] = get_cent(all_net[topn]).loc[HERO_COLOR.keys(), :]['mean_cent']
    
# visualization
fig = go.Figure()
for c in mean_cent_df.T.columns:
    temp_ = mean_cent_df.T[c]
    fig.add_trace(go.Scatter(x=[f"In Top {n} Heroes" for n in [25, 50, 100, 200, 500]], y=temp_, mode='lines+markers', name=c,
                             line=dict(color=HERO_COLOR[c])
                            ))

fig.update_layout(title_text='<b>Mean Centrality</b>', template=TEMPLATE)
fig.show()

In [None]:
# prepare data
ncr_ = []; spl_ = []; dia_ = []; rad_ = [];
for n in [25, 50, 100, 200, 500]:
    test_appto = all_appto[n].copy()
    test_net = all_net[n].copy()

    ncr_.append(len(test_appto[test_appto['CNT']==0])/len(test_appto))
    spl_.append(nx.average_shortest_path_length(test_net))
    dia_.append(nx.diameter(test_net))
    rad_.append(nx.radius(test_net))
ch_df = pd.DataFrame({'Heroes':[25, 50, 100, 200, 500], 'Non-Connection Ratio':ncr_, 'Avg Shortest Path Length':spl_, 'Diameter':dia_, 'Radius':rad_})

# visualization
fig = make_subplots(specs=[[{"secondary_y": True}]])
for c in ['Non-Connection Ratio', 'Avg Shortest Path Length', 'Diameter', 'Radius']:
    temp_ = ch_df[c]
    if c == 'Non-Connection Ratio':
        fig.add_trace(go.Scatter(x=[f"In Top {n} Heroes" for n in [25, 50, 100, 200, 500]], y=temp_, name=c, line=dict(color='darkgray')),
                      secondary_y=True)
    else:
        fig.add_trace(go.Bar(x=[f"In Top {n} Heroes" for n in [25, 50, 100, 200, 500]], y=temp_, name=c
                                ))

fig.update_layout(barmode='group')
fig.update_yaxes(title_text="length", secondary_y=False)
fig.update_yaxes(title_text="ratio", secondary_y=True)
fig.update_layout(title_text='<b>Changes in Network Characteristics according to the number of heroes</b>', template=TEMPLATE)
fig.show()

In [None]:
# prepare data
min_node_ = []; r_ = [];
for n in [25, 50, 100, 200, 500]:
    test_appto = all_appto[n].copy()
    test_net = all_net[n].copy()

    cnt = len(nx.minimum_node_cut(test_net))
    min_node_.append(cnt)
    r_.append(cnt/n)
ch_df = pd.DataFrame({'Heroes':[25, 50, 100, 200, 500], 'Number of Nodes':min_node_, 'Cut Node Ratio':r_})

# visualization
fig = make_subplots(specs=[[{"secondary_y": True}]])
for c in ['Number of Nodes', 'Cut Node Ratio']:
    temp_ = ch_df[c]
    if c == 'Cut Node Ratio':
        fig.add_trace(go.Scatter(x=[f"In Top {n} Heroes" for n in [25, 50, 100, 200, 500]], y=temp_, name=c, line=dict(color='darkgray')),
                      secondary_y=True)
    else:
        fig.add_trace(go.Bar(x=[f"In Top {n} Heroes" for n in [25, 50, 100, 200, 500]], y=temp_, name=c
                                ))

fig.update_layout(barmode='group')
fig.update_yaxes(title_text="count", secondary_y=False)
fig.update_yaxes(title_text="ratio", secondary_y=True)
fig.update_layout(title_text="<b>Changes in Network's Robustness according to the number of heroes</b>", template=TEMPLATE)
fig.show()

test_appto = all_appto[500].copy()
print(f"Unconnected proportion of 500 hero pairs : {len(test_appto[test_appto['CNT']==0]) / len(test_appto):.3f}")

In [None]:
topn = 100
top_hero = edge_df.groupby(['hero'])[['comic']].count().sort_values(by=['comic'], ascending=False).head(topn).index
top_hero = [h for i, h in enumerate(top_hero) if i%2 == 0] # after finger snap

h1_ = []; h2_ = []; cnt_ = [];
for comb in list(combinations(top_hero, 2)):    
    temp1 = set(edge_df[edge_df['hero']==comb[0]]['comic'])
    temp2 = set(edge_df[edge_df['hero']==comb[1]]['comic'])
    cnt = len(temp1.intersection(temp2)) # Appear Together    
    h1_.append(comb[0]); h2_.append(comb[1]); cnt_.append(cnt);
appto_df = pd.DataFrame({'H1':h1_, 'H2':h2_, 'CNT':cnt_})
all_appto['fs50'] = appto_df.copy()


marvel_net = nx.Graph() 
for i, row in appto_df.iterrows():
    if row['CNT'] > 0:
        marvel_net.add_edge(row['H1'], row['H2'], weight=row['CNT'])
all_net['fs50'] = marvel_net.copy()

for topn in [100, 'fs50', 50]:
    show_network(all_net[topn], title=f"<b>Top {topn} Heroes Network</b>")

In [None]:
test_mean_cent_df = pd.DataFrame()
for topn in [50, 'fs50']:
    test_mean_cent_df[f"mean_cent_{topn}"] = get_cent(all_net[topn]).loc[:, :]['mean_cent']

print("<The existing 50 Hero Network>")
display(test_mean_cent_df[["mean_cent_50"]].sort_values(by=["mean_cent_50"], ascending=False).head(10))

print("<50 Hero Networks After Finger Snap>")
display(test_mean_cent_df[["mean_cent_fs50"]].sort_values(by=["mean_cent_fs50"], ascending=False).head(10))

In [None]:
# prepare data
ncr_ = []; spl_ = []; dia_ = []; rad_ = [];
for n in [50, 'fs50']:
    test_appto = all_appto[n].copy()
    test_net = all_net[n].copy()

    ncr_.append(len(test_appto[test_appto['CNT']==0])/len(test_appto))
    spl_.append(nx.average_shortest_path_length(test_net))
    dia_.append(nx.diameter(test_net))
    rad_.append(nx.radius(test_net))
ch_df = pd.DataFrame({'Heroes':[50, 'fs50'], 'Non-Connection Ratio':ncr_, 'Avg Shortest Path Length':spl_, 'Diameter':dia_, 'Radius':rad_})

# visualization
fig = make_subplots(specs=[[{"secondary_y": True}]])
for c in ['Non-Connection Ratio', 'Avg Shortest Path Length', 'Diameter', 'Radius']:
    temp_ = ch_df[c]
    if c == 'Non-Connection Ratio':
        fig.add_trace(go.Scatter(x=[f"In Top {n} Heroes" for n in [50, 'fs50']], y=temp_, name=c, line=dict(color='darkgray')),
                      secondary_y=True)
    else:
        fig.add_trace(go.Bar(x=[f"In Top {n} Heroes" for n in [50, 'fs50']], y=temp_, name=c
                                ))

fig.update_layout(barmode='group')
fig.update_yaxes(title_text="length", secondary_y=False)
fig.update_yaxes(title_text="ratio", secondary_y=True)
fig.update_layout(title_text='<b>Changes in Network Characteristics according to the number of heroes</b>', template=TEMPLATE, height=400)
fig.show()

In [None]:
# prepare data
min_node_ = []; r_ = [];
for n in [50, 'fs50']:
    test_appto = all_appto[n].copy()
    test_net = all_net[n].copy()

    cnt = len(nx.minimum_node_cut(test_net))
    min_node_.append(cnt)
    r_.append(cnt/50)
ch_df = pd.DataFrame({'Heroes':[50, 'fs50'], 'Number of Nodes':min_node_, 'Cut Node Ratio':r_})

# visualization
fig = make_subplots(specs=[[{"secondary_y": True}]])
for c in ['Number of Nodes', 'Cut Node Ratio']:
    temp_ = ch_df[c]
    if c == 'Cut Node Ratio':
        fig.add_trace(go.Scatter(x=[f"In Top {n} Heroes" for n in [50, 'fs50']], y=temp_, name=c, line=dict(color='darkgray')),
                      secondary_y=True)
    else:
        fig.add_trace(go.Bar(x=[f"In Top {n} Heroes" for n in [50, 'fs50']], y=temp_, name=c
                                ))

fig.update_layout(barmode='group')
fig.update_yaxes(title_text="count", secondary_y=False)
fig.update_yaxes(title_text="ratio", secondary_y=True)
fig.update_layout(title_text="<b>Changes in Network's Robustness according to the number of heroes</b>", template=TEMPLATE, height=400)
fig.show()