In [2]:
import pickle
import pandas as pd
import numpy as np
import networkx as nx
import re
from operator import itemgetter 

# `sns` is not imported anywhere in the visible cells, so this call raised NameError
# on a fresh kernel. Presumably it is the conventional alias for seaborn — confirm.
import seaborn as sns

sns.set()
import urllib
import time
#from urllib.parse import urlparse
import json

### Get the superheroes

In [3]:
# Character lists are pipe-separated CSV files hosted on GitHub.
marvel_url = 'https://raw.githubusercontent.com/SocialComplexityLab/socialgraphs2020/master/files/marvel_characters.csv'
dc_url = 'https://raw.githubusercontent.com/SocialComplexityLab/socialgraphs2020/master/files/dc_characters.csv'

df_marvel = pd.read_csv(marvel_url, sep = '|').assign(Universe = 'Marvel')

# The DC file is read with ';' as the row terminator; the last parsed row is discarded
# (presumably an artefact of a trailing terminator — TODO confirm against the file).
df_dc = (
    pd.read_csv(dc_url, sep = '|', lineterminator=';')
    .assign(Universe = 'DC')
    .pipe(lambda frame: frame.drop(frame.tail(1).index))
)

# Stack both universes; the original per-file row labels are kept at this stage.
df_superheros_raw = pd.concat([df_marvel, df_dc])

In [4]:
# Split "Page#Section" wiki links into the page title (column 0) and the optional
# section anchor (column 1). reindex guarantees that column 1 exists even when no
# link contains a '#', so `Section` is always created.
link_parts = (
    df_superheros_raw['WikiLink']
    .str.split("#", n = 1, expand = True)
    .reindex(columns = [0, 1])
)

# Final column order is CharacterName, Universe, Link, Section; the download loops
# further down unpack each row positionally in exactly that order.
df_superheros = (
    df_superheros_raw
    # Keyword `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
    .drop(columns = [df_superheros_raw.columns[0], 'WikiLink'])
    # Assigned as plain arrays (positionally) because the concatenated frame still
    # carries the per-file row labels, which are not unique.
    .assign(Link = link_parts[0].to_numpy(), Section = link_parts[1].to_numpy())
    .reset_index(drop = True)
    # Normalise every missing value to None so later `is not None` checks work.
    .assign(
        Link = lambda x: np.where(x.Link.notnull(), x.Link, None),
        Section = lambda x: np.where(x.Section.notnull(), x.Section, None),
    )
    .drop_duplicates(subset = ['CharacterName'])
    # This entry is excluded explicitly by name.
    .loc[lambda x: x.CharacterName != 'Dorling Kindersley']
)
df_superheros

Unnamed: 0,CharacterName,Universe,Link,Section
0,A-Bomb,Marvel,Rick Jones (character),
1,Abomination,Marvel,Abomination (comics),
2,Abraxas,Marvel,,
3,Abyss,Marvel,,
4,Access,Marvel,,
...,...,...,...,...
3052,Zor-El,DC,Zor-El,
3053,"Zor-El, Kara",DC,Supergirl_(Kara_Zor-El),
3054,Zor-L,DC,Zor-El,Earth-Two
3055,"Zor-L II, Kara(DC Elseworlds)",DC,Power_Girl,Other_versions


### Get the wiki data

In [5]:
def construct_Wiki_API_link(link, section=None):
    """Build a Wikipedia API query URL requesting the revision content of a page.

    Parameters
    ----------
    link : str
        Page title; percent-encoded before being placed in the ``titles`` parameter.
    section : str, optional
        When truthy, percent-encoded and sent as a ``section`` parameter.
        ``None`` or an empty string omits the parameter.
        NOTE(review): the revisions module's section selector is presumably
        ``rvsection`` — confirm that the API honours ``section`` here.

    Returns
    -------
    str
        The full ``https://en.wikipedia.org/w/api.php`` URL, with ``format=json``.
    """
    # Imported locally: a bare ``import urllib`` does not load the ``parse`` submodule.
    from urllib.parse import quote

    params = ["action=query", "prop=revisions", "rvprop=content"]
    if section:
        params.append("section=" + quote(section))
    params.append("titles=" + quote(link))
    params.append("format=json")

    # Joining the parts avoids the empty "&&" component the old string template
    # produced whenever no section was given.
    return "https://en.wikipedia.org/w/api.php?" + "&".join(params)

def get_raw_text(query):
    """Fetch a Wikipedia API query URL and return the page's revision content.

    Parameters
    ----------
    query : str
        URL to open; the response body must be UTF-8 encoded JSON containing
        ``query -> pages``.

    Returns
    -------
    str or None
        The ``'*'`` field of the first revision of the first page in the
        response, or ``None`` when that page's id is ``'-1'``.

    Raises
    ------
    KeyError
        If the response lacks the ``query``/``pages``/``revisions`` structure.
    """
    # Imported locally: a bare ``import urllib`` does not load the ``request`` submodule.
    from urllib.request import urlopen

    # The context manager closes the connection even if reading or parsing fails.
    with urlopen(query) as wikiresponse:
        wikidict = json.loads(wikiresponse.read().decode('utf-8'))

    pages = wikidict['query']['pages']
    page_id = list(pages)[0]

    if page_id == '-1':
        return None

    return pages[page_id]['revisions'][0]['*']

In [6]:
# Download the raw wiki text of every character, keyed by character name.
super_hero_dict = {}
n_characters = len(df_superheros)

for position, (character_name, universe, wiki_character_name, section) in enumerate(
    df_superheros.itertuples(index=False, name=None)
):
    # Characters without a wiki link keep an entry, but with no text.
    if wiki_character_name is None:
        wiki_text = None
    else:
        wiki_text = get_raw_text(construct_Wiki_API_link(wiki_character_name, section))

    super_hero_dict[character_name] = {
        'Wiki_Name' : wiki_character_name,
        'Wiki_text' : wiki_text,
        'Universe'  : universe,
        'index'     : position,
    }

    # Carriage return keeps the progress counter on a single output line.
    print("{} out of {} ... ".format(position, n_characters), end = '\r')

3033 out of 3034 ... 

In [7]:
# Persist the dictionary of raw wiki text to disk.
unformatted_pickle_path = 'super_hero_dict_unformatted.pickle'
with open(unformatted_pickle_path, mode='wb') as pickle_file:
    pickle.dump(super_hero_dict, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

### Building the network

In [8]:
# Initialize adjacency matrix: one row and one column per character, filled with zeros
all_keys = super_hero_dict.keys()
Adj_matrix = pd.DataFrame(0, index=all_keys, columns=all_keys)

In [9]:
# Show the character names that label the rows and columns of the adjacency matrix
all_keys

dict_keys(['A-Bomb', 'Abomination', 'Abraxas', 'Abyss', 'Access', 'Achebe', 'Adam', 'Aero', 'Agamemnon', 'Aginar', 'Ajak', 'Ajax', 'Amphibius', 'Anansi', 'Anelle', 'Arishem the Judge', 'Atum', 'Avoe', 'Bakuto', 'Brian Banner', 'Banshee', 'Barbarus', 'Baron Blood', 'Baron Brimstone', 'Baron Mordo', 'Baron Strucker', 'Baron Zemo', 'Barracuda', 'Barrage', 'Breeze Barton', 'Basilisk', 'Bast', 'Bastion', 'Batroc the Leaper', 'Battleaxe', 'Battlestar', 'Batwing', 'Baymax', 'Beast', 'Beautiful Dreamer', 'Bedlam', 'Beef', 'Beetle', 'Bela', 'Belasco', 'Bella Donna', 'Bengal', 'Bereet', 'Berzerker', 'Beta Ray Bill', 'Beyonder', 'Bi-Beast', 'Big Bertha', 'Big Man', 'Big Wheel', 'Bird-Brain', 'Bird-Man', 'Bishop', 'Bison', 'Black Ant', 'Black Bolt', 'Black Box', 'Black Cat', 'Black Crow', 'Black Dwarf', 'Black Fox', 'Black Jack Tarr', 'Black Knight', 'Black Mamba', 'Black Marvel', 'Black Panther', 'Black Racer', 'Black Rider', 'Black Spectre', 'Black Swan', 'Black Talon', 'Black Tarantula', 'Black

In [10]:
# Populate adjacency matrix: row = character whose text is scanned,
# column = character named as the target of a [[...]] link in that text.
known_characters = set(all_keys)  # built once instead of once per character

# Captures the text between "[[" and the first "|" or "]".
# Raw string: the previous plain string relied on invalid escape sequences.
wiki_link_pattern = re.compile(r"\[\[(.*?)[\||\]\]]")

for key, value in super_hero_dict.items():
    wiki_text = value['Wiki_text']
    # Entries without downloaded text have no links to record.
    if not wiki_text:
        continue

    linked_characters = known_characters.intersection(wiki_link_pattern.findall(wiki_text))
    if linked_characters:
        # .loc rejects set indexers in pandas 2.0, so a list is passed instead.
        Adj_matrix.loc[key, list(linked_characters)] = 1

In [18]:
# nx.from_numpy_matrix was removed in NetworkX 3.0; from_numpy_array is its replacement
# and builds the same directed graph from the 0/1 matrix (entry [i, j] -> edge i -> j).
G = nx.from_numpy_array(Adj_matrix.to_numpy(), create_using=nx.DiGraph())

# Because we use numpy, nodes are numbered 0..n-1 and the labels need to be reset
label_mapping = dict(enumerate(Adj_matrix.columns))
G = nx.relabel_nodes(G, label_mapping)

In [32]:
# Patterns used to count the word-like tokens of each character's wiki text.
ref_or_newline = re.compile(r'(<ref.+?(<\/ref>|\/>)|\n)')
word_token = re.compile(r'\b[\w-]+\b')

for node in G.nodes:
    hero_entry = super_hero_dict[node]
    article_text = hero_entry['Wiki_text']

    # Characters without text get a content length of zero.
    length_of_content = 0
    if article_text:
        # Remove <ref ...> tags and newlines, then count the remaining word-like tokens.
        # NOTE(review): newlines are deleted rather than replaced by a space, so the last
        # word of a line fuses with the first word of the next line — confirm intended.
        length_of_content = len(word_token.findall(ref_or_newline.sub("", article_text)))

    # Attach universe and content length as node attributes.
    G.nodes[node].update({
        "Universe" : hero_entry['Universe'],
        "length_of_content" : length_of_content
    })


In [35]:
# Get GCC: the largest connected component of the undirected version of the graph
G_undirected = G.to_undirected()

# Order the components from largest to smallest; the first one is the GCC.
con_comp = list(nx.connected_components(G_undirected))
con_comp.sort(key = len, reverse = True)
largest_component = con_comp[0]

# .copy() turns the subgraph view into an independent graph.
GCC = G_undirected.subgraph(largest_component).copy()

In [36]:
# nx.write_gpickle was removed in NetworkX 3.0; the standard pickle module is used
# instead and works with every NetworkX version.
# NOTE(review): both files receive the same object (the undirected GCC), although the
# file names suggest a full undirected graph and a directed GCC — confirm the intent.
for graph_path in ('full_graph_undirect.pickle', 'GCC_direct.pickle'):
    with open(graph_path, 'wb') as graph_file:
        pickle.dump(GCC, graph_file, protocol=pickle.HIGHEST_PROTOCOL)

### Get formatted wiki_text

In [None]:
def construct_Wiki_API_link(link, section=None):
    """Build a Wikipedia API query URL requesting the plain-text extract of a page.

    This redefines the earlier function of the same name, which requested
    revision content instead.

    Parameters
    ----------
    link : str
        Page title; percent-encoded before being placed in the ``titles`` parameter.
    section : str, optional
        When truthy, percent-encoded and sent as a ``section`` parameter.
        ``None`` or an empty string omits the parameter.
        NOTE(review): confirm that the extracts module honours ``section``.

    Returns
    -------
    str
        The full ``https://en.wikipedia.org/w/api.php`` URL, with ``format=json``.
    """
    # Imported locally: a bare ``import urllib`` does not load the ``parse`` submodule.
    from urllib.parse import quote

    # NOTE(review): ``rvprop`` is kept from the original query; it presumably only
    # applies to prop=revisions — confirm whether it can be dropped.
    params = ["action=query", "prop=extracts", "exlimit=max", "explaintext", "rvprop=content"]
    if section:
        params.append("section=" + quote(section))
    params.append("titles=" + quote(link))
    params.append("format=json")

    # Joining the parts avoids the empty "&&" component the old string template
    # produced whenever no section was given.
    return "https://en.wikipedia.org/w/api.php?" + "&".join(params)


def get_raw_text(query):
    """Fetch a Wikipedia API query URL and return the page's plain-text extract.

    This redefines the earlier function of the same name, which read the
    revision content instead of the ``extract`` field.

    Parameters
    ----------
    query : str
        URL to open; the response body must be UTF-8 encoded JSON containing
        ``query -> pages``.

    Returns
    -------
    str or None
        The ``extract`` field of the first page in the response, or ``None``
        when that page's id is ``'-1'``.

    Raises
    ------
    KeyError
        If the response lacks the ``query``/``pages``/``extract`` structure.
    """
    # Imported locally: a bare ``import urllib`` does not load the ``request`` submodule.
    from urllib.request import urlopen

    # The context manager closes the connection even if reading or parsing fails.
    with urlopen(query) as wikiresponse:
        wikidict = json.loads(wikiresponse.read().decode('utf-8'))

    pages = wikidict['query']['pages']
    page_id = list(pages)[0]

    if page_id == '-1':
        return None

    return pages[page_id]['extract']

In [None]:
# Download the plain-text extract of every character, keyed by character name.
super_hero_dict_formatted = {}
n_characters = len(df_superheros)

for position, (character_name, universe, wiki_character_name, section) in enumerate(
    df_superheros.itertuples(index=False, name=None)
):
    # Characters without a wiki link keep an entry, but with no text.
    if wiki_character_name is None:
        wiki_text = None
    else:
        wiki_text = get_raw_text(construct_Wiki_API_link(wiki_character_name, section))

    super_hero_dict_formatted[character_name] = {
        'Wiki_Name' : wiki_character_name,
        'Wiki_text' : wiki_text,
        'Universe'  : universe,
        'index'     : position,
    }

    # Carriage return keeps the progress counter on a single output line.
    print("{} out of {} ... ".format(position, n_characters), end = '\r')

In [None]:
# Persist the dictionary of plain-text extracts to disk.
formatted_pickle_path = 'super_hero_dict_formatted.pickle'
with open(formatted_pickle_path, mode='wb') as pickle_file:
    pickle.dump(super_hero_dict_formatted, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)