In [None]:
import os
import json
import requests
import re 
import random
from time import sleep
import pandas as pd
import numpy as np
from datetime import datetime
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait, TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from lxml import html

# Network Data Load

We load the network data and extract all character names.

In [None]:
# Directory that holds the network definition files.
DATA_PATH = './data'

# Map each file's base name (extension dropped) to its path inside DATA_PATH.
# Files whose name starts with 'gender_mapping' are left out.
FILES = {
    os.path.splitext(entry)[0]: os.path.join(DATA_PATH, entry)
    for entry in os.listdir(DATA_PATH)
    if not entry.startswith('gender_mapping')
}

for stem, path in FILES.items():
    print(stem, '->', path)

starwars-episode-5-interactions -> ./data/starwars-episode-5-interactions.json
starwars-episode-3-interactions -> ./data/starwars-episode-3-interactions.json
starwars-episode-1-mentions -> ./data/starwars-episode-1-mentions.json
starwars-episode-4-mentions -> ./data/starwars-episode-4-mentions.json
starwars-full-interactions-allCharacters-merged -> ./data/starwars-full-interactions-allCharacters-merged.json
starwars-episode-6-mentions -> ./data/starwars-episode-6-mentions.json
starwars-episode-4-interactions -> ./data/starwars-episode-4-interactions.json
starwars-episode-3-mentions -> ./data/starwars-episode-3-mentions.json
starwars-episode-2-interactions -> ./data/starwars-episode-2-interactions.json
starwars-episode-7-interactions -> ./data/starwars-episode-7-interactions.json
starwars-episode-1-interactions-allCharacters -> ./data/starwars-episode-1-interactions-allCharacters.json
starwars-episode-3-interactions-allCharacters -> ./data/starwars-episode-3-interactions-allCharacters.j

In [None]:
# Accumulates the character names read from the network files.
character_names = list()

def extract_names_from_network_definition(network):
    """Return the 'name' of every entry listed under the network's 'nodes' key.

    An empty list is returned when the network has no 'nodes' key.
    """
    if 'nodes' in network:
        return [entry['name'] for entry in network['nodes']]
    return []

# Read every network file and gather the node names it defines.
for path in FILES.values():
    # Be explicit about UTF-8 so reading the JSON does not depend on the
    # platform's locale encoding.
    with open(path, encoding='utf-8') as f:
        network = json.load(f)
    character_names.extend(extract_names_from_network_definition(network))

# Drop duplicate names collected across the files.
character_names = list(set(character_names))

In [None]:
# Number of distinct character names collected.
len(character_names)

113

In [None]:
# Sort the names so later cells iterate over them in a deterministic order.
character_names = sorted(character_names)

# Scrape the gender data of the characters

The data can be found in the Star Wars "databank" and, for some characters, on "Wookieepedia".

In [None]:
# Base URL of the databank; prepare_url() appends a page slug to it.
databank = 'https://www.starwars.com/databank'
# Databank search URL; prepare_url() replaces the '####' placeholder with the search term.
databank_tricky = 'https://www.starwars.com/search?q=####&f%5Bsearch_section%5D=Databank'
# Base URL of Wookieepedia articles; prepare_url() appends an article name to it.
wookieepedia = 'https://starwars.fandom.com/wiki'
# Wookieepedia search URL; prepare_url() replaces the '####' placeholder with the search term.
wookieepedia_tricky = 'https://starwars.fandom.com/wiki/Special:Search?query=####'

# Seed the RNG that query() uses to draw its random delays.
random.seed(1337)

In [None]:
def prepare_url(term, destination):
    """Build the URL to request for `term` on the given destination.

    'databank' and 'wookieepedia' append the term to the site's base URL.
    'databank_tricky' and 'wookieepedia_tricky' insert the term into the
    site's search URL in place of the '####' placeholder.
    Any other destination returns the term unchanged.
    """
    if destination in ('databank', 'wookieepedia'):
        base = databank if destination == 'databank' else wookieepedia
        return f'{base}/{term}'
    if destination in ('databank_tricky', 'wookieepedia_tricky'):
        template = databank_tricky if destination == 'databank_tricky' else wookieepedia_tricky
        return template.replace('####', term)
    return term

The query function. It introduces a random delay after each request so that we do not overload the target site and get our IP blocked.

In [None]:
def query(url, timeout=30):
    """Fetch `url` with a GET request, then pause for a random delay.

    The pause of 1 to 3 seconds after each request keeps the scraper from
    DOSing the target.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive server would stall the whole
            scraping run indefinitely.

    Returns:
        The response object returned by `requests.get`.

    Raises:
        requests.exceptions.Timeout: If the server does not answer in time.
    """
    response = requests.get(url, timeout=timeout)
    sleep(random.randint(1, 3))
    return response

In [None]:
def convert_name(name, destination='databank'):
    """Format a character name the way the target website expects it.

    A '/' in the name is first replaced by ' and '. After that:
    - 'databank': lower case, spaces become '-'.
    - 'databank_tricky' / 'wookieepedia_tricky': lower case.
    - 'wookieepedia': title case, spaces become '_'.
    - anything else: no further change.
    """
    cleaned = name.replace('/', ' and ')
    if destination == 'databank':
        return cleaned.lower().replace(' ', '-')
    if destination in ('databank_tricky', 'wookieepedia_tricky'):
        return cleaned.lower()
    if destination == 'wookieepedia':
        return cleaned.title().replace(' ', '_')
    return cleaned

## Databank query

First we will query the databank. It should contain most of the information.

Certain characters may not be found easily in the databank or are not available. Thus, we will override them.

There are two ways of searching:
First, we attempt a direct method, i.e. we guess the right URL of the character detail page. 
Normally, if it is not a droid, then we will receive a "gender" in the stats section of the detail page. 
Gender information for droids will be extracted later on from the wookieepedia. 
Droids have either masculine or feminine programming, effectively making them either male or female.
If the guessed URL returns a 404, we will use another method through the search functionality of the database. 
This way we try to extract the correct URL of the detail page from the search results. 
The tricky part here is that the search results are side-loaded after the page load (AJAX), which prevents us from using the "requests" module; in that case we have to use "selenium" instead. 
Selenium waits until the results are visible and then extracts the URLs. 
We will only check the first 3 URLs, because if we don't find it in them, the search term is probably useless. 
Each URL will be tried in the same way as in the first, direct method. Each page is checked whether it is the correct character page, and discarded if it is not.
If no results can be found with either method, then we will use the search term later on to search the wookieepedia.

Some characters turned out to be especially tricky to find on either of the websites we are looking at. For those we define "overrides" and set their gender directly, based on a manual lookup.


In [None]:
# Characters whose name is spelled unusually on their databank page. For these,
# scrape_databank() skips the page-title verification on a direct hit.
weird_spellings_on_database = ['DARTH MAUL']
# Genders assigned by manual lookup. Characters listed here are not scraped;
# the scrape functions return the value given here instead.
overrides = {
    'ANAKIN': 'Male', # database does not contain gender
    'YOUNG GIRL': 'Female', # implicit gender
    'RED LEADER': 'Male', # callsign of pilot
    'RED TEN': 'Male', # callsign of pilot
    'GOLD FIVE': 'Male', # callsign of pilot
    'GOLD LEADER': 'Male', # callsign of pilot
    'BRAVO THREE': 'Male', # callsign of pilot
    'BRAVO TWO': 'Male', # callsign of pilot
    'HAN': 'Male', # tricky to find
    'PK-4': 'Male', # tricky to find
    'FODE/BEED': 'Male', # tricky to find
    'SENATOR ASK AAK': 'Male' # tricky to find
}

In [None]:
def find_database_stat(container, lookup_stat):
    """Look up the value of a named stat inside a databank stats container.

    Args:
        container: Parsed element whose 'category' divs each hold a 'heading'
            div and a 'data' list item.
        lookup_stat: Heading to look for; compared case-insensitively.

    Returns:
        The value of the first category whose heading matches, reduced to
        ASCII letters and whitespace, or None when no category matches or the
        matching category has no value element.
    """
    wanted = lookup_stat.lower()
    for category in container.find_all('div', {'class': 'category'}):
        heading = category.find('div', {'class': 'heading'})
        # A category without a heading cannot match; skip it instead of failing.
        if heading is None or heading.text.lower() != wanted:
            continue
        data = category.find('li', {'class': 'data'})
        if data is None:
            return None
        # 'A-Za-z' rather than 'A-z': the latter also spans the punctuation
        # characters that sit between 'Z' and 'a' in ASCII.
        return re.sub(r'[^A-Za-z\s]', '', data.text.strip())
    return None

def extract_links_from_results_page(term, n=0):
    """Collect the result links from the databank search page for `term`.

    Args:
        term: Search term to look up.
        n: Maximum number of links to return; 0 returns all of them.

    Returns:
        The links of the search results, or an empty list when no results
        became visible within 5 seconds.
    """
    # we are using selenium here since the search results get side loaded with ajax after original page load
    options = Options()
    # Pass the flag directly: the `headless` attribute of Options is deprecated
    # in recent Selenium 4 releases, while the argument works in every version.
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(prepare_url(term, 'databank_tricky'))
        try:
            wait = WebDriverWait(driver, 5)
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.search_results')))
            urls = [element.get_attribute('href') for element in driver.find_elements(By.CSS_SELECTOR, '.result-title')]
        except TimeoutException:
            return []
    finally:
        # Always shut the browser down, including on timeout or error, so no
        # Chrome process is left behind.
        driver.quit()
    # A result without a link target cannot be followed.
    urls = [url for url in urls if url]
    return urls if n == 0 else urls[:n]

def extract_gender_from_character_page(soup, character, check_correct_page=True):
    """Extract the gender shown on a databank character page.

    Args:
        soup: Parsed databank page.
        character: Name of the character the page is expected to describe.
        check_correct_page: When True, the page is rejected unless its long
            title contains the character's name (letters only, ignoring case)
            and does not use that name as a possessive ("<name>'s ...").

    Returns:
        The 'Gender' stat of the page, 'Droid' when there is no gender but a
        'Droid' stat, or None when the page is rejected or holds neither.
    """
    gender = None
    # verify we are on the right page
    if check_correct_page:
        title = soup.find_all('h3', {'class': 'title'})
        if not title:
            print('\tDid not find title element')
            return gender
        full_title = title[0].find_all('span', {'class': 'long-title'})
        if not full_title:
            # Nothing to compare the name against. This has to be checked
            # before indexing into the list.
            print('\tWrong page')
            return gender
        title_text = full_title[0].text
        # 'A-Za-z' rather than 'A-z': the latter also spans the punctuation
        # characters that sit between 'Z' and 'a' in ASCII.
        needle = re.sub(r'[^A-Za-z]', '', character).lower()
        haystack = re.sub(r'[^A-Za-z]', '', title_text).lower()
        # The possessive test needs the apostrophes, which `haystack` no longer
        # has, so it runs against a variant of the title that keeps them.
        possessive_haystack = re.sub(r"[^A-Za-z']", '', title_text.replace('\u2019', "'")).lower()
        is_possessive = bool(needle) and f"{needle}'s" in possessive_haystack
        if needle not in haystack or is_possessive:
            # we are on the wrong page
            print('\tWrong page')
            return gender
    # extract
    stats_container = soup.find_all('div', {'class': 'stats-container'})
    if stats_container:
        container = stats_container[0]
        gender = find_database_stat(container, 'Gender')
        if gender is None:
            print('\tCould not extract gender from container')
            # most likely a droid
            droid = find_database_stat(container, 'Droid')
            if droid is not None:
                print('\tAssuming droid')
                gender = 'Droid'
            else:
                print('\tCould not determine whether droid or not')
        else:
            print(f'\tGender found: {gender}')
    else:
        print('\tNo Stats container found')
    return gender

def scrape_databank(characters):
    """Determine the gender of each character from the databank.

    A character listed in `overrides` gets its override value. Otherwise the
    detail page URL is guessed from the name; if that page is not found, the
    databank search is used and up to the first 3 results are tried.

    Args:
        characters: Character names to look up.

    Returns:
        A dict mapping each character to the extracted gender, 'Droid', or
        None when nothing could be extracted.
    """
    result = dict()
    for idx, character in enumerate(characters):
        print(f'Looking at character {idx+1}/{len(characters)}: {character}')
        if character in overrides:
            print('\tUsing override')
            result[character] = overrides[character]
            print('Success!')
            continue
        # Build search url
        name = convert_name(name=character, destination='databank')
        # Query
        search_url = prepare_url(name, destination='databank')
        print(f'\tURL: {search_url}')
        page = query(search_url)
        # Verify Response: a miss either ends up on the databank landing page or returns a 404
        not_found = page.url.replace('/','') == databank.replace('/','') or page.status_code == 404
        # Query Method
        if not not_found:
            print('\tGoing for the direct way')
            # Easy way if we find a direct hit
            soup = BeautifulSoup(page.content, 'html.parser')
            gender = extract_gender_from_character_page(soup, character, character not in weird_spellings_on_database)
            result[character] = gender
        else:
            print('\tGoing for the search result way')
            # Alternative way when not found initially
            search_name = convert_name(name=character, destination='databank_tricky')
            print(f'\tSearchterm: {search_name}')
            # Search with the plain search term that was just logged, not with
            # the dash-joined URL slug built for the direct lookup.
            links = extract_links_from_results_page(search_name, 0)
            if not links:
                print('\tNo hits')
                result[character] = None
            else:
                gender = None
                for link in links[:3]:
                    print(f'\tURL: {link}')
                    page = query(link) # no need to check if found, since it is provided by the engine
                    soup = BeautifulSoup(page.content, 'html.parser')
                    gender = extract_gender_from_character_page(soup, character)
                    if gender is not None:
                        break
                result[character] = gender
        print('Success!' if result[character] is not None else 'Nothing extracted.')
    return result



In [None]:
# Scrape the databank for every character name; runs for roughly 15min.
gender_mapping_database = scrape_databank(character_names)

Looking at character 1/113: ADMIRAL ACKBAR
	URL: https://www.starwars.com/databank/admiral-ackbar
	Going for the direct way
	Gender found: Male
Success!
Looking at character 2/113: ADMIRAL STATURA
	URL: https://www.starwars.com/databank/admiral-statura
	Going for the direct way
	Gender found: Male
Success!
Looking at character 3/113: ANAKIN
	Using override
Success!
Looking at character 4/113: BAIL ORGANA
	URL: https://www.starwars.com/databank/bail-organa
	Going for the direct way
	Gender found: Male
Success!
Looking at character 5/113: BALA-TIK
	URL: https://www.starwars.com/databank/bala-tik
	Going for the direct way
	Gender found: Male
Success!
Looking at character 6/113: BB-8
	URL: https://www.starwars.com/databank/bb-8
	Going for the direct way
	Could not extract gender from container
	Assuming droid
Success!
Looking at character 7/113: BERU
	URL: https://www.starwars.com/databank/beru
	Going for the search result way
	Searchterm: beru
	URL: https://www.starwars.com/databank/beru-

### Get an Overview

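Let's get an overview of what we have found so far: first the characters identified as droids, then the characters for which nothing could be extracted.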

In [None]:
# Characters the databank lookup marked as droids.
droid_rows = [
    (name, gender)
    for name, gender in gender_mapping_database.items()
    if gender is not None and 'Droid' in gender
]
for position, (name, gender) in enumerate(droid_rows, start=1):
    print(f'{position}:\t{name} - {gender}')
print('-'*20)
# Characters for which nothing was extracted.
missing_rows = [
    (name, gender)
    for name, gender in gender_mapping_database.items()
    if gender is None
]
for position, (name, gender) in enumerate(missing_rows, start=1):
    print(f'{position}:\t{name} - {gender}')

1:	BB-8 - Droid
2:	C-3PO - Droid
3:	R2-D2 - Droid
4:	TC-14 - Droid
--------------------
1:	BOUSHH - None
2:	CAMIE - None
3:	DACK - None
4:	FANG ZAR - None
5:	FODE/BEED - None
6:	GENERAL CEEL - None
7:	GIDDEAN DANU - None
8:	JOBAL - None
9:	NIV LEK - None
10:	PADME - None
11:	RABE - None
12:	RUWEE - None
13:	SENATOR ASK AAK - None
14:	SOLA - None
15:	SUN RIT - None
16:	TEY HOW - None
17:	YOLO ZIFF - None


### Query the Droids

The droid programming information can be found on wookieepedia. From there we derive if it is a "Male" droid or a "Female" droid from the type of programming (voice).

In [None]:
def extract_gender_from_droid_page(soup, droid, check_correct_page=True):
    """Derive a droid's gender from its wookieepedia page.

    Args:
        soup: Parsed wookieepedia page.
        droid: Name of the droid the page is expected to describe.
        check_correct_page: When True, the page is rejected unless its name
            heading contains the droid's name (letters only, ignoring case).

    Returns:
        'Male' or 'Female' depending on the text of the first link titled
        'Sexes', or None when the page is rejected or no such text is found.
    """
    gender = None
    # verify we are on the right page
    if check_correct_page:
        title = soup.find_all('h2', attrs={'data-source' : 'name'})
        if not title:
            print('\tDid not find title element')
            return gender
        full_title = title[0].text
        # 'A-Za-z' rather than 'A-z': the latter also spans the punctuation
        # characters that sit between 'Z' and 'a' in ASCII.
        needle = re.sub(r'[^A-Za-z]', '', droid).lower()
        haystack = re.sub(r'[^A-Za-z]', '', full_title).lower()
        if not full_title or needle not in haystack:
            # we are on the wrong page
            print('\tWrong page')
            return gender
    # extract
    gender_element = soup.find_all('a', attrs={'title' : 'Sexes'})
    if gender_element:
        element = gender_element[0].text
        if 'Masculin' in element or 'Male' in element:
            gender = 'Male'
        elif 'Feminin' in element or 'Female' in element:
            gender = 'Female'
    else:
        print('\tNo Gender Element found')
    return gender

def scrape_wookieepedia_direct_for_droids(characters):
    """Look up each droid's wookieepedia page and derive its gender.

    The droid name is used as-is to build the page URL. A droid whose gender
    cannot be extracted keeps the value 'Droid'.

    Returns a dict mapping each droid name to 'Male', 'Female' or 'Droid'.
    """
    genders = {}
    for position, droid_name in enumerate(characters, start=1):
        print(f'Looking at droid {position}/{len(characters)}: {droid_name}')
        # Request the article page directly
        page_url = prepare_url(droid_name, destination='wookieepedia')
        print(f'\tURL: {page_url}')
        response = query(page_url)
        parsed = BeautifulSoup(response.content, 'html.parser')
        extracted = extract_gender_from_droid_page(parsed, droid_name)
        # Fall back to the generic marker when the page yields nothing
        genders[droid_name] = extracted if extracted is not None else 'Droid'
    return genders


# Entries the databank lookup marked as droids.
droids = {name: gender for name, gender in gender_mapping_database.items() if gender == 'Droid'}
droid_genders = scrape_wookieepedia_direct_for_droids(droids)
# update master mapping
gender_mapping_database = dict(gender_mapping_database)
gender_mapping_database.update(droid_genders)
# checking if we still have droids unresolved
assert 'Droid' not in gender_mapping_database.values(), 'There are still droids you are looking for.'

Looking at droid 1/4: BB-8
	URL: https://starwars.fandom.com/wiki/BB-8
Looking at droid 2/4: C-3PO
	URL: https://starwars.fandom.com/wiki/C-3PO
Looking at droid 3/4: R2-D2
	URL: https://starwars.fandom.com/wiki/R2-D2
Looking at droid 4/4: TC-14
	URL: https://starwars.fandom.com/wiki/TC-14


## Query the remaining characters

In [None]:
def extract_links_from_wookieepedia_results_page(term, n=0):
    """Collect the result links from the wookieepedia search page for `term`.

    Args:
        term: Search term to look up.
        n: Maximum number of links to return; 0 returns all of them.

    Returns:
        The links of the search results, or an empty list when no results
        became visible within 5 seconds.
    """
    # we are using selenium here for no apparent reason
    options = Options()
    # Pass the flag directly: the `headless` attribute of Options is deprecated
    # in recent Selenium 4 releases, while the argument works in every version.
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(prepare_url(term, 'wookieepedia_tricky'))
        try:
            wait = WebDriverWait(driver, 5)
            wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, '.unified-search__results')))
            urls = [element.get_attribute('href') for element in driver.find_elements(By.CSS_SELECTOR, '.unified-search__result__title')]
        except TimeoutException:
            return []
    finally:
        # Always shut the browser down, including on timeout or error, so no
        # Chrome process is left behind.
        driver.quit()
    # A result without a link target cannot be followed.
    urls = [url for url in urls if url]
    return urls if n == 0 else urls[:n]

def extract_gender_from_wookieepedia_character_page(soup, character, check_correct_page=True):
    """Extract a character's gender from a wookieepedia page.

    Args:
        soup: Parsed wookieepedia page.
        character: Name of the character the page is expected to describe.
        check_correct_page: When True, the page is rejected unless it has a
            character info box and a name heading that contains the
            character's name (letters only, ignoring case).

    Returns:
        'Male' or 'Female' depending on the text of the first link titled
        'Sexes' or 'Sexes/Legends', or None when the page is rejected or no
        such text is found.
    """
    gender = None
    # verify we are on the right page
    if check_correct_page:
        character_info = soup.find_all('aside', {'class': 'pi-theme-character'})
        if not character_info:
            print('\tDid not find character info')
            return gender
        title = soup.find_all('h2', attrs={'data-source' : 'name'})
        if not title:
            print('\tDid not find title element')
            return gender
        full_title = title[0].text
        # 'A-Za-z' rather than 'A-z': the latter also spans the punctuation
        # characters that sit between 'Z' and 'a' in ASCII.
        needle = re.sub(r'[^A-Za-z]', '', character).lower()
        haystack = re.sub(r'[^A-Za-z]', '', full_title).lower()
        if not full_title or needle not in haystack:
            # we are on the wrong page
            print('\tWrong page')
            return gender
    # extract
    gender_element = soup.find_all('a', attrs={'title' : ['Sexes', 'Sexes/Legends']})
    if gender_element:
        element = gender_element[0].text
        if 'Male' in element:
            gender = 'Male'
        elif 'Female' in element:
            gender = 'Female'
    else:
        print('\tNo Gender Element found')
    return gender

def scrape_wookieepedia_search_for_remaining(characters):
    """Determine the gender of each character from wookieepedia.

    A character listed in `overrides` gets its override value. Otherwise the
    article URL is guessed from the name; if that page is not found, the
    wookieepedia search is used and up to the first 3 results are tried.

    Args:
        characters: Character names to look up.

    Returns:
        A dict mapping each character to 'Male', 'Female', or None when
        nothing could be extracted.
    """
    result = dict()
    for idx, character in enumerate(characters):
        print(f'Looking at character {idx+1}/{len(characters)}: {character}')
        if character in overrides:
            print('\tUsing override')
            result[character] = overrides[character]
            print('Success!')
            continue
        # Build search url
        name = convert_name(name=character, destination='wookieepedia')
        # Query
        search_url = prepare_url(name, destination='wookieepedia')
        print(f'\tURL: {search_url}')
        page = query(search_url)
        # Verify Response: compare against the wookieepedia base URL (this
        # lookup never touches the databank) or accept a 404 as a miss
        not_found = page.url.replace('/','') == wookieepedia.replace('/','') or page.status_code == 404
        # Query Method
        if not not_found:
            print('\tGoing for the direct way')
            # Easy way if we find a direct hit
            soup = BeautifulSoup(page.content, 'html.parser')
            gender = extract_gender_from_wookieepedia_character_page(soup, character, False)
            result[character] = gender
        else:
            print('\tGoing for the search result way')
            # Alternative way when not found initially
            search_name = convert_name(name=character, destination='wookieepedia_tricky')
            print(f'\tSearchterm: {search_name}')
            # Search with the plain search term that was just logged, not with
            # the underscore-joined article name built for the direct lookup.
            links = extract_links_from_wookieepedia_results_page(search_name, 0)
            if not links:
                print('\tNo hits')
                result[character] = None
            else:
                gender = None
                for link in links[:3]:
                    print(f'\tURL: {link}')
                    page = query(link) # no need to check if found, since it is provided by the engine
                    soup = BeautifulSoup(page.content, 'html.parser')
                    gender = extract_gender_from_wookieepedia_character_page(soup, character)
                    if gender is not None:
                        break
                result[character] = gender
        print('Success!' if result[character] is not None else 'Nothing extracted.')
    return result

In [None]:
# Characters that still have no gender after the databank and droid lookups.
remaining = {name: gender for name, gender in gender_mapping_database.items() if gender is None}
remaining_genders = scrape_wookieepedia_search_for_remaining(remaining)

Looking at character 1/7: DACK
	URL: https://starwars.fandom.com/wiki/Dack
	Going for the direct way
Success!
Looking at character 2/7: GENERAL CEEL
	URL: https://starwars.fandom.com/wiki/General_Ceel
	Going for the direct way
Success!
Looking at character 3/7: JOBAL
	URL: https://starwars.fandom.com/wiki/Jobal
	Going for the direct way
Success!
Looking at character 4/7: RABE
	URL: https://starwars.fandom.com/wiki/Rabe
	Going for the direct way
Success!
Looking at character 5/7: SENATOR ASK AAK
	Using override
Success!
Looking at character 6/7: SOLA
	URL: https://starwars.fandom.com/wiki/Sola
	Going for the direct way
Success!
Looking at character 7/7: SUN RIT
	URL: https://starwars.fandom.com/wiki/Sun_Rit
	Going for the direct way
Success!


### Update

Let's update what we have found.

In [None]:
# Merge the wookieepedia results into the master mapping; on duplicate keys the values from remaining_genders win.
gender_mapping_database = {**gender_mapping_database, **remaining_genders}

There might still be some characters without a gender. Let's have a look; any that remain have to be added manually to the overrides.

In [None]:
# Characters that still have no gender after all lookups.
still_unresolved = {name: gender for name, gender in gender_mapping_database.items() if gender is None}
print(still_unresolved)
assert not still_unresolved, 'Still some left. They might be added in the overrides.'


{}


In [None]:
def store(mapping):
    """Write the name -> gender mapping to a timestamped JSON file in DATA_PATH.

    The file holds a single 'nodes' list with one {'name', 'gender'} entry per
    character and is named gender_mapping_<YYYYmmdd_HHMMSS>.json.
    """
    payload = {
        'nodes': [{'name': name, 'gender': gender} for name, gender in mapping.items()]
    }
    stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    target = os.path.join(DATA_PATH, f'gender_mapping_{stamp}.json')
    with open(target, 'w') as fp:
        json.dump(payload, fp)

# Persist the final mapping as a new timestamped file in DATA_PATH.
store(gender_mapping_database)


In [None]:
# List the data directory to check that the gender_mapping file was written.
os.listdir(DATA_PATH)

['starwars-episode-5-interactions.json',
 'starwars-episode-3-interactions.json',
 'starwars-episode-1-mentions.json',
 'starwars-episode-4-mentions.json',
 'starwars-full-interactions-allCharacters-merged.json',
 'starwars-episode-6-mentions.json',
 'starwars-episode-4-interactions.json',
 'starwars-episode-3-mentions.json',
 'starwars-episode-2-interactions.json',
 'starwars-episode-7-interactions.json',
 'starwars-episode-1-interactions-allCharacters.json',
 'starwars-episode-3-interactions-allCharacters.json',
 'starwars-episode-2-mentions.json',
 'gender_mapping_20201205_112121.json',
 'starwars-episode-2-interactions-allCharacters.json',
 'starwars-episode-1-interactions.json',
 'starwars-episode-7-mentions.json',
 'starwars-full-interactions.json',
 'starwars-episode-6-interactions.json',
 'starwars-full-interactions-allCharacters.json',
 'starwars-episode-5-mentions.json',
 'starwars-episode-6-interactions-allCharacters.json',
 'starwars-episode-7-interactions-allCharacters.jso