In [None]:
from bs4 import BeautifulSoup
import requests
import json
import pandas as pd

In [None]:
# ************ MAIN() ************

def main():
    '''Run the whole pipeline: collect names, crawl their pages, clean the matrix.

    The three steps hand their results to each other through files on disk
    (grab_link_titles() writes the name list that crawling() loads again, and
    crawling() writes the matrix that manually_delete_data_from_df() reads),
    so the set returned by grab_link_titles() is not needed here.
    '''
    grab_link_titles()
    crawling()
    manually_delete_data_from_df()
    
    
# ************ GRAB_LINK_TITLES() ************
    
def grab_link_titles():
    '''Collect the link titles found on the per-letter overview pages of Perrypedia.

    For every letter A-Z the overview page is downloaded and its link titles
    are gathered into one set. The set is then passed through
    check_for_bad_data(); the remaining names are written as JSON to
    "PR_person_namelist.txt" and the removed ones to "bad_data.txt".

    Returns:
        The set of names that remained after check_for_bad_data().
    '''
    print("************ GRAB_LINK_TITLES() ************")
    letters = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    collected = set()
    for letter in letters:
        print(f'Buchstabe: {letter}')
        overview = get_page(generate_URLs(letter))
        if not overview:
            # get_page() already reported the failure; skip this letter.
            continue
        collected.update(get_link_titles(overview))
    print("Checking for bad data")
    collected, rejected = check_for_bad_data(collected, letters)
    with open("PR_person_namelist.txt", "w") as names_file:
        print("Saving as json")
        # Sets are not JSON serializable, hence the conversion to a list.
        json.dump(list(collected), names_file)
    with open("bad_data.txt", "w") as rejected_file:
        json.dump(list(rejected), rejected_file)
    return collected

def generate_URLs(letter):
    '''Build the URL of the Perrypedia overview page for the given letter.'''
    return f"https://www.perrypedia.de/wiki/Personen_{letter}"


def get_page(url, timeout=30):
    '''Download a page and return its content as a parseable BeautifulSoup object.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the crawl forever.

    Returns:
        A BeautifulSoup object parsed with 'html.parser', or None when the
        request fails or the server answers with a status code other than 200.
        In both failure cases a short message is printed.
    '''
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as err:
        # Treat network problems like a bad status code: report and return None
        # so that callers can simply skip this page.
        print(f"{url}: {err}")
        return None
    if response.status_code != 200:
        print(f"{url}: {response.status_code}")
        return None
    return BeautifulSoup(response.content, 'html.parser')


def get_link_titles(soup):
    '''Gather link titles from the tables of an overview page.

    For every table row only the first cell and, inside it, only the first
    link is inspected. Links without a title and titles containing
    "Seite nicht vorhanden" are skipped.

    Returns:
        A set with the collected titles.
    '''
    found = set()
    for table in soup.find_all("table"):
        for row in table.find_all("tr"):
            cell = row.find("td")
            if cell is None:
                continue
            anchor = cell.find("a")
            if anchor is None:
                continue
            label = anchor.get("title")
            if label is None or "Seite nicht vorhanden" in label:
                continue
            found.add(label)
    return found


def check_for_bad_data(name_set, letters):
    '''Remove names listed on the "Spezial:Linkliste/Personen_<letter>" pages.

    For each letter the corresponding link list page is downloaded and handed
    to delete_doublettes() together with name_set. Pages that could not be
    loaded are skipped.

    Returns:
        A tuple (name_set, bad_data): the name set as returned by the last
        delete_doublettes() call and the set of all names that were removed.
    '''
    url_template = "https://www.perrypedia.de/mediawiki/index.php?title=Spezial:Linkliste/Personen_{}&limit=1000&hidelinks=1"
    rejected = set()
    for letter in letters:
        link_page = get_page(url_template.format(letter))
        if not link_page:
            continue
        name_set, removed = delete_doublettes(link_page, name_set)
        rejected.update(removed)
    return name_set, rejected


def delete_doublettes(page, name_set):
    '''Remove from name_set every name that is a link title in the first <ul> of page.

    Parameters:
        page: parsed page; only its first <ul> element is inspected.
        name_set: set of names; it is modified in place.

    Returns:
        A tuple (name_set, bad_data): the same set object that was passed in
        and the set of names that were removed from it. When the page has no
        <ul> element nothing is removed and bad_data is empty.
    '''
    link_list = page.find("ul")
    if link_list is None:
        # find() returns None when there is no <ul>; without this guard the
        # find_all() call below would raise AttributeError.
        return name_set, set()
    listed_titles = {link.get("title") for link in link_list.find_all("a")}
    bad_data = listed_titles.intersection(name_set)
    # Callers keep using the object they passed in, so remove in place.
    name_set.difference_update(bad_data)
    return name_set, bad_data


# ************ CRAWLING() ************

def crawling():
    '''Crawl the page of every name and record which other names it links to.

    The name list is loaded with data_preparation(). For each name the
    corresponding wiki page is downloaded and crawl_for_names_in_text() marks
    the links found in the module-level adjacency matrix df. Afterwards the
    matrix is written to "PR_person_matrix.csv" and the name -> URL mapping
    to "url_mapping.json".

    A failure on a single page is reported and the crawl continues with the
    next name. Only Exception subclasses are caught, so Ctrl-C
    (KeyboardInterrupt) still stops the crawl.
    '''
    global df
    print("************ CRAWLING() ************")
    print("Preparing data")
    name_list, name_set, length = data_preparation()
    # Use a sorted list as labels: a set has no defined order (and newer
    # pandas versions refuse sets as index/columns), sorting makes the layout
    # of the saved matrix reproducible.
    labels = sorted(name_set)
    df = pd.DataFrame(columns=labels, index=labels)
    url_mapping = {}
    for i, name in enumerate(name_list):
        url = "https://www.perrypedia.de/wiki/" + name.replace(" ", "_")
        url_mapping[name] = url
        try:
            page = get_page(url)
            if page:
                crawl_for_names_in_text(page, name, name_set)
        except Exception as err:
            print(f"Fehler bei {i}: {name}")
            # Show what went wrong instead of hiding the cause.
            print(f"    {type(err).__name__}: {err}")
        processed = i + 1
        if processed % 100 == 0:
            print(f"{processed} / {length} pages processed.")
    print("Saving as .csv")
    df.to_csv("PR_person_matrix.csv")
    with open("url_mapping.json", "w") as file:
        json.dump(url_mapping, file)
    print("Crawling finished.")
    

def data_preparation():
    '''Load the saved name list from "./PR_person_namelist.txt".

    Returns:
        A tuple (name_list, name_set, length): the list as stored in the
        file, the set of its unique entries and the size of that set.
    '''
    with open("./PR_person_namelist.txt", "r") as handle:
        names = json.load(handle)
    unique_names = set(names)
    return names, unique_names, len(unique_names)


def crawl_for_names_in_text(page, name, name_set):
    '''Mark in the adjacency matrix which known names the given page links to.

    Parameters:
        page: parsed page of the character `name`.
        name: name whose page is being processed; selects the row of df.
        name_set: all known names; only link titles contained in it count.

    For every link whose title is a known name other than `name` itself, the
    cell in row `name`, column `title` of the module-level DataFrame df is
    set to 1.
    '''
    global df
    for link in page.find_all("a"):
        title = link.get("title")
        # Compare by value: two equal strings are usually different objects,
        # so an identity check would let links to the page itself through.
        if title in name_set and title != name:
            # Single .loc assignment instead of chained df[title][name] = 1,
            # which may write to a temporary copy.
            df.loc[name, title] = 1
            

# ************ MANUALLY_DELETE_DATA_FROM_DF() ************            
            
def manually_delete_data_from_df():
    '''Remove the names listed in bad_data.txt from the saved adjacency matrix.

    Reads "PR_person_matrix.csv" and "bad_data.txt", drops every listed name
    from both the rows and the columns and writes the result to
    "PR_person_matrix_clean.csv". Names that are not present in the matrix
    are ignored.
    '''
    # The matrix is written with an unnamed index column, so select the index
    # by position rather than by a column label.
    df = pd.read_csv("PR_person_matrix.csv", sep=",", index_col=0)
    print("Finished loading df")
    with open("bad_data.txt", "r") as file:
        bad_data = json.load(file)
    # drop() returns a new DataFrame, so the result has to be kept.
    # errors="ignore" because listed names may already be missing from the
    # matrix.
    df = df.drop(index=bad_data, columns=bad_data, errors="ignore")
    print("Saving to csv")
    df.to_csv("PR_person_matrix_clean.csv", sep=",")

In [None]:
# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()