In [1]:
import wikipedia as w
import pandas as pd

from time import sleep

# Wikipedia Edgelist Generator

Feel free to use this Wikipedia edgelist generator for your own research uses. However, please keep a few things in mind.
- For the function `create_edgelist_df`, the default number of iterations is 2; I used 4 for this example. The higher the number, the more queries you send to the Wikipedia API. Even at 4, we ended up with a network of 9,000+ nodes, so don't go overboard. Use this responsibly: start at 2 and check the results; if you need more, increase to 3 and check again, and so on.
- Whatever you do, keep the `sleep()` call. It is set to pause for 0.3 seconds after each search; a longer pause is gentler on the API but makes the crawl take longer.
- Always crawl responsibly. If you are too aggressive, your IP will be blocked by Wikipedia and you will get nothing.
- Check the day_5_analysis notebook to see how the data can be used.

In [2]:
def search_wiki(search_list, delay=0.3, search_fn=None):
    """Look up every term in ``search_list`` and return the hits as edge pairs.

    Parameters
    ----------
    search_list : iterable of str
        Terms to look up, in order.
    delay : float, optional
        Seconds to pause after each lookup (default 0.3). Raise it to be
        gentler on the API; avoid lowering it for real crawls.
    search_fn : callable, optional
        Function taking one term and returning an iterable of result titles.
        Defaults to ``w.search``. Pass e.g. ``lambda t: w.page(t).links`` to
        follow page links instead of search results.

    Returns
    -------
    (origin_pages, found_pages) : tuple of two lists
        Parallel lists of equal length; ``origin_pages[k]`` is the term whose
        lookup produced ``found_pages[k]``. The origin is the raw search term
        as given, not a canonical page title.
    """
    origin_pages = []
    found_pages = []

    for search in search_list:

        print('searching topic: {}'.format(search))

        try:
            # Resolve the default lazily and materialise the results inside the
            # try block, so a failing lookup can never leave a partial edge set.
            lookup = w.search if search_fn is None else search_fn
            search_results = list(lookup(search))
        except Exception as err:
            # Catch ordinary failures only: KeyboardInterrupt / SystemExit must
            # still be able to stop a long crawl. Report the cause instead of
            # hiding it behind a generic message.
            print('not found ({}: {})'.format(type(err).__name__, err))
        else:
            origin_pages.extend([search] * len(search_results))
            found_pages.extend(search_results)

        # Throttle every request, including failed ones.
        sleep(delay)

    return origin_pages, found_pages

In [3]:
def create_edgelist_df(seed_searches, iterations=2):
    """Crawl outward from the seed terms and return the edges as a DataFrame.

    Each iteration passes the current frontier to ``search_wiki`` and records
    one (source, target) row per hit. The next frontier is every target seen
    so far that has not yet been searched.

    Parameters
    ----------
    seed_searches : str or iterable of str
        Starting term(s). A single string is treated as one term.
    iterations : int, optional
        Maximum number of crawl rounds (default 2). The frontier grows quickly
        with each round, so increase this gradually.

    Returns
    -------
    pandas.DataFrame
        Columns ``source`` and ``target``, one row per edge. Empty (but with
        both columns) when nothing was searched or found. Sources are the raw
        search terms, so a seed such as 'wilco' and the page 'Wilco' are
        distinct values.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(seed_searches, str):
        seed_searches = [seed_searches]

    search_list = list(seed_searches)
    completed = set()  # every term already attempted, whether or not it had hits
    sources = []
    targets = []

    for i in range(iterations):

        if not search_list:
            # Frontier exhausted: further rounds would do nothing.
            break

        print('starting iteration: {}'.format(i))
        print()

        origin_pages, found_pages = search_wiki(search_list)

        # Mark everything we *tried*, not just terms that produced hits;
        # otherwise failed or empty searches are re-queried every round.
        completed.update(search_list)
        sources.extend(origin_pages)
        targets.extend(found_pages)

        search_list = sorted(set(targets) - completed)
        print(len(search_list))
        print()
        print(search_list)
        print()

    # Passing columns here (rather than assigning df.columns afterwards) keeps
    # the empty case valid instead of raising a length-mismatch error.
    df = pd.DataFrame(list(zip(sources, targets)), columns=['source', 'target'])

    return df

In [10]:
# Seed term(s) the crawl starts from.
seed_searches = ['wilco']

# iterations=4 was used for this example; start at 2 and raise it only if
# needed, since every extra iteration searches all newly found pages and the
# run takes much longer.
df = create_edgelist_df(seed_searches, iterations=4)

starting iteration: 0

searching topic: wilco
10

['Jeff Tweedy', 'Procedure word', 'Roger Wilco', 'Space Quest', 'Wilco', 'Wilco (The Album)', 'Wilco (disambiguation)', 'Wilco (given name)', 'Wilco Kelderman', 'Wilco discography']

starting iteration: 1

searching topic: Jeff Tweedy
searching topic: Procedure word
searching topic: Roger Wilco
searching topic: Space Quest
searching topic: Wilco
searching topic: Wilco (The Album)
searching topic: Wilco (disambiguation)
searching topic: Wilco (given name)
searching topic: Wilco Kelderman
searching topic: Wilco discography
56

["2020 Giro d'Italia", '2021 Tour de France, Stage 1 to Stage 11', '2021 Tour de France, Stage 12 to Stage 21', '2022 Vuelta a España, Stage 12 to Stage 21', '2023 Tour de Suisse', 'A Ghost Is Born', 'A.M. (Wilco album)', 'AGD Interactive', 'AM', 'Anadenanthera colubrina', 'Being There (Wilco album)', 'Billy Bragg discography', 'Cate Le Bon', 'Code word (disambiguation)', 'Cousin (album)', 'Function word', 'Get Fuzz

searching topic: .am
searching topic: 1976 Vuelta a España
searching topic: 20 (number)
searching topic: 2014 Tour de France, Stage 1 to Stage 11
searching topic: 2014 Tour de France, Stage 12 to Stage 21
searching topic: 2016 Tour de France, Stage 1 to Stage 11
searching topic: 2017 Giro d'Italia
searching topic: 2018 Giro d'Italia
searching topic: 2018 Tour de France, Stage 1 to Stage 11
searching topic: 2018 Vuelta a España
searching topic: 2019 Giro d'Italia
searching topic: 2019 Team Jumbo–Visma season
searching topic: 2020 Giro
searching topic: 2020 Giro d'Italia, Stage 1 to Stage 11
searching topic: 2020 Giro d'Italia, Stage 12 to Stage 21
searching topic: 2020 Team Jumbo–Visma season
searching topic: 2020 Tour de France, Stage 1 to Stage 11
searching topic: 2020 Vuelta a España
searching topic: 2020 Vuelta a España, Stage 1 to Stage 9
searching topic: 2021 Giro d'Italia
searching topic: 2021 Giro d'Italia, Stage 1 to Stage 11
searching topic: 2021 Team Jumbo–Visma (men's team) 

searching topic: Manx cat
searching topic: Mattias Skjelmose
searching topic: Max Johnston (musician)
searching topic: Mayday (2021 film)
searching topic: Mayday (Canadian TV series)
searching topic: Mayday (disambiguation)
searching topic: Mayday Parade
searching topic: Mayday, Colorado
searching topic: Melvyn Bragg
searching topic: Members of Mayday
searching topic: Mermaid Avenue Vol. II
searching topic: Mermaid Avenue: The Complete Sessions
searching topic: Mermaid's Avenue
searching topic: Microsoft Flight Simulator
searching topic: Microsoft Flight Simulator (2020 video game)
searching topic: Microsoft Word
searching topic: Miguel Ángel López (cyclist)
searching topic: Mike Heidorn
searching topic: Mission: Impossible – Ghost Protocol
searching topic: Moby Project
searching topic: Moodswings (band)
searching topic: Morse code
searching topic: Morse code abbreviations
searching topic: Mount Blue Sky
searching topic: Mountains classification in the Tour de France
searching topic: M

In [11]:
# (rows, columns): one row per source -> target edge collected by the crawl
df.shape

(4359, 2)

In [12]:
# preview the first few edges
df.head()

Unnamed: 0,source,target
0,wilco,Wilco
1,wilco,Procedure word
2,wilco,Roger Wilco
3,wilco,Wilco (disambiguation)
4,wilco,Jeff Tweedy


In [13]:
# Destination CSV, relative to the notebook's working directory.
# NOTE: to_csv does not create missing folders, so 'data/' must already exist.
outfile = 'data/wilco_edgelist.csv'

# index=False keeps the DataFrame's row index out of the file
df.to_csv(outfile, index=False)

In [14]:
# verify the outfile looks good: reload it from disk and preview it
# (note: this rebinds df to the copy read back from the CSV)

df = pd.read_csv(outfile)
df.head()

Unnamed: 0,source,target
0,wilco,Wilco
1,wilco,Procedure word
2,wilco,Roger Wilco
3,wilco,Wilco (disambiguation)
4,wilco,Jeff Tweedy


In [15]:
# should match the shape shown before saving, confirming the CSV round-trip
df.shape

(4359, 2)