In [1]:
import os
from time import sleep

import pandas as pd
import wikipedia as w

# Wikipedia Edgelist Generator

Feel free to use this Wikipedia edgelist generator for your own research. However, please keep a few things in mind.
- For the function `create_edgelist_df`, the default number of iterations is 2; I used 4 for this example. The higher the number, the more queries you send to the Wikipedia API. Even at 4, we ended up with a network of 9,000+ nodes, so don't go overboard. Use this responsibly: start at 2 and check the results; if you need more, increase to 3 and check again, and so on.
- No matter what, do include a `sleep()` call. I have it set to 0.3 seconds between searches. A longer delay is gentler on the API, but the crawl will take longer.
- Always crawl responsibly. If you are too aggressive, your IP will be blocked by Wikipedia and you will get nothing.
- Check the day_5_analysis notebook to see how the data can be used.

In [2]:
def search_wiki(search_list, delay=0.3):
    """Search Wikipedia for every term and collect the hits as edge pairs.

    Each term is passed to ``w.search`` (a search query, not the outgoing
    links of the term's page). Every result produces one (term, result) pair.

    Parameters
    ----------
    search_list : iterable of str
        Terms to search for, processed in order.
    delay : float, default 0.3
        Seconds to pause after each search, whether it succeeded or not, so
        the API is not queried too aggressively.

    Returns
    -------
    tuple of (list, list)
        ``(origin_pages, found_pages)``: two parallel lists where
        ``origin_pages[i]`` is the term whose search returned
        ``found_pages[i]``. A term whose search fails or returns nothing
        contributes no entries.
    """
    origin_pages = []
    found_pages = []

    for search in search_list:

        print('searching topic: {}'.format(search))

        try:
            titles = list(w.search(search))
        except Exception as error:
            # Best-effort crawl: one failed lookup must not abort the run.
            # Only Exception is caught so that Ctrl-C (KeyboardInterrupt)
            # still stops a long crawl, and the cause is reported rather
            # than silently discarded.
            print('not found: {!r}'.format(error))
        else:
            origin_pages.extend([search] * len(titles))
            found_pages.extend(titles)

        sleep(delay)

    return origin_pages, found_pages

In [3]:
def create_edgelist_df(seed_searches, iterations=2):
    """Crawl outward from the seed terms and return the edges found.

    Iteration 0 searches the seed terms. Each later iteration searches every
    result found so far that has not itself been searched yet. The crawl
    stops after ``iterations`` rounds, or earlier once nothing new is left.

    Parameters
    ----------
    seed_searches : iterable of str
        Terms the crawl starts from.
    iterations : int, default 2
        Maximum number of search rounds. Each round searches all newly
        discovered results, so the number of queries grows quickly.

    Returns
    -------
    pandas.DataFrame
        One row per (searched term, result) pair, with columns ``source``
        and ``target``. Empty (but with both columns) if nothing was found.
    """
    completed = set()
    sources = []
    targets = []

    search_list = list(seed_searches)

    for i in range(iterations):

        print('starting iteration: {}'.format(i))
        print()

        origin_pages, found_pages = search_wiki(search_list)

        # Mark every term that was queried as done, not only those that
        # returned hits; otherwise terms with no results (or a failed
        # lookup) would be searched again on every later iteration.
        completed.update(search_list)
        sources.extend(origin_pages)
        targets.extend(found_pages)

        search_list = sorted(set(targets) - completed)
        print(len(search_list))
        print()
        print(search_list)
        print()

        if not search_list:
            # Nothing new to search; further rounds would be no-ops.
            break

    # Passing the column names to the constructor keeps this working when no
    # edges were collected (assigning df.columns afterwards raises then).
    df = pd.DataFrame(list(zip(sources, targets)), columns=['source', 'target'])

    return df

In [12]:
# seed term(s) the crawl starts from
seed_searches = ['biosafety']

# NOTE: this run uses 4 iterations even though the guidance below says 1-3;
# every extra iteration searches all newly found titles, so expect a long run
df = create_edgelist_df(seed_searches, iterations=4) # keep it low (1-3) or this will take forever

starting iteration: 0

searching topic: biosafety
10

['American Biological Safety Association', 'Biocontainment', 'Biological hazard', 'Biosafety', 'Biosafety Clearing-House', 'Biosafety cabinet', 'Biosafety level', 'Cartagena Protocol on Biosafety', 'Lentilactobacillus buchneri', 'List of biosafety level 4 organisms']

starting iteration: 1

searching topic: American Biological Safety Association
searching topic: Biocontainment
searching topic: Biological hazard
searching topic: Biosafety
searching topic: Biosafety Clearing-House
searching topic: Biosafety cabinet
searching topic: Biosafety level
searching topic: Cartagena Protocol on Biosafety
searching topic: Lentilactobacillus buchneri
searching topic: List of biosafety level 4 organisms
49

['ABSA', 'Agar plate', 'American Conference of Governmental Industrial Hygienists', 'BCH', 'Bacillus atrophaeus', 'Biocontainment of genetically modified organisms', 'Biological agent', 'Biosafety Level 4 Zoonotic Laboratory Network', 'Biosecu

searching topic: 1967 Marburg virus disease outbreak
searching topic: 1976 Zaire Ebola virus outbreak
searching topic: 2-Butoxyethanol
searching topic: 2025 in the United Nations
searching topic: 7th CBRN Defense Regiment "Cremona"
searching topic: ABS
searching topic: ABSA Cup
searching topic: Abdi-Heba
searching topic: Absa Bank
searching topic: Absa Bank Uganda Limited
searching topic: Absa Bank Zambia
searching topic: Absa Group
searching topic: Absa Tower
searching topic: Access and Benefit Sharing Agreement
searching topic: Acetyl-CoA synthetase
searching topic: Agar
searching topic: Agar.io
searching topic: Agricultural biodiversity
searching topic: Air filter
searching topic: Air purifier
searching topic: Alejandro Cartagena
searching topic: Alfonso de Cartagena
searching topic: Ananda Lewis
searching topic: Annalena Baerbock
searching topic: Anthrax weaponization
searching topic: Antiarrhythmic agent
searching topic: Apple Cider Vinegar (TV series)
searching topic: Apple cider

searching topic: List of foliage plant diseases (Arecaceae)
searching topic: List of miscellaneous fake news websites
searching topic: List of strains of Escherichia coli
searching topic: List of tallest buildings in Omaha, Nebraska
searching topic: List of waste types
searching topic: Lists of environmental publications
searching topic: Lists of environmental topics
searching topic: Lists of websites
searching topic: Long Island
searching topic: MHC class I
searching topic: MOPP (protective gear)
searching topic: MOV
searching topic: Marburg
searching topic: Marburg virus disease
searching topic: Marburgvirus
searching topic: Medicine Creek
searching topic: Microbiological culture
searching topic: Minimum efficiency reporting value
searching topic: Minister for Biosecurity
searching topic: Ministry for Primary Industries
searching topic: Modifications (genetics)
searching topic: Monsanto
searching topic: Montauk Monster
searching topic: Mother (disambiguation)
searching topic: Mozzy
s

In [13]:
# (rows, columns) of the edge list: one row per source/target pair
df.shape

(4091, 2)

In [14]:
# preview the first few source/target rows
df.head()

Unnamed: 0,source,target
0,biosafety,Biosafety
1,biosafety,Biosafety level
2,biosafety,Cartagena Protocol on Biosafety
3,biosafety,Biosafety cabinet
4,biosafety,Biocontainment


In [15]:
outfile = 'data/biosafety.csv'

# to_csv does not create missing directories, so make sure the target folder
# exists first (skipped when outfile has no directory component)
out_dir = os.path.dirname(outfile)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

df.to_csv(outfile, index=False)

In [16]:
# verify the outfile looks good

# keep_default_na=False: page titles such as "NA", "N/A", "NaN" or "null" are
# legitimate node names and must not be converted to missing values on read
df = pd.read_csv(outfile, keep_default_na=False)
df.head()

Unnamed: 0,source,target
0,biosafety,Biosafety
1,biosafety,Biosafety level
2,biosafety,Cartagena Protocol on Biosafety
3,biosafety,Biosafety cabinet
4,biosafety,Biocontainment


In [17]:
# shape of df, which the verification cell above re-read from outfile
df.shape

(4091, 2)