In [1]:
import wikipedia as w
import pandas as pd

from time import sleep

# Wikipedia Edgelist Generator

Feel free to use this Wikipedia edgelist generator for your own research uses. However, please keep a few things in mind.
- For the function `create_edgelist_df`, the default number of iterations is 2; I used 4 for this example. The higher the number, the more queries you will send to the Wikipedia API. Even at 4, we ended up with a network of 9,000+ nodes, so don't go overboard. Use this responsibly: start at 2 and check the results; if you need more, increase to 3 and check again, and so on.
- No matter what, do include a sleep() call between searches. I have it set to 0.3 seconds between each search. A longer pause is gentler on Wikipedia's servers, but the crawl will take longer.
- Always crawl responsibly. If you are too aggressive, your IP will be blocked by Wikipedia and you will get nothing.
- Check the day_5_analysis notebook to see how the data can be used.

In [2]:
def search_wiki(search_list, delay=0.3):
    """Search Wikipedia for each term and collect (term, result) pairs.

    Parameters
    ----------
    search_list : iterable of str
        Terms passed to ``w.search`` one at a time.
    delay : float, optional
        Seconds to sleep after every search, whether it succeeded or not.
        Defaults to 0.3.

    Returns
    -------
    origin_pages : list of str
        The search term, repeated once for every result it produced.
    found_pages : list of str
        The results, aligned index-for-index with ``origin_pages``.

    Notes
    -----
    A search that raises an ``Exception`` is reported as 'not found' and
    skipped. ``KeyboardInterrupt`` and ``SystemExit`` are not caught, so a
    long crawl can still be interrupted.
    """
    origin_pages = []
    found_pages = []

    for search in search_list:

        print('searching topic: {}'.format(search))

        try:
            # Edges come from the search results for the term; the page's own
            # links (w.page(search).links) would be the alternative source.
            search_results = w.search(search)

            for found_page in search_results:

                origin_pages.append(search)
                found_pages.append(found_page)

        except Exception:
            # Deliberately best-effort: one failed lookup must not abort the
            # crawl. Catching Exception rather than using a bare except keeps
            # Ctrl-C / "Interrupt Kernel" working.
            print('not found')

        # Throttle every request, including failed ones.
        sleep(delay)

    return origin_pages, found_pages

In [3]:
def create_edgelist_df(seed_searches, iterations=2):
    """Crawl outward from seed search terms and return the edges as a DataFrame.

    Iteration 0 searches the seed terms. Every later iteration searches the
    targets found so far that have not been searched yet.

    Parameters
    ----------
    seed_searches : iterable of str
        Search terms used for iteration 0.
    iterations : int, optional
        Maximum number of search rounds. Defaults to 2. The crawl stops
        earlier if a round discovers nothing new to search.

    Returns
    -------
    pandas.DataFrame
        One row per edge, with columns ``source`` (the term that was searched)
        and ``target`` (a result returned for it). The frame is empty, but
        still has both columns, when no edges were collected.
    """
    # Every term already handed to search_wiki. Terms that failed or returned
    # nothing are recorded too, so they are not re-queried in later rounds.
    completed = set()
    sources = []
    targets = []

    search_list = list(seed_searches)

    for i in range(iterations):

        if not search_list:
            # Nothing left to search; further rounds would be no-ops.
            break

        print('starting iteration: {}'.format(i))
        print()

        origin_pages, found_pages = search_wiki(search_list)

        completed.update(search_list)
        sources.extend(origin_pages)
        targets.extend(found_pages)

        # Next frontier: every target seen so far that has not been searched.
        # Sorted so the crawl order is deterministic.
        search_list = sorted(set(targets) - completed)
        print(len(search_list))
        print()
        print(search_list)
        print()

    # Naming the columns at construction keeps this valid for an empty result.
    return pd.DataFrame({'source': sources, 'target': targets})

In [13]:
# Seed search terms for iteration 0 of the crawl.
seed_searches = ['network science', 'artificial life']

# NOTE: 4 iterations were used for this example run; prefer 1-3 unless the larger network is needed.
df = create_edgelist_df(seed_searches, iterations=4) # every extra iteration searches all newly found titles, so run time grows quickly

starting iteration: 0

searching topic: network science
searching topic: artificial life
20

['Artificial chemistry', 'Artificial general intelligence', 'Artificial life', 'Artificial life (disambiguation)', 'Artificial reproduction', 'Efficiency (network science)', 'Energy Sciences Network', 'Hub (network science)', 'Life', 'Life simulation game', 'Life support', 'National Science Foundation Network', 'Network', 'Network science', 'Network theory', 'Quantum artificial life', 'Social Science Research Network', 'Social network', 'Social network analysis', 'Synthetic biology']

starting iteration: 1

searching topic: Artificial chemistry
searching topic: Artificial general intelligence
searching topic: Artificial life
searching topic: Artificial life (disambiguation)
searching topic: Artificial reproduction
searching topic: Efficiency (network science)
searching topic: Energy Sciences Network
searching topic: Hub (network science)
searching topic: Life
searching topic: Life simulation ga

searching topic: Progress in artificial intelligence
searching topic: Propagation
searching topic: Quantinuum
searching topic: Quantum Aspects of Life
searching topic: Quantum Break
searching topic: Quantum dot
searching topic: Quantum mind
searching topic: Real Chemistry
searching topic: Real Life
searching topic: Regulation of artificial intelligence
searching topic: Reproduction
searching topic: Reproduction and pregnancy in speculative fiction
searching topic: SURAnet
searching topic: Sci-Hub
searching topic: Science Publishing Group
searching topic: Selective breeding
searching topic: Semantic network
searching topic: Seven Network
searching topic: Simulation video game
searching topic: Small-world network
searching topic: Social Science Research Council
searching topic: Social Sciences and Humanities Research Council
searching topic: Social complexity
searching topic: Social network analysis software
searching topic: Social networking service
searching topic: Social research
sear

searching topic: 1337x
searching topic: 2016 in aviation
searching topic: 2021 Russian legislative election
searching topic: 2023 Chinese balloon incident
searching topic: 2024
searching topic: 2024 Nobel Prizes
searching topic: 211th Aviation Regiment (United States)
searching topic: 21st Century Fox
searching topic: 500 Days of Summer
searching topic: 7 Sins (video game)
searching topic: 8chan
searching topic: A Little Life
searching topic: A.I. Artificial Intelligence
searching topic: ACS Applied Energy Materials
searching topic: ACS Chemical Biology
searching topic: ACS Nano
searching topic: ACS Omega
searching topic: AI boom
searching topic: AI safety
searching topic: AI takeover
searching topic: ANT (network)
searching topic: AT Protocol
searching topic: Academy of the Social Sciences in Australia
searching topic: Acquisition of 21st Century Fox by Disney
searching topic: Actor (disambiguation)
searching topic: Adaptive immune system
searching topic: Advanced Crew Escape Suit
sea

searching topic: Electromagnetic tensor
searching topic: Electronic warfare support measures
searching topic: Eliezer Yudkowsky
searching topic: ElonJet
searching topic: Emerald Group Publishing
searching topic: Emergency medical technician
searching topic: Encyclopedia of Life Support Systems
searching topic: End-of-life
searching topic: End-to-end principle
searching topic: Endometriosis
searching topic: Energy conversion efficiency
searching topic: Energy efficiency
searching topic: Energy rating
searching topic: Ernest Lawrence
searching topic: Escherichia coli
searching topic: Escherichia coli in molecular biology
searching topic: Eternal life
searching topic: Ethernet hub
searching topic: Ethics of artificial intelligence
searching topic: European Conference on Wireless Sensor Networks
searching topic: Evolution of human intelligence
searching topic: Executive Office of the President of the United States
searching topic: Expeditions: A MudRunner Game
searching topic: Explainable 

searching topic: List of Fox Broadcasting Company affiliates (by U.S. state)
searching topic: List of Fox Broadcasting Company affiliates (table)
searching topic: List of Internet exchange points
searching topic: List of LGBT social networking services
searching topic: List of Mac models
searching topic: List of My Little Pony: Equestria Girls animations
searching topic: List of Nobel laureates
searching topic: List of Quantum Break episodes
searching topic: List of The Transformers episodes
searching topic: List of Transformers: Animated episodes
searching topic: List of Transformers: Armada episodes
searching topic: List of Transformers: Cybertron episodes
searching topic: List of Transformers: Cyberverse episodes
searching topic: List of Transformers: Energon episodes
searching topic: List of Transformers: Rescue Bots episodes
searching topic: List of academic fields
searching topic: List of airlines of the United States
searching topic: List of artificial intelligence projects
sear

searching topic: Phase qubit
searching topic: Philosophy of artificial intelligence
searching topic: Photosynthesis
searching topic: Physical and logical qubits
searching topic: Physical chemistry
searching topic: Pipe network analysis
searching topic: Planet Coaster 2
searching topic: Plant propagation
searching topic: Plant reproduction
searching topic: Plant-based leather
searching topic: Po Chai Pills
searching topic: Poison
searching topic: Police aviation
searching topic: Poliovirus
searching topic: Polymer
searching topic: Polymer chemistry
searching topic: Polymer science
searching topic: Pope John Paul I
searching topic: Population bottleneck
searching topic: Pornhub
searching topic: Port (computer networking)
searching topic: Post-quantum cryptography
searching topic: Postdoctoral researcher
searching topic: Postmarketing surveillance
searching topic: Potential cultural impact of extraterrestrial contact
searching topic: Power symbol
searching topic: Pregnancy
searching topic

searching topic: The Lobby (TV series)
searching topic: The Most Dangerous Animal in the World
searching topic: The New School
searching topic: The New School for Social Research
searching topic: The Open Network
searching topic: The Pirate Bay
searching topic: The Secret Life of Bees (novel)
searching topic: The Singularity Is Near
searching topic: The Strain
searching topic: Theoretical chemistry
searching topic: Theories of technology
searching topic: Thin-film solar cell
searching topic: Thing theory
searching topic: This Is Your Life
searching topic: Thoroughbred
searching topic: Thread
searching topic: Threads (1984 film)
searching topic: Tiger
searching topic: Timeline of artificial intelligence
searching topic: Timeline of quantum computing and communication
searching topic: Timeline of solar cells
searching topic: Top-level domain
searching topic: Tor (network)
searching topic: Tor Books
searching topic: Transformers: Prime
searching topic: Transmon
searching topic: Transport 

In [14]:
# (rows, columns): one row per source -> target edge collected by the crawl
df.shape

(10632, 2)

In [15]:
# Peek at the first edges; the source is the search term, the target is a result returned for it
df.head()

Unnamed: 0,source,target
0,network science,Network science
1,network science,Hub (network science)
2,network science,National Science Foundation Network
3,network science,Network theory
4,network science,Social network


In [16]:
# Relative output path, resolved against the notebook's working directory.
# NOTE(review): presumably the data/ directory must already exist before this cell runs — confirm.
outfile = 'data/wiki_network_science_artificial_life.csv'

# index=False leaves the row index out of the file, so it holds only the source and target columns
df.to_csv(outfile, index=False)

In [17]:
# verify the outfile looks good
# NOTE: this rebinds `df` to the frame re-read from disk, so later cells see the CSV round-trip copy

df = pd.read_csv(outfile)
df.head()

Unnamed: 0,source,target
0,network science,Network science
1,network science,Hub (network science)
2,network science,National Science Foundation Network
3,network science,Network theory
4,network science,Social network


In [18]:
# Shape of the re-read frame; it should match the shape seen before the CSV round trip
df.shape

(10632, 2)