In [1]:
import wikipedia as w
import pandas as pd

from time import sleep

# Wikipedia Edgelist Generator

Feel free to use this Wikipedia edgelist generator for your own research uses. However, please keep a few things in mind.
- For the function `create_edgelist_df`, the default number of iterations is 2; I used 4 for this example. The higher the number, the more queries you send to the Wikipedia API. Even at 4, we ended up with a network of 9,000+ nodes, so don't go overboard. Use this responsibly: start at 2 and check the results; if you need more, increase to 3 and check again, and so on.
- No matter what, include a `sleep()` call between searches. I have it set to 0.3 seconds. A higher value is better, but the crawl will take longer.
- Always crawl responsibly. If you are too aggressive, your IP will be blocked by Wikipedia and you will get nothing.
- Check the day_5_analysis notebook to see how the data can be used.

In [2]:
def search_wiki(search_list, delay=0.3):
    """Run a Wikipedia search for each topic and collect (topic, result) pairs.

    Parameters
    ----------
    search_list : iterable of str
        Topics passed to ``w.search`` one at a time.
    delay : float, optional
        Seconds to sleep after each search so the API is not queried
        back-to-back. Defaults to 0.3.

    Returns
    -------
    tuple of (list, list)
        ``(origin_pages, found_pages)``: parallel lists in which
        ``found_pages[i]`` is one search result for topic ``origin_pages[i]``.
        A topic whose search fails or returns nothing contributes no entries.
    """
    origin_pages = []
    found_pages = []

    # NOTE(review): the original comment here said the searched terms are
    # dropped later "to keep the properly named page in the network";
    # that clean-up is not done in this function -- confirm where it happens.

    for search in search_list:

        print('searching topic: {}'.format(search))

        try:
            search_results = w.search(search)

            for found_page in search_results:
                origin_pages.append(search)
                found_pages.append(found_page)

        except Exception as err:
            # Best effort: one failed lookup must not abort the whole crawl.
            # Catching Exception (rather than a bare except) still lets
            # KeyboardInterrupt / SystemExit stop a long-running crawl.
            print('not found: {!r}'.format(err))

        # Pause after every request, successful or not, to stay polite.
        sleep(delay)

    return origin_pages, found_pages

In [3]:
def create_edgelist_df(seed_searches, iterations=2):
    """Build a source/target edge list by repeatedly searching Wikipedia.

    Iteration 0 searches the seed topics. Every later iteration searches the
    result titles collected so far that have not been searched yet.

    Parameters
    ----------
    seed_searches : iterable of str
        Topics searched in the first iteration.
    iterations : int, optional
        Maximum number of search rounds. Defaults to 2. The crawl stops early
        once there is nothing new left to search.

    Returns
    -------
    pandas.DataFrame
        One row per (searched topic, search result) pair, with columns
        ``source`` and ``target``. Empty (but with both columns) when no
        edges were collected.
    """
    # Every topic that has already been handed to search_wiki. A set keeps the
    # membership test cheap as the edge list grows.
    completed = set()
    sources = []
    targets = []

    search_list = list(seed_searches)

    for i in range(iterations):

        print('starting iteration: {}'.format(i))
        print()

        origin_pages, found_pages = search_wiki(search_list)

        # Mark everything that was *attempted*, not only the topics that
        # produced results; otherwise a topic whose search failed or came back
        # empty would be queried again on every following iteration.
        completed.update(search_list)
        sources.extend(origin_pages)
        targets.extend(found_pages)

        # Next frontier: result titles that have not been searched yet.
        search_list = sorted(set(targets) - completed)
        print(len(search_list))
        print()
        print(search_list)
        print()

        if not search_list:
            # Nothing new to explore; further iterations would be no-ops.
            break

    # Naming the columns at construction also works for an empty edge list.
    return pd.DataFrame({'source': sources, 'target': targets})

In [7]:
# Seed topics for the first round of Wikipedia searches.
seed_searches = ['conspiracy theory', 'deep state', 'qanon']

# Each additional iteration searches every page title found so far that has not
# been searched yet, so the number of API queries grows quickly.
df = create_edgelist_df(seed_searches, iterations=4) # 4 was used for this example; keep it low (1-3) or this will take forever

starting iteration: 0

searching topic: conspiracy theory
searching topic: deep state
searching topic: qanon
30

['9/11 conspiracy theories', 'Chemtrail conspiracy theory', 'Conspiracy theory', 'Cultural Marxism conspiracy theory', 'Deep State (TV series)', 'Deep state', 'Deep state (disambiguation)', 'Deep state in Turkey', 'Deep state in the United States', 'DeepStateMap.Live', 'Eurabia conspiracy theory', 'Jacob Chansley', 'Jim Caviezel', 'Jim Watkins (businessman)', 'John F. Kennedy assassination conspiracy theories', 'Lauren Witzke', 'List of conspiracy theories', 'Pastel QAnon', 'Patrick M. Byrne', 'Pizzagate conspiracy theory', 'QAnon', 'QAnon Anonymous', 'Reptilian conspiracy theory', 'Romana Didulo', 'Ron Watkins', 'Sean Hannity', 'Sound of Freedom (film)', 'Timeline of incidents involving QAnon', 'Turkish mafia', 'White genocide conspiracy theory']

starting iteration: 1

searching topic: 9/11 conspiracy theories
searching topic: Chemtrail conspiracy theory
searching topic: C

searching topic: CIA Kennedy assassination conspiracy theory
searching topic: Cabrini (film)
searching topic: California
searching topic: Caviezel
searching topic: Chris Coons
searching topic: Cicada 3301
searching topic: Cinematic techniques
searching topic: Clinton body count conspiracy theory
searching topic: Comet Ping Pong
searching topic: Conspiracy theories in United States politics
searching topic: Conspiracy theories in the Arab world
searching topic: Conspirituality
searching topic: Crime in Germany
searching topic: Cultural Bolshevism
searching topic: Dark (TV series)
searching topic: Dayton, Washington
searching topic: Deep
searching topic: Deep Creek State Park
searching topic: Deep Lake
searching topic: Deep River
searching topic: Deep time (disambiguation)
searching topic: Democratic Party (United States)
searching topic: Depression
searching topic: Diane Therrien
searching topic: Dominion Voting Systems v. Fox News Network
searching topic: Double genocide theory
searchi

searching topic: ...And Justice for All (album)
searching topic: ...And Justice for All (film)
searching topic: ...And Justice for All (song)
searching topic: 14
searching topic: 1883 (TV series)
searching topic: 1962 Turkish coup attempt
searching topic: 1972 United States Senate election in Delaware
searching topic: 1979 in the United Kingdom
searching topic: 1990 United States Senate election in Delaware
searching topic: 1995 Azerbaijani coup attempt
searching topic: 1999 Martha's Vineyard plane crash
searching topic: 2008 United States Senate election in Delaware
searching topic: 2010 Japan–South Korea cyber conflict
searching topic: 2010 United States Senate special election in Delaware
searching topic: 2014 United States Senate election in Delaware
searching topic: 2015 Bangkok bombing
searching topic: 2015–16 Cleveland Cavaliers season
searching topic: 2016 Mukilteo shooting
searching topic: 2018 United States Senate election in Delaware
searching topic: 2018–19 Los Angeles Lake

searching topic: Clerkenwell crime syndicate
searching topic: Cloudflare
searching topic: Cobell v. Salazar
searching topic: Cogito, ergo sum
searching topic: Comet (disambiguation)
searching topic: Common logarithm
searching topic: Communism
searching topic: Complex logarithm
searching topic: Conor Leslie
searching topic: Conspiracy Theory with Jesse Ventura
searching topic: Conspiracy theories about Adolf Hitler's death
searching topic: Contiguous United States
searching topic: Continuity of government
searching topic: Conversion of Paul the Apostle
searching topic: Coons
searching topic: Cosmology of Tolkien's legendarium
searching topic: Cotts (surname)
searching topic: Counter Intelligence (film)
searching topic: Counter-jihad
searching topic: Crime boss
searching topic: Crime in Hong Kong
searching topic: Crime in the United States
searching topic: Criminal proceedings in the January 6 United States Capitol attack
searching topic: Cristiana Dell'Anna
searching topic: Critical the

searching topic: Hate crime
searching topic: Head of Christ (disambiguation)
searching topic: Heavenly Kingdom of Everlasting Satisfaction
searching topic: Heavy Breathing
searching topic: Hector Monsegur
searching topic: Heinrich Parler
searching topic: Hercules: The Legendary Journeys
searching topic: Hereditary (film)
searching topic: Herschmann
searching topic: Hijackers in the September 11 attacks
searching topic: His Dark Materials (TV series)
searching topic: History
searching topic: History of Fox News
searching topic: History of Western civilization
searching topic: History of communism
searching topic: History of logarithms
searching topic: History of the Democratic Party (United States)
searching topic: History of the Italian Republic
searching topic: History of the Jews in Egypt
searching topic: History of the United States
searching topic: Holocaust denial
searching topic: Holocaust trivialization
searching topic: Holodomor
searching topic: Holos Krainy
searching topic: Ho

searching topic: List of aircraft losses during the Russo-Ukrainian War
searching topic: List of alien races in DC Comics
searching topic: List of alleged extraterrestrial beings
searching topic: List of alumni of Magdalen College, Oxford
searching topic: List of avian humanoids
searching topic: List of capitals in Pakistan
searching topic: List of career achievements by LeBron James
searching topic: List of communist ideologies
searching topic: List of communities in Saskatchewan
searching topic: List of coups and coup attempts
searching topic: List of coups and coup attempts by country
searching topic: List of coups and coup attempts since 2010
searching topic: List of current United States governors
searching topic: List of documentary films
searching topic: List of endorsements by Donald Trump
searching topic: List of extreme points of the United States
searching topic: List of fictional crime bosses and gang leaders
searching topic: List of fictional reptiles
searching topic: List

searching topic: Parler (disambiguation)
searching topic: Parler family
searching topic: Party divisions of United States Congresses
searching topic: Passion Play
searching topic: Passion of Jesus
searching topic: Patrick Byrne (architect)
searching topic: Patrick Byrne (musician)
searching topic: Patrick Michael Byrne (anthropologist)
searching topic: Patrick Wilson
searching topic: Patriot movement
searching topic: Paul Wahlberg
searching topic: Paul the Apostle
searching topic: Paul the Apostle and Jewish Christianity
searching topic: Paul the Apostle and women
searching topic: Peace at Home Council
searching topic: Pearl Harbor advance-knowledge conspiracy theory
searching topic: Peker
searching topic: Pentti Linkola
searching topic: Peter Fonda
searching topic: Peter Fonda filmography
searching topic: Peter Parler
searching topic: Peterborough City Council (Ontario)
searching topic: Physiographic regions of the United States
searching topic: Pitchford (surname)
searching topic: Pl

searching topic: The Final Problem (Sherlock)
searching topic: The Five (talk show)
searching topic: The Gambler (2014 film)
searching topic: The Girl Who Believes in Miracles
searching topic: The Goyim Know
searching topic: The Infidel (1922 film)
searching topic: The Infidel (2010 film)
searching topic: The International Jew
searching topic: The Ipcress File (TV series)
searching topic: The Laureate
searching topic: The Lord of the Rings: The Rings of Power
searching topic: The Madness of Crowds: Gender, Race and Identity
searching topic: The Midwich Cuckoos (TV series)
searching topic: The Narrow Road to the Deep North (miniseries)
searching topic: The Old Straight Track
searching topic: The Omen
searching topic: The Omen (franchise)
searching topic: The Outlander (film)
searching topic: The Passion of the Christ (soundtrack)
searching topic: The Passion of the Christ: Songs
searching topic: The Penguin (TV series)
searching topic: The Players (2012 film)
searching topic: The Ravine

In [13]:
# (rows, columns) of the edge list: one row per collected source/target pair
df.shape

(13889, 2)

In [14]:
# preview the first rows of the edge list
df.head()

Unnamed: 0,source,target
0,conspiracy theory,Conspiracy theory
1,conspiracy theory,List of conspiracy theories
2,conspiracy theory,Pizzagate conspiracy theory
3,conspiracy theory,Chemtrail conspiracy theory
4,conspiracy theory,Reptilian conspiracy theory


In [15]:
# Destination CSV for the edge list (a relative path).
outfile = 'data/conspiracy_theory_edgelist.csv'

# index=False leaves the row index out of the file, so only the DataFrame's columns are written.
df.to_csv(outfile, index=False)

In [16]:
# verify the outfile looks good: read the CSV back and preview the first rows
# NOTE: this rebinds `df` to the frame loaded from disk, replacing the in-memory crawl result

df = pd.read_csv(outfile)
df.head()

Unnamed: 0,source,target
0,conspiracy theory,Conspiracy theory
1,conspiracy theory,List of conspiracy theories
2,conspiracy theory,Pizzagate conspiracy theory
3,conspiracy theory,Chemtrail conspiracy theory
4,conspiracy theory,Reptilian conspiracy theory


In [17]:
# shape of the frame currently bound to `df`
df.shape

(13889, 2)