In [4]:
import os
from time import sleep

import pandas as pd
import wikipedia as w

# Wikipedia Edgelist Generator

Feel free to use this Wikipedia edgelist generator for your own research. However, please keep a few things in mind:
- For the function `create_edgelist_df`, the default number of iterations is 2; I used 4 for this example. The higher the number, the more requests you will send to the Wikipedia API. Even at 4, we ended up with a network of 9,000+ nodes, so don't go overboard. Use this responsibly: start at 2 and check the results; if you need more, increase to 3, check the results again, and so on.
- No matter what, include a `sleep()` call between requests. I have it set to 0.3 seconds between searches. A longer delay is gentler on the API, but the crawl will take longer.
- Always crawl responsibly. If you are too aggressive, your IP will be blocked by Wikipedia and you will get nothing.
- Check the day_5_analysis notebook to see how the data can be used.

In [5]:
def search_wiki(search_list, delay=0.3):
    """Run a Wikipedia search for each term and collect the hits as edge pairs.

    Parameters
    ----------
    search_list : iterable of str
        Terms to pass to ``w.search``; one request is made per term.
    delay : float, optional
        Seconds to pause after every request (default 0.3) so the API is not
        hammered. Values <= 0 disable the pause.

    Returns
    -------
    origin_pages : list of str
        The search term that produced each hit. These are the raw search
        strings (e.g. 'kpop'), not necessarily real page titles.
    found_pages : list of str
        The page title returned for the term at the same position in
        ``origin_pages``.

    Notes
    -----
    A failed search is reported and skipped so that one bad request does not
    abort the whole crawl. Only ``Exception`` subclasses are swallowed, so a
    kernel interrupt (``KeyboardInterrupt``) still stops the crawl.
    """
    origin_pages = []
    found_pages = []

    for search in search_list:

        print('searching topic: {}'.format(search))

        try:
            search_results = w.search(search)
        except Exception as err:
            # best-effort: report the real cause and move on to the next term
            print('search failed ({}): {}'.format(type(err).__name__, err))
        else:
            for found_page in search_results:
                origin_pages.append(search)
                found_pages.append(found_page)

        # throttle every request, including failed ones
        if delay > 0:
            sleep(delay)

    return origin_pages, found_pages

In [6]:
def create_edgelist_df(seed_searches, iterations=2):
    """Crawl Wikipedia search results outward from the seeds and return an edgelist.

    Each iteration searches every term in the current frontier with
    ``search_wiki`` and records one (source, target) edge per hit. The next
    frontier is every target seen so far that has not been searched yet.

    Parameters
    ----------
    seed_searches : str or iterable of str
        Starting search term(s). A single string is treated as one term.
    iterations : int, optional
        Maximum number of crawl rounds (default 2). The crawl stops early
        when there is nothing new left to search.

    Returns
    -------
    pandas.DataFrame
        Columns ``source`` (the search term) and ``target`` (a page title it
        returned), one row per edge. Empty, but with both columns, when no
        edges were found.
    """
    # a bare string would otherwise be iterated character by character
    if isinstance(seed_searches, str):
        seed_searches = [seed_searches]

    completed = set()
    sources = []
    targets = []

    search_list = list(seed_searches)

    for i in range(iterations):

        # nothing new to search: further rounds would add no edges
        if not search_list:
            break

        print('starting iteration: {}'.format(i))
        print()

        origin_pages, found_pages = search_wiki(search_list)

        # mark every attempted term as done, including terms that failed or
        # returned no hits, so they are not searched again in later rounds
        completed.update(search_list)
        sources.extend(origin_pages)
        targets.extend(found_pages)

        search_list = sorted(set(targets) - completed)
        print(len(search_list))
        print()
        print(search_list)
        print()

    # passing columns= keeps the frame well-formed even when there are no edges
    df = pd.DataFrame(list(zip(sources, targets)), columns=['source', 'target'])

    return df

In [13]:
# Seed search terms that start the crawl.
seed_searches = ['kpop']

# NOTE: iterations=4 is above the 1-3 range recommended in the trailing comment;
# each extra iteration searches every newly found page, so the run gets much longer.
df = create_edgelist_df(seed_searches, iterations=4) # keep it low (1-3) or this will take forever

starting iteration: 0

searching topic: kpop
10

['Guardian: The Lonely and Great God', 'K-pop', 'K-pop (song)', 'K-pop Selection', 'KPOP (FM)', 'KPOP (disambiguation)', 'KPOP (musical)', 'KPOP-LP', 'List of South Korean idol groups', 'Park Shin-hye']

starting iteration: 1

searching topic: Guardian: The Lonely and Great God
searching topic: K-pop
searching topic: K-pop (song)
searching topic: K-pop Selection
searching topic: KPOP (FM)
searching topic: KPOP (disambiguation)
searching topic: KPOP (musical)
searching topic: KPOP-LP
searching topic: List of South Korean idol groups
searching topic: Park Shin-hye
77

['134340', 'Alexander Lee (entertainer)', 'Ashley Park (actress)', 'Cha Tae-hyun', 'DWET-FM', 'Doctor Slump (TV series)', 'Eunice', 'G.o.d', 'Gong Yoo', 'Hamada (name)', 'Hard Rock FM', 'Hitz (radio station)', 'Honorific nicknames in popular music', 'Hori7on', 'Hwang Shin-hye', 'I Am a Man (TV series)', 'Iggy Pop', 'Ive (group)', 'Jason Tam', 'K-pop Cover Dance Festival', 'K-

searching topic: 100 Days Miracle
searching topic: 106.7 FM
searching topic: 16
searching topic: 1980s in music
searching topic: 2022 Cheez-It Bowl
searching topic: 2023 Pop-Tarts Bowl
searching topic: 2023 in South Korean music
searching topic: 2023–24 NCAA football bowl games
searching topic: 5-Star (Stray Kids album)
searching topic: 8th Asia Artist Awards
searching topic: A Chorus Line
searching topic: A Man and a Woman (2016 film)
searching topic: Academic dishonesty
searching topic: Ahn Jae-hyun
searching topic: Albert Suho
searching topic: Album-oriented rock
searching topic: Alexander Lee
searching topic: Along with the Gods: The Two Worlds
searching topic: Am I Actually the Strongest?
searching topic: Angel's Last Mission: Love
searching topic: Anishinaabe traditional beliefs
searching topic: Ashley Bell (actress)
searching topic: Ashley Liao
searching topic: Ashley Newbrough
searching topic: Ashley Olsen
searching topic: Ashley Spencer (actress)
searching topic: Astro (South 

searching topic: Les Misérables (musical)
searching topic: Lisa (rapper)
searching topic: Lisa Lisa
searching topic: Lisa Marie Presley
searching topic: List of 2010s deaths in popular music
searching topic: List of BoA concert tours
searching topic: List of Doc Martin episodes
searching topic: List of Dolce Amore episodes
searching topic: List of Dr. Slump Arale-chan episodes
searching topic: List of Dr. Slump chapters
searching topic: List of Dr. Slump episodes
searching topic: List of Internet radio stations
searching topic: List of J-pop artists
searching topic: List of K-pop Hot 100 number ones
searching topic: List of K-pop albums on the Billboard charts
searching topic: List of K-pop on the Billboard charts
searching topic: List of K-pop on the Billboard year-end charts
searching topic: List of K-pop songs on the Billboard Japan Hot 100
searching topic: List of My Little Old Boy episodes
searching topic: List of Neptune-crossing minor planets
searching topic: List of Pop-Tarts B

searching topic: Show Me the Money 10
searching topic: Show Me the Money 11
searching topic: Show Me the Money 4
searching topic: Show Me the Money 5
searching topic: Show Me the Money 777
searching topic: Show Me the Money 9
searching topic: Show Me the Money discography
searching topic: Silenced (film)
searching topic: Sirasa TV
searching topic: Smugglers (2023 film)
searching topic: Solo (Jennie song)
searching topic: Solo Leveling
searching topic: Son of the South (film)
searching topic: Song Hye-kyo
searching topic: Song of the Bandits
searching topic: Sonic the Hedgehog (2006 video game)
searching topic: Sound
searching topic: Soundtrack
searching topic: South Korea
searching topic: Spider-Man (1994 TV series)
searching topic: Spiritwalker (film)
searching topic: Splitting Heirs
searching topic: Standing Outside a Broken Phone Booth with Money in My Hand
searching topic: Stay (I Missed You)
searching topic: Still 17
searching topic: Storm Eunice
searching topic: Stray Kids (TV pr

In [18]:
# (rows, columns): one row per source -> target edge collected by the crawl
df.shape

(6395, 2)

In [19]:
# preview the first few edges
df.head()

Unnamed: 0,source,target
0,kpop,K-pop
1,kpop,KPOP (musical)
2,kpop,K-pop (song)
3,kpop,KPOP (disambiguation)
4,kpop,List of South Korean idol groups


In [20]:
outfile = 'data/kpop_edgelist.csv'

# to_csv does not create missing parent directories, so make sure the target
# folder exists first (falls back to '.' when outfile has no directory part)
os.makedirs(os.path.dirname(outfile) or '.', exist_ok=True)

# index=False: the row index carries no information for an edgelist
df.to_csv(outfile, index=False)

In [21]:
# verify the outfile looks good
#
# Read every value back as a plain string: with the default NA parsing, page
# titles such as 'NaN', 'Null' or 'N/A' would be turned into missing values,
# so the check would not reflect what was actually written.
df = pd.read_csv(outfile, dtype=str, keep_default_na=False)
df.head()

Unnamed: 0,source,target
0,kpop,K-pop
1,kpop,KPOP (musical)
2,kpop,K-pop (song)
3,kpop,KPOP (disambiguation)
4,kpop,List of South Korean idol groups


In [22]:
# should match the shape of the frame that was written out
df.shape

(6395, 2)