In [1]:
import networkx as nx
import pandas as pd
import wikipedia as w

In [2]:
data = r'data/wilco_edgelist_clean.csv'

df = pd.read_csv(data)

df.head()

Unnamed: 0,source,target
0,A.M. (Wilco album),Jeff Tweedy
1,A.M. (Wilco album),Wilco (The Album)
2,A.M. (Wilco album),I Am Trying to Break Your Heart: A Film About ...
3,A.M. (Wilco album),Being There (Wilco album)
4,A.M. (Wilco album),Cruel Country


In [3]:
G = nx.from_pandas_edgelist(df)

In [4]:
print(nx.info(G))

Graph with 41 nodes and 167 edges


# Content Crawling

Today, we're going to use the cleaned Wilco Edgelist, use it to list Wikipedia pages to crawl, crawl the content, and save the data for downstream use. The first thing we need to do is identify pages to crawl.

In [6]:
def crawl_wiki_pages(pages):

    page_data = {}

    for i in range(len(pages)):

        print('running: {}/{}'.format(i+1, len(pages)))
        
        try:

            page = w.page(pages[i])
            title = page.original_title

            page_data[title] = {}
            page_data[title]['categories'] = page.categories
            page_data[title]['content'] = page.content
            page_data[title]['images'] = page.images
            page_data[title]['links'] = page.links
            page_data[title]['web_links'] = sorted(page.references)
            
        except:
            
            # if page not found, continue to next
            continue

    return page_data

In [7]:
pages = sorted(G.nodes)
#pages = pages[0:10]
pages

['A Ghost Is Born',
 'A.M. (Wilco album)',
 'Being There (Wilco album)',
 'Billy Bragg',
 'Billy Bragg discography',
 'Brewing Up with Billy Bragg',
 'Cruel Country',
 'Glenn Kotche',
 'Grant Showbiz',
 'I Am Trying to Break Your Heart: A Film About Wilco',
 'Jay Bennett',
 'Jay Farrar',
 'Jay Farrar discography',
 'Jeff Tweedy',
 'Jeff Tweedy discography',
 'John Stirratt',
 'Ken Coomer',
 'Kicking Television: Live in Chicago',
 "Life's a Riot with Spy vs Spy",
 'Loose Fur',
 'Man in the Sand',
 'Mermaid Avenue',
 'Mermaid Avenue Vol. II',
 'Mermaid Avenue: The Complete Sessions',
 "Mermaid's Avenue",
 'Mike Heidorn',
 'Ode to Joy (Wilco album)',
 'Schmilco',
 'Sky Blue Sky',
 'Son Volt',
 'Star Wars (Wilco album)',
 'Sukierae',
 'Summerteeth',
 'Tweedy',
 'Tweedy (band)',
 'Uncle Tupelo',
 'Wilco',
 'Wilco (The Album)',
 'Wilco discography',
 'Woody Guthrie',
 'Yankee Hotel Foxtrot']

In [8]:
page_data = crawl_wiki_pages(pages)

running: 1/41
running: 2/41
running: 3/41
running: 4/41
running: 5/41
running: 6/41
running: 7/41
running: 8/41
running: 9/41
running: 10/41
running: 11/41
running: 12/41
running: 13/41
running: 14/41
running: 15/41
running: 16/41
running: 17/41
running: 18/41
running: 19/41
running: 20/41
running: 21/41
running: 22/41
running: 23/41
running: 24/41
running: 25/41
running: 26/41
running: 27/41
running: 28/41
running: 29/41
running: 30/41
running: 31/41
running: 32/41
running: 33/41
running: 34/41
running: 35/41
running: 36/41
running: 37/41




  lis = BeautifulSoup(html).find_all('li')


running: 38/41
running: 39/41
running: 40/41
running: 41/41


In [9]:
df = pd.DataFrame(page_data).T.reset_index()

df.columns = ['page', 'categories', 'content', 'images', 'links', 'web_links']

df.head()

Unnamed: 0,page,categories,content,images,links,web_links
0,A Ghost Is Born,"[2004 albums, Album articles lacking alt text ...",A Ghost Is Born is the fifth studio album by A...,[https://upload.wikimedia.org/wikipedia/common...,"[A.M. (Wilco album), ARIA Charts, A Rush of Bl...","[http://acharts.us/album/13436, http://article..."
1,A.M. (Wilco album),"[1995 debut albums, Albums produced by Brian P...",A.M. is the debut studio album by the American...,[https://upload.wikimedia.org/wikipedia/common...,"[A Ghost Is Born, Acoustic guitar, Album, AllM...",[http://articles.chicagotribune.com/1995-04-27...
2,Being There (Wilco album),"[1996 albums, Album articles lacking alt text ...",Being There is the second studio album by the ...,[https://upload.wikimedia.org/wikipedia/common...,"[1001 Albums You Must Hear Before You Die, A.M...",[http://www.chicagorecording.com/chrisshepard/...
3,Billy Bragg discography,"[Articles with short description, Billy Bragg,...",The discography of British singer-songwriter B...,[https://upload.wikimedia.org/wikipedia/common...,"[ARIA, ARIA Charts, A New England, Alternative...","[http://www.billybragg.co.uk/, https://austral..."
4,Brewing Up with Billy Bragg,"[1984 albums, Album articles lacking alt text ...",Brewing Up with Billy Bragg is the second albu...,[https://upload.wikimedia.org/wikipedia/common...,"[A New England, Album, AllMusic, Andrew Collin...",[http://hotpress.com/music/reviews/albums/4418...


In [10]:
df.shape # most pages came through, some were not actual pages (bad links?)

(33, 6)

# Save the Data

I prefer to save to json when not creating an edgelist, but you can change to another format if you like.

In [11]:
outfile = r'data/wikipedia_content_wilco.json'

df.to_json(outfile, orient='records')

In [12]:
df.shape

(33, 6)

In [13]:
pd.read_json(outfile).head(3) # spot check that file wrote successfully

Unnamed: 0,page,categories,content,images,links,web_links
0,A Ghost Is Born,"[2004 albums, Album articles lacking alt text ...",A Ghost Is Born is the fifth studio album by A...,[https://upload.wikimedia.org/wikipedia/common...,"[A.M. (Wilco album), ARIA Charts, A Rush of Bl...","[http://acharts.us/album/13436, http://article..."
1,A.M. (Wilco album),"[1995 debut albums, Albums produced by Brian P...",A.M. is the debut studio album by the American...,[https://upload.wikimedia.org/wikipedia/common...,"[A Ghost Is Born, Acoustic guitar, Album, AllM...",[http://articles.chicagotribune.com/1995-04-27...
2,Being There (Wilco album),"[1996 albums, Album articles lacking alt text ...",Being There is the second studio album by the ...,[https://upload.wikimedia.org/wikipedia/common...,"[1001 Albums You Must Hear Before You Die, A.M...",[http://www.chicagorecording.com/chrisshepard/...
