In [3]:
from bs4 import BeautifulSoup
# from lxml import etree
import pandas as pd
import requests
import re
from tqdm import tqdm
import pickle

In [223]:
site_url = 'https://en.wikipedia.org'
base_url = 'https://en.wikipedia.org/wiki/Category:'

In [248]:
def get_song_pages(year, year_subpage_url):
    song_pages = {}

    url = year_subpage_url

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    song_columns = soup.find_all('div', {'class': 'mw-category-group'})

    for c in song_columns:
        try:
            if c.find('h3').text >= list(song_pages)[-1][0]: # minimize redundant retrieval
                song_urls = c.find_all('a', href=True)
                for u in song_urls:
                    song_pages[u.text] = (f"{site_url}{u['href']}", year)

        except:
            song_urls = c.find_all('a', href=True)
            for u in song_urls:
                song_pages[u.text] = (f"{site_url}{u['href']}", year)

    return song_pages

In [249]:
def get_year_subpages(year_page_url):
    year_subpages = {}
    url = year_page_url

    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    toc = soup.find('div', {'id': 'toc'})

    try:
        subpage_div = toc.find_all('li')[-1]
    except:
        subpage_div = toc.find('div', {'class': 'large-cat-toc-body'})

    subpage_urls = subpage_div.find_all('a', href=True)
    for u in subpage_urls:
        year_subpages[u.text] = u['href']

    return year_subpages

In [250]:
def get_year_pages(first_decade, last_decade):
    pattern = re.compile('\d{4} songs')

    year_pages = {}

    for decade in range(first_decade, last_decade + 10, 10):
        endpoint = f'{decade}s_songs'
        page = requests.get(f'{base_url}{endpoint}')
        soup = BeautifulSoup(page.content, 'html.parser')
        categories = soup.find_all('div', {'class': 'mw-category mw-category-columns'})
        for c in categories:
            urls = c.find_all('a', href=True)
            for u in urls:
                if bool(pattern.match(u.text)):
                    year_pages[u.text.split()[0]] = f"{site_url}{u['href']}"

    return year_pages

In [254]:
songs_for_year = {}
songs_by_year = []

years = get_year_pages(1950,2010)
for year, y_url in tqdm(years.items()): 
    print('Year:',year)
    subpages = get_year_subpages(y_url)
    for subpage, s_url in tqdm(subpages.items()):
        songs = get_song_pages(year, s_url)
        songs_for_year = songs_for_year | songs
    
    song_list = [(k, v[0], v[1]) for k,v in songs_for_year.items()]
    songs_by_year.extend(song_list)

    print(f'Songs Collected: {len(songs_by_year)}')
    
with open('wiki_songs.pickle', 'wb') as handle:
    pickle.dump(songs_by_year, handle)
    
songs_by_year

  0%|          | 0/70 [00:00<?, ?it/s]

Year: 1950


100%|██████████| 26/26 [00:04<00:00,  5.32it/s]
  1%|▏         | 1/70 [00:05<05:51,  5.09s/it]

Songs Collected: 135
Year: 1951


100%|██████████| 26/26 [00:05<00:00,  4.76it/s]
  3%|▎         | 2/70 [00:10<06:10,  5.44s/it]

Songs Collected: 417
Year: 1952


100%|██████████| 26/26 [00:05<00:00,  4.67it/s]
  4%|▍         | 3/70 [00:16<06:14,  5.60s/it]

Songs Collected: 847
Year: 1953


100%|██████████| 26/26 [00:06<00:00,  4.07it/s]
  6%|▌         | 4/70 [00:23<06:36,  6.00s/it]

Songs Collected: 1421
Year: 1954


100%|██████████| 26/26 [00:06<00:00,  3.91it/s]
  7%|▋         | 5/70 [00:30<06:50,  6.32s/it]

Songs Collected: 2146
Year: 1955


100%|██████████| 26/26 [00:06<00:00,  3.94it/s]
  9%|▊         | 6/70 [00:36<06:55,  6.49s/it]

Songs Collected: 3036
Year: 1956


100%|██████████| 26/26 [00:07<00:00,  3.46it/s]
 10%|█         | 7/70 [00:44<07:14,  6.90s/it]

Songs Collected: 4187
Year: 1957


100%|██████████| 26/26 [00:06<00:00,  3.92it/s]
 11%|█▏        | 8/70 [00:51<07:07,  6.90s/it]

Songs Collected: 5597
Year: 1958


100%|██████████| 26/26 [00:07<00:00,  3.67it/s]
 13%|█▎        | 9/70 [00:58<07:08,  7.03s/it]

Songs Collected: 7312
Year: 1959


100%|██████████| 26/26 [00:07<00:00,  3.50it/s]
 14%|█▍        | 10/70 [01:06<07:13,  7.23s/it]

Songs Collected: 9354
Year: 1960


100%|██████████| 26/26 [00:07<00:00,  3.59it/s]
 16%|█▌        | 11/70 [01:14<07:11,  7.31s/it]

Songs Collected: 11702
Year: 1961


100%|██████████| 26/26 [00:06<00:00,  3.87it/s]
 17%|█▋        | 12/70 [01:20<06:57,  7.20s/it]

Songs Collected: 14407
Year: 1962


100%|██████████| 26/26 [00:07<00:00,  3.64it/s]
 19%|█▊        | 13/70 [01:28<06:53,  7.25s/it]

Songs Collected: 17517
Year: 1963


100%|██████████| 26/26 [00:06<00:00,  3.90it/s]
 20%|██        | 14/70 [01:35<06:40,  7.15s/it]

Songs Collected: 21063
Year: 1964


100%|██████████| 26/26 [00:07<00:00,  3.57it/s]
 21%|██▏       | 15/70 [01:42<06:39,  7.26s/it]

Songs Collected: 25146
Year: 1965


100%|██████████| 26/26 [00:06<00:00,  3.85it/s]
 23%|██▎       | 16/70 [01:49<06:27,  7.18s/it]

Songs Collected: 29800
Year: 1966


100%|██████████| 26/26 [00:07<00:00,  3.57it/s]
 24%|██▍       | 17/70 [01:57<06:26,  7.28s/it]

Songs Collected: 35100
Year: 1967


100%|██████████| 26/26 [00:06<00:00,  3.81it/s]
 26%|██▌       | 18/70 [02:04<06:15,  7.21s/it]

Songs Collected: 41213
Year: 1968


100%|██████████| 26/26 [00:07<00:00,  3.64it/s]
 27%|██▋       | 19/70 [02:11<06:10,  7.26s/it]

Songs Collected: 48082
Year: 1969


100%|██████████| 26/26 [00:06<00:00,  3.84it/s]
 29%|██▊       | 20/70 [02:18<05:59,  7.18s/it]

Songs Collected: 55745
Year: 1970


100%|██████████| 26/26 [00:07<00:00,  3.60it/s]
 30%|███       | 21/70 [02:26<05:55,  7.26s/it]

Songs Collected: 64221
Year: 1971


100%|██████████| 26/26 [00:07<00:00,  3.55it/s]
 31%|███▏      | 22/70 [02:33<05:52,  7.35s/it]

Songs Collected: 73484
Year: 1972


100%|██████████| 26/26 [00:07<00:00,  3.29it/s]
 33%|███▎      | 23/70 [02:41<05:57,  7.60s/it]

Songs Collected: 83465
Year: 1973


100%|██████████| 26/26 [00:07<00:00,  3.53it/s]
 34%|███▍      | 24/70 [02:49<05:49,  7.60s/it]

Songs Collected: 94225
Year: 1974


100%|██████████| 26/26 [00:06<00:00,  3.85it/s]
 36%|███▌      | 25/70 [02:56<05:33,  7.42s/it]

Songs Collected: 105812
Year: 1975


100%|██████████| 26/26 [00:06<00:00,  3.73it/s]
 37%|███▋      | 26/70 [03:03<05:23,  7.36s/it]

Songs Collected: 118173
Year: 1976


100%|██████████| 26/26 [00:06<00:00,  3.77it/s]
 39%|███▊      | 27/70 [03:10<05:13,  7.29s/it]

Songs Collected: 131301
Year: 1977


100%|██████████| 26/26 [00:07<00:00,  3.60it/s]
 40%|████      | 28/70 [03:18<05:08,  7.34s/it]

Songs Collected: 145267
Year: 1978


100%|██████████| 26/26 [00:07<00:00,  3.62it/s]
 41%|████▏     | 29/70 [03:25<05:01,  7.36s/it]

Songs Collected: 160098
Year: 1979


100%|██████████| 26/26 [00:06<00:00,  3.77it/s]
 43%|████▎     | 30/70 [03:32<04:51,  7.29s/it]

Songs Collected: 175950
Year: 1980


100%|██████████| 26/26 [00:07<00:00,  3.67it/s]
 44%|████▍     | 31/70 [03:40<04:44,  7.30s/it]

Songs Collected: 192919
Year: 1981


100%|██████████| 26/26 [00:06<00:00,  3.83it/s]
 46%|████▌     | 32/70 [03:47<04:34,  7.22s/it]

Songs Collected: 210911
Year: 1982


100%|██████████| 26/26 [00:07<00:00,  3.58it/s]
 47%|████▋     | 33/70 [03:54<04:30,  7.30s/it]

Songs Collected: 229970
Year: 1983


100%|██████████| 26/26 [00:07<00:00,  3.60it/s]
 49%|████▊     | 34/70 [04:02<04:24,  7.35s/it]

Songs Collected: 250119
Year: 1984


100%|██████████| 26/26 [00:06<00:00,  3.72it/s]
 50%|█████     | 35/70 [04:09<04:15,  7.31s/it]

Songs Collected: 271373
Year: 1985


100%|██████████| 26/26 [00:07<00:00,  3.52it/s]
 51%|█████▏    | 36/70 [04:16<04:12,  7.41s/it]

Songs Collected: 293761
Year: 1986


100%|██████████| 26/26 [00:07<00:00,  3.71it/s]
 53%|█████▎    | 37/70 [04:24<04:02,  7.36s/it]

Songs Collected: 317255
Year: 1987


100%|██████████| 26/26 [00:07<00:00,  3.45it/s]
 54%|█████▍    | 38/70 [04:31<03:59,  7.48s/it]

Songs Collected: 341879
Year: 1988


100%|██████████| 26/26 [00:06<00:00,  3.75it/s]
 56%|█████▌    | 39/70 [04:39<03:48,  7.38s/it]

Songs Collected: 367657
Year: 1989


100%|██████████| 141/141 [00:40<00:00,  3.45it/s]
 57%|█████▋    | 40/70 [05:20<08:44, 17.50s/it]

Songs Collected: 394737
Year: 1990


100%|██████████| 141/141 [00:41<00:00,  3.43it/s]
 59%|█████▊    | 41/70 [06:01<11:55, 24.66s/it]

Songs Collected: 423071
Year: 1991


100%|██████████| 141/141 [00:41<00:00,  3.41it/s]
 60%|██████    | 42/70 [06:43<13:53, 29.76s/it]

Songs Collected: 452638
Year: 1992


100%|██████████| 141/141 [00:43<00:00,  3.23it/s]
 61%|██████▏   | 43/70 [07:27<15:17, 33.99s/it]

Songs Collected: 483516
Year: 1993


100%|██████████| 141/141 [00:42<00:00,  3.36it/s]
 63%|██████▎   | 44/70 [08:09<15:48, 36.48s/it]

Songs Collected: 515833
Year: 1994


100%|██████████| 141/141 [00:41<00:00,  3.39it/s]
 64%|██████▍   | 45/70 [08:51<15:52, 38.10s/it]

Songs Collected: 549618
Year: 1995


100%|██████████| 141/141 [00:41<00:00,  3.37it/s]
 66%|██████▌   | 46/70 [09:33<15:43, 39.30s/it]

Songs Collected: 584899
Year: 1996


100%|██████████| 141/141 [00:40<00:00,  3.45it/s]
 67%|██████▋   | 47/70 [10:14<15:16, 39.86s/it]

Songs Collected: 621778
Year: 1997


100%|██████████| 141/141 [00:41<00:00,  3.44it/s]
 69%|██████▊   | 48/70 [10:55<14:46, 40.28s/it]

Songs Collected: 660191
Year: 1998


100%|██████████| 141/141 [00:41<00:00,  3.43it/s]
 70%|███████   | 49/70 [11:37<14:12, 40.61s/it]

Songs Collected: 700102
Year: 1999


100%|██████████| 141/141 [00:41<00:00,  3.38it/s]
 71%|███████▏  | 50/70 [12:19<13:40, 41.05s/it]

Songs Collected: 741613
Year: 2000


100%|██████████| 141/141 [00:41<00:00,  3.37it/s]
 73%|███████▎  | 51/70 [13:01<13:05, 41.36s/it]

Songs Collected: 784719
Year: 2001


100%|██████████| 141/141 [00:42<00:00,  3.36it/s]
 74%|███████▍  | 52/70 [13:43<12:29, 41.64s/it]

Songs Collected: 829479
Year: 2002


100%|██████████| 141/141 [00:41<00:00,  3.41it/s]
 76%|███████▌  | 53/70 [14:25<11:47, 41.62s/it]

Songs Collected: 875805
Year: 2003


100%|██████████| 141/141 [00:55<00:00,  2.56it/s]
 77%|███████▋  | 54/70 [15:20<12:11, 45.73s/it]

Songs Collected: 923976
Year: 2004


100%|██████████| 141/141 [00:49<00:00,  2.84it/s]
 79%|███████▊  | 55/70 [16:10<11:46, 47.11s/it]

Songs Collected: 974314
Year: 2005


100%|██████████| 141/141 [00:41<00:00,  3.39it/s]
 80%|████████  | 56/70 [16:52<10:37, 45.54s/it]

Songs Collected: 1026906
Year: 2006


100%|██████████| 141/141 [00:42<00:00,  3.30it/s]
 81%|████████▏ | 57/70 [17:36<09:43, 44.89s/it]

Songs Collected: 1081812
Year: 2007


100%|██████████| 141/141 [00:47<00:00,  2.99it/s]
 83%|████████▎ | 58/70 [18:23<09:08, 45.68s/it]

Songs Collected: 1139368
Year: 2008


100%|██████████| 141/141 [01:01<00:00,  2.28it/s]
 84%|████████▍ | 59/70 [19:26<09:17, 50.73s/it]

Songs Collected: 1199532
Year: 2009


100%|██████████| 141/141 [00:45<00:00,  3.07it/s]
 86%|████████▌ | 60/70 [20:12<08:13, 49.36s/it]

Songs Collected: 1262501
Year: 2010


100%|██████████| 141/141 [00:48<00:00,  2.89it/s]
 87%|████████▋ | 61/70 [21:01<07:23, 49.24s/it]

Songs Collected: 1328226
Year: 2011


100%|██████████| 141/141 [00:47<00:00,  2.95it/s]
 89%|████████▊ | 62/70 [21:49<06:30, 48.87s/it]

Songs Collected: 1396347
Year: 2012


100%|██████████| 141/141 [00:43<00:00,  3.22it/s]
 90%|█████████ | 63/70 [22:33<05:32, 47.43s/it]

Songs Collected: 1466936
Year: 2013


100%|██████████| 141/141 [00:43<00:00,  3.21it/s]
 91%|█████████▏| 64/70 [23:17<04:38, 46.44s/it]

Songs Collected: 1540146
Year: 2014


100%|██████████| 141/141 [00:43<00:00,  3.27it/s]
 93%|█████████▎| 65/70 [24:00<03:47, 45.53s/it]

Songs Collected: 1615809
Year: 2015


100%|██████████| 141/141 [00:44<00:00,  3.19it/s]
 94%|█████████▍| 66/70 [24:45<03:00, 45.20s/it]

Songs Collected: 1694368
Year: 2016


100%|██████████| 141/141 [00:51<00:00,  2.76it/s]
 96%|█████████▌| 67/70 [25:36<02:21, 47.03s/it]

Songs Collected: 1775812
Year: 2017


100%|██████████| 141/141 [00:45<00:00,  3.10it/s]
 97%|█████████▋| 68/70 [26:22<01:33, 46.66s/it]

Songs Collected: 1860754
Year: 2018


100%|██████████| 141/141 [00:47<00:00,  2.99it/s]
 99%|█████████▊| 69/70 [27:09<00:46, 46.91s/it]

Songs Collected: 1948506
Year: 2019


100%|██████████| 141/141 [00:53<00:00,  2.61it/s]
100%|██████████| 70/70 [28:04<00:00, 24.06s/it]


Songs Collected: 2038724


[('Accidents Will Happen (Bing Crosby song)',
  'https://en.wikipedia.org/wiki/Accidents_Will_Happen_(Bing_Crosby_song)',
  '1950'),
 ("Adelaide's Lament",
  'https://en.wikipedia.org/wiki/Adelaide%27s_Lament',
  '1950'),
 ('Adoro a mi tierra',
  'https://en.wikipedia.org/wiki/Adoro_a_mi_tierra',
  '1950'),
 ('(Ah, the Apple Trees) When the World Was Young',
  'https://en.wikipedia.org/wiki/(Ah,_the_Apple_Trees)_When_the_World_Was_Young',
  '1950'),
 ('All My Love (Patti Page song)',
  'https://en.wikipedia.org/wiki/All_My_Love_(Patti_Page_song)',
  '1950'),
 ('American Beauty Rose (song)',
  'https://en.wikipedia.org/wiki/American_Beauty_Rose_(song)',
  '1950'),
 ('Anema e core (song)',
  'https://en.wikipedia.org/wiki/Anema_e_core_(song)',
  '1950'),
 ('Anytime, Any Place, Anywhere',
  'https://en.wikipedia.org/wiki/Anytime,_Any_Place,_Anywhere',
  '1950'),
 ("Auf Wiederseh'n, Sweetheart",
  'https://en.wikipedia.org/wiki/Auf_Wiederseh%27n,_Sweetheart',
  '1950'),
 ('Baby, Baby, Baby

In [36]:
pd.set_option('display.max_colwidth', -1)
data = pd.read_csv('decades-MusicBrainz.csv')
data['Clean_Title'] = data['Title'].replace(r'\([^)]*\)', '', regex=True).str.strip()

artist_dict = data.groupby('Title')['Artist'].apply(list).to_dict()

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Title,Artist,Genres,Lyric,Decade,Year,Popularity,Clean_Title
118,"You Don't Love Me (No, No, No)",Beyoncé,Pop; R&B; Black Music,"No no no\nYou don't love me\nAnd I know now\nNo no no\nYou don't love me\nYes, I know now\n'Cause you left me, baby\nAnd I got no place to go now\n\nNo no no\nI'll do anything you say boy\nNo no no\nI'll do anything you say boy\n'Cause if you ask me, baby\nI'll get on my knees and pray boy",1990,1994,205.5,You Don't Love Me


In [100]:
with open('wiki_songs.pickle', 'rb') as f:
    wiki = pickle.load(f)

df = pd.DataFrame(wiki, columns=['Title','URL','Year'])
df['Clean_Title'] = df['Title'].replace(r'\([^)]*\)', '', regex=True).str.strip()

df = df.merge(data, how='inner', on='Clean_Title')
df['URL_Year'] = df.apply(lambda x: (x['URL'], x['Year_x']), axis=1)

url_dict = df[['Clean_Title','URL','Year_x']].drop_duplicates()[['Clean_Title','URL_Year']].groupby('Clean_Title')['URL_Year'].apply(list).to_dict()

AttributeError: 'DataFrameGroupBy' object has no attribute 'groupby'

In [107]:
url_dict = df.drop_duplicates(subset=['Clean_Title','URL','Year_x'])[['Clean_Title','URL_Year']].groupby('Clean_Title')['URL_Year'].apply(list).to_dict()

In [108]:
url_dict['Until I Die']

[('https://en.wikipedia.org/wiki/Until_I_Die', '2007')]

In [109]:
# import re
data[data['Clean_Title']=='Halo']

In [119]:
matches = []

with tqdm(total=len(artist_dict)) as pbar:
    for title, artist in artist_dict.items():
        # print(title)
        urls = url_dict.get(title)
        # print(urls)
        if urls:
            for u, y in urls:
                # print(u)
                page = requests.get(u)
                soup = BeautifulSoup(page.content, 'html.parser') # .find('div', {'id': 'mw-content-text'})
                for a in artist:
                    # print(u)
                    # print('Artist:', a)
                    # print('Trying panel...')
                    if soup.find_all('td', text=re.compile(a)):
                        matches.append((title, a, y))
                        # print('Pane:', title, a, y)
                        break
                    # print('Trying body...')
                    if soup.find_all('p', text=re.compile(a)):
                # if any(r['Artist'] in s for s in soup.strings):
                        matches.append((title, a, y))
                        # print('Body:', title, a, y)
                        break
            
            pbar.update(1)

with open('wiki_matches.pickle', 'wb') as handle:
    pickle.dump(matches, handle)

  5%|▌         | 1530/28941 [55:11<16:28:42,  2.16s/it] 


error: nothing to repeat at position 0

In [94]:
matches

[]

In [None]:
songs_by_year

In [239]:
import pandas as pd
df = pd.DataFrame(songs_by_year, columns=['Year','Song','URL'])
len(df[df['URL']=='1950'])

831

In [253]:
df[df['URL']=='1950'].to_csv('delete_me.csv')