## Webscraping

In [1]:
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

### Scraping Billboard's Top (<=50) Country Songs per Year (on 5-21-YYYY) from 1959-2019

In [2]:
# Getting the URL to request the data from
def change_url(first_year):
    return "https://www.billboard.com/charts/country-songs/"+str(year)+"-05-25"

In [3]:
## Webscraping and Cleaning
songs = ""
for year in range(1959,2020): # iterate through all the years
    url = change_url(year)
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    ranks = len(soup.select("[class~=chart-list-item]"))
    for rank in range(0,ranks):
        current_song = str(soup.select("[class~=chart-list-item]")[rank])
        current_song = current_song.split('>')[0].split("<div class")[1:]
        
        for i in range(0,len(current_song)): # all the different columns/descriptors
            descrip = current_song[i].replace('\"',"")
            descrip = current_song[i].split('data-')[1:]
            # cleaning each aspect of a column
            for element in range(0,len(descrip)):
                no_tags = descrip[element].split('=')[1].replace(",", " ")
                songs += str(no_tags) + ","
            songs = songs.replace('\"',"")
            songs += str(year) + "\n"
    
# add column names to dataset
columns = "artist,data-has-content,rank,song,year\n"   
songs = columns+songs

In [4]:
## Read songs into a long csv
outF = open("billboard_songs_artists_years.csv", "w")
outF.write(songs)
outF.close()

In [3]:
## Make it into a DataFrame to be able to analyze the data
df = pd.read_csv("billboard_songs_artists_years.csv")
df = df.drop(columns=["data-has-content"])

## Remove all of the whitespaces
df['artist'] = df['artist'].str.strip()
df['song'] = df['song'].str.strip()
#df['artist'] = df.apply(lambda x: ' '.join([w for w in  x['artist'].str.split(' ') if ('&' not in w) or (w != 'and') or (w != 'And')]))
df['artist'] = df['artist'].str.split(' ')
df['artist'] = df['artist'].map(lambda x: [w for w in x if (('&' not in w) and  (w != 'and') and (w != 'And'))])
df['artist'] = df['artist'].map(lambda x: ' '.join(x))
df['artist'] = df['artist'].str.split('/')
df['artist'] = df['artist'].map(lambda x: ' '.join(x))
df['song'] = df['song'].str.strip()
df.head(10)

Unnamed: 0,artist,rank,song,year
0,Johnny Horton,1,The Battle Of New Orleans,1959
1,George Jones,2,White Lightning,1959
2,Jim Reeves,3,Home,1959
3,George Morgan,4,I'm In Love Again,1959
4,Ray Price,5,Heartaches By The Number,1959
5,Webb Pierce,6,A Thousand Miles Ago,1959
6,Frankie Miller,7,Black Land Farmer,1959
7,Skeeter Davis,8,Set Him Free,1959
8,Johnny Cash,9,Luther Played The Boogie,1959
9,Johnny Horton,10,When It's Springtime In Alaska (It's Forty Below),1959


In [7]:
## List all of the unique country artists
all_artists = df['artist'].unique()
all_artists[0:50]
# len(all_artists)

array(['Johnny Horton', 'George Jones', 'Jim Reeves', 'George Morgan',
       'Ray Price', 'Webb Pierce', 'Frankie Miller', 'Skeeter Davis',
       'Johnny Cash', 'Carl Belew', 'Margie', 'Ernest Tubb', 'Don Gibson',
       'Wilma Lee Stoney Cooper', 'Wilburn Brothers', 'Hank Thompson',
       'Kitty Wells', 'Faron Young', 'Rose Maddox', 'Eddy Arnold',
       'Jimmy Martin', "James O'Gwynn", 'Bob Gallion', 'Hank Locklin',
       'Buck Owens', 'Stonewall Jackson', 'Jeanne Black', 'Roy Drusky',
       'Marty Robbins', 'Marion Worth', 'The Stanley Brothers',
       'Lonnie Irving', 'Ernest', 'Claude Gray', 'Wynn Stewart',
       'Elvis Presley', 'Sonny James', 'Charlie', 'Freddie Hart',
       'Patsy Cline Jim Reeves', 'Hank Snow', 'Jimmy Newman',
       'Moon Mullican', 'The Louvin Brothers', 'Warren Smith',
       'Porter Wagoner', 'Ray Sanders', 'Reno Smiley', 'Cowboy Copas',
       'Lewis Pruitt'], dtype=object)

### Scraping Genuis API for Song Lyrics

In [8]:
import lyricsgenius
import random
import json
import requests
from fuzzywuzzy import fuzz
auth = 'gieHFE9rIVa4DNDzVPLH6Vcv17A2bLhABjl13Y4zJFkwdcuNRnZgxZWa3sz-DgbX'
#genius = lyricsgenius.Genius(genius_key)
oops = []



In [15]:
# Genius API Requests
def request_song_info(song_title, artist_name):
    base_url = 'https://api.genius.com'
    headers = {'Authorization': 'Bearer ' + auth}
    search_url = base_url + '/search'
    data = {'q': song_title + ' ' + artist_name}
    response = requests.get(search_url, data=data, headers=headers)

    return response

In [16]:
def find_song_artist(song_title, artist_name, op):
    # Search for matches in the request response
    response = request_song_info(song_title, artist_name)
    json = response.json()
    remote_song_info = None
    for hit in json['response']['hits']:
        if (artist_name.lower() in hit['result']['primary_artist']['name'].lower()) or (fuzz.partial_ratio(artist_name.lower(),hit['result']['primary_artist']['name'].lower())>90):
            remote_song_info = hit
            break
    try:
        if remote_song_info:
            song_url = remote_song_info['result']['url']
        page = requests.get(song_url)
        html = BeautifulSoup(page.text, 'html.parser')
        lyrics = html.find('div', class_='lyrics').get_text()
        return lyrics
    except:
        print('exception: {}' +song_title + artist_name)
        return None

In [17]:
# Get the 
df['lyrics'] = df.apply(lambda x: find_song_artist(x['song'],x['artist'],oops), axis=1)
df.head()

exception: {}Poor Old Heartsick MeMargie
exception: {}Come Walk With MeWilma Lee Stoney Cooper
exception: {}Gambler's LoveRose Maddox
exception: {}How Can I Think Of TomorrowJames O'Gwynn
exception: {}You Take The Table And I'll Take The ChairsBob Gallion
exception: {}That's My Kind Of LoveMarion Worth
exception: {}Each Moment ('spent With You)Ernest
exception: {}(Doin' The) Lovers LeapWebb Pierce
exception: {}The Key's In The MailboxFreddie Hart
exception: {}The Hand You're Holding NowSkeeter Davis
exception: {}Everybody's Dying For LoveJimmy Newman
exception: {}Ragged But RightMoon Mullican
exception: {}Odds And Ends (Bits And Pieces)Warren Smith
exception: {}LonelyvilleRay Sanders
exception: {}Teach Me How To LieHank Thompson
exception: {}Don't Let Your Sweet Love DieReno Smiley
exception: {}Flat TopCowboy Copas
exception: {}Crazy BullfrogLewis Pruitt
exception: {}If A Woman Answers (Hang Up The Phone)Leroy Van Dyke
exception: {}I Can Mend Your Broken HeartDon Gibson
exception: {}Al

exception: {}It's Time To Love HerBilly Walker
exception: {}Working Like The Devil (For The Lord)Del Reeves
exception: {}The World Needs A MelodyRed Lane
exception: {}Married To A MemoryArlene
exception: {}A Good ManJune Carter Cash
exception: {}Next Time I Fall In Love (I Won't)Hank Thompson
exception: {}Sunday Morning ChristianHARLAN
exception: {}Charley's PicturePorter Wagoner
exception: {}Make Me Your Kind Of WomanPatti Page
exception: {}Then You Walk InSammi Smith
exception: {}There's Something About A LadyJohnny Duncan
exception: {}The Lonesomest Lonesome/That's What Leaving's AboutRay Price
exception: {}Grandma Harp/Turnin' Off A MemoryMerle Haggard
exception: {}Chantilly Lace/Think About It Darlin'Jerry Lee Lewis
exception: {}Ain't Nothin' Shakin' (But The Leaves On The Trees)'Billy Crash Craddock'
exception: {}KateJohnny Cash The Tennessee Three
exception: {}Send Me Some Lovin'Hank Williams  Jr. Lois Johnson
exception: {}I've Found Someone Of My OwnCal Smith
exception: {}Fools

exception: {}May ITerri Hollowell
exception: {}Isn't It Always LoveLynn Anderson
exception: {}Spare A Little Lovin'(on A Fool)Arnie Rue
exception: {}My Heart/silent Night(after The Fight)Ronnie Milsap
exception: {}I'm Already BlueThe Kendalls
exception: {}Tell Ole I Ain't Here  He Better Get On HomeMoe Bandy Joe Stampley
exception: {}Your Body Is An OutlawMel Tillis
exception: {}Too Old To Play CowboyRazzy Bailey
exception: {}New York Wine And Tennessee ShineDave Rowland Sugar
exception: {}Rodeo EyesZella Lehr
exception: {}The ChampMoe Bandy
exception: {}You're In Love With The Wrong ManMundo Earwood
exception: {}Bar Room BuddiesMerle Haggard Clint Eastwood
exception: {}I Can See Forever Loving YouFoxfire
exception: {}It Don't Hurt To DreamSylvia
exception: {}Shotgun RiderJoe Sun
exception: {}You Fill My LifeJuice Newton
exception: {}J.r.B.J. Wright
exception: {}Morning Comes Too EarlyJim Ed Brown Helen Cornelius
exception: {}It's OverRex Allen Jr.
exception: {}Friends/Anywhere There's

exception: {}Circle Of FriendsDavid Ball
exception: {}It's Your LoveTim McGraw With Faith Hill
exception: {}Why Would I Say GoodbyeBrooks Dunn
exception: {}A Dozen Red RosesTammy Graham
exception: {}Drink  Swear  Steal &amp; LieMichael Peterson
exception: {}Somewhere In LoveJohn Audrey Wiggins
exception: {}Here's Your Sign (Get The Picture)Bill Engvall With Special Guest Travis Tritt
exception: {}Papa BearKeith Harling
exception: {}God Must Have Spent A Little More Time On YouAlabama Featuring 'N Sync
exception: {}I Can't Get Over YouBrooks Dunn
exception: {}South Of Santa FeBrooks Dunn
exception: {}I Hope You DanceLee Ann Womack With Sons Of The Desert
exception: {}Stuck In LoveThe Judds
exception: {}You'll Always Be Loved By MeBrooks Dunn
exception: {}Do I Love You EnoughRicochet
exception: {}Ain't Nothing 'Bout YouBrooks Dunn
exception: {}I'm Gonna Miss Her (The Fishin' Song)Brad Paisley
exception: {}My Heart Is Lost To YouBrooks Dunn
exception: {}Mendocino County LineWillie Nelson 

Unnamed: 0,artist,rank,song,year,lyrics
0,Johnny Horton,1,The Battle Of New Orleans,1959,\n\n[Verse 1]\nIn 1814 we took a little trip\n...
1,George Jones,2,White Lightning,1959,"\n\nWell, in North Carolina, way back in the h..."
2,Jim Reeves,3,Home,1959,"\n\nWell, I've been a traveler most of my life..."
3,George Morgan,4,I'm In Love Again,1959,\n\nNot so long ago I had my poor heart broken...
4,Ray Price,5,Heartaches By The Number,1959,\n\nHeartache number one was when you left me\...


In [8]:
### clean a lyric string
def clean(text):    
    text = text.split('\n') # split string into lines
    text = [i for i in text if i] # drop missing lines
    text = [i for i in text if i[0]!='['] # drop bracketed lines (e.g. producer info, song part indicators)
    text = ' '.join(text) # join back into single string
    text = re.sub(r'[^\s\w]+', '', text) # filter out non-alpha-numeric characters 
    text = text.lower() # strip capitalization
    text = ' '.join([i for i in text.split()]) # drop stopwords
    return text

### Scraping densities of locations of artists' birthtowns