# Data Collection

In [2]:
from bs4 import BeautifulSoup
import requests
from nltk.corpus import words
import csv

#### The details (artist, title) of happy songs are collected by crawling the last.fm website, using the tags 'happy' and 'makes me happy'.

In [3]:
# Crawl result pages 1-20 of each "happy" tag on last.fm and collect every
# distinct (artist, title) pair as a dict in happy_songs.
happy_songs = []
# (artist, title) pairs already stored; a set makes the duplicate check O(1)
# instead of rescanning happy_songs for every table row.
seen_happy = set()
happy_urls = ['http://www.last.fm/tag/happy/tracks?page=', 'http://www.last.fm/tag/makes+me+happy/tracks?page=']
for happy_url in happy_urls:
    for i in range(1, 21):
        url = happy_url + str(i)
        html = requests.get(url)
        if html.status_code != 200:
            # Do not try to parse an error page; report it and move on.
            print('Skipping %s (HTTP %s)' % (url, html.status_code))
            continue
        page = BeautifulSoup(html.text, 'lxml')
        div = page.find('div', 'container page-content')
        main_div = div.find('div', 'col-main') if div is not None else None
        table = main_div.find('tbody') if main_div is not None else None
        if table is None:
            # The expected chart table is missing on this page.
            print('Skipping %s (no track table found)' % url)
            continue

        for row in table.find_all('tr'):
            chartlist_name = row.find('td', 'chartlist-name')
            if chartlist_name is None:
                # Row without a track-name cell: nothing to extract.
                continue
            main_span = chartlist_name.find('span', 'chartlist-ellipsis-wrap')
            artist_span = chartlist_name.find('span', 'chartlist-artists')
            links = main_span.find_all('a') if main_span is not None else []
            if artist_span is None or len(links) < 2:
                continue
            artist = artist_span.get_text().strip()
            # The second link inside the wrapper span is read as the track title.
            title = links[1].get_text().strip()
            key = (artist, title)
            if key not in seen_happy:
                seen_happy.add(key)
                happy_songs.append({'artist': artist, 'title': title})
print(len(happy_songs))

1816


#### The details (artist, title) of sad songs are collected by crawling the last.fm website, using the tags 'sad', 'sad songs' and 'sadness'.

In [4]:
# Crawl result pages 1-20 of each "sad" tag on last.fm and collect every
# distinct (artist, title) pair as a dict in sad_songs.
sad_songs = []
# (artist, title) pairs already stored; a set makes the duplicate check O(1)
# instead of rescanning sad_songs for every table row.
seen_sad = set()
sad_urls = ['http://www.last.fm/tag/sad/tracks?page=', 'http://www.last.fm/tag/sad+songs/tracks?page=', 'http://www.last.fm/tag/sadness/tracks?page=']
for sad_url in sad_urls:
    for i in range(1, 21):
        url = sad_url + str(i)
        html = requests.get(url)
        if html.status_code != 200:
            # Do not try to parse an error page; report it and move on.
            print('Skipping %s (HTTP %s)' % (url, html.status_code))
            continue
        page = BeautifulSoup(html.text, 'lxml')
        div = page.find('div', 'container page-content')
        main_div = div.find('div', 'col-main') if div is not None else None
        table = main_div.find('tbody') if main_div is not None else None
        if table is None:
            # The expected chart table is missing on this page.
            print('Skipping %s (no track table found)' % url)
            continue

        for row in table.find_all('tr'):
            chartlist_name = row.find('td', 'chartlist-name')
            if chartlist_name is None:
                # Row without a track-name cell: nothing to extract.
                continue
            main_span = chartlist_name.find('span', 'chartlist-ellipsis-wrap')
            artist_span = chartlist_name.find('span', 'chartlist-artists')
            links = main_span.find_all('a') if main_span is not None else []
            if artist_span is None or len(links) < 2:
                continue
            artist = artist_span.get_text().strip()
            # The second link inside the wrapper span is read as the track title.
            title = links[1].get_text().strip()
            key = (artist, title)
            if key not in seen_sad:
                seen_sad.add(key)
                sad_songs.append({'artist': artist, 'title': title})
print(len(sad_songs))

2622


### Artists and titles need some preprocessing in order to build the proper URL to get the lyrics

In [3]:
def replace_accent_letter(word):
    """Strip selected punctuation and transliterate a few accented letters.

    The fragments ``'``, ``.``, ``(``, ``)``, ``'& '`` and ``?`` are removed,
    in that order, and then each of ö, ó, í, é, ü, å, ø is replaced by its
    plain ASCII letter.

    Args:
        word: the string to clean.

    Returns:
        The cleaned string.
    """
    # Order matters: '& ' is only removed after the four single characters
    # before it, and '?' only after '& '.
    for fragment in ("'", '.', '(', ')', '& ', '?'):
        word = word.replace(fragment, '')

    transliterations = (
        ('ö', 'o'),
        ('ó', 'o'),
        ('í', 'i'),
        ('é', 'e'),
        ('ü', 'u'),
        ('å', 'a'),
        ('ø', 'o'),
    )
    for accented, plain in transliterations:
        word = word.replace(accented, plain)
    return word

# Hand-maintained table: lower-case artist name -> replacement name.
_ARTIST_ALIASES = {
    'the beatles': 'beatles',
    'the proclaimers': 'proclaimers',
    'the kooks': 'kooks',
    'katrina and the waves': 'katrina the waves',
    'the holloways': 'holloways',
    'sixpence none the richer': 'six pence none the richer',
    'the turtles': 'turtles',
    'the beach boys': 'beach boys',
    'the monkees': 'monkees',
    'the jackson 5': 'jackson 5',
    'florence + the machine': 'florence and the machine',
    "the lovin' spoonful": "lovin' spoonful",
    'karen o and the kids': 'karen-o-the-kids',
    'a-ha': 'aha',
    'the wombats': 'wombats',
    'the pipettes': 'pipettes',
    'the cardigans': 'cardigans',
    'the fratellis': 'fratellis',
    'the rolling stones': 'rolling stones',
    'simon & garfunkel': 'simon and garfunkel',
    "the b-52's": 'b52s',
    "the la's": 'las',
    'the dandy warhols': 'dandy warhols',
    'the flaming lips': 'flaming lips',
    'the temptations': 'temptations',
    'the strokes': 'strokes',
    'the cranberries': 'cranberries',
    'the apples in stereo': 'apples in stereo',
    'the feeling': 'feeling',
    'the smashing pumpkins': 'smashing pumpkins',
    'the darkness': 'darkness',
    'the kinks': 'kinks',
    'the magic numbers': 'magic numbers',
    'the all-american rejects': 'all-american rejects',
    'the rumble strips': 'rumble strips',
    'the decemberists': 'decemberists',
    'the boo radleys': 'boo radleys',
    'the presidents of the united states of america': 'presidents of the united states of america',
    'the little ones': 'little ones',
    'the j. geils band': 'j. geils band',
    'the champs': 'champs',
    'the pointer sisters': 'pointer sisters',
    'cansei de ser sexy': 'css',
    'the cat empire': 'cat empire',
    'daniel boone': 'boone daniel',
    'the velvet underground': 'velvet underground',
    'the pretenders': 'pretenders',
    'the housemartins': 'housemartins',
    'p!nk': 'pink',
    'the who': 'who',
    'the raveonettes': 'raveonettes',
    'the streets': 'streets',
    'the unicorns': 'unicorns',
    'the view': 'view',
}


def check_artist(word):
    """Return the replacement for a known artist name, else the name itself.

    The comparison is an exact, case-sensitive match against lower-case
    keys, so the caller is expected to pass an already lower-cased name.

    Args:
        word: artist name to look up.

    Returns:
        The mapped name when ``word`` is in the alias table, otherwise
        ``word`` unchanged.
    """
    return _ARTIST_ALIASES.get(word, word)

# Hand-maintained table: lower-case song title -> replacement title.
_TITLE_ALIASES = {
    'american boy (radio edit w/ kanye)': 'american boy',
    'the magic position': 'magic position',
    'mmmbop': 'mmm bop',
    'jungle drum': 'jungle drums',
    'all day and all of the night': 'all day all of the night',
    'light and day': 'light day',
    'yeah yeah yeah song': 'the yeah yeah yeah song',
    'the lovecats': 'lovecats',
    'the painter': 'the painter bonus track',
    'the lion sleeps tonight': 'lion sleeps tonight',
}


def check_title(word):
    """Return the replacement for a known song title, else the title itself.

    The comparison is an exact, case-sensitive match against lower-case
    keys, so the caller is expected to pass an already lower-cased title.

    Args:
        word: song title to look up.

    Returns:
        The mapped title when ``word`` is in the alias table, otherwise
        ``word`` unchanged.
    """
    return _TITLE_ALIASES.get(word, word)


### Preprocess the artist and title of every song in the list, then store the resulting lyrics URL in that song's dictionary under the key 'url'

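## N.B. the cell below processes one list at a time: run it once with `sad_songs` and once with `happy_songs`, so that every song has a 'url' before the lyrics are collected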

In [7]:
# Build the metrolyrics URL of every song in the list and store it in the
# song's dict under the key 'url'.
#change sad_songs/happy_songs
# enumerate() supplies the position directly; calling list.index() for each
# song rescanned the whole list on every iteration.
for position, song in enumerate(sad_songs):
    if position % 100 == 0:
        print('Song %s' % position)

    # Artist: lower-case, apply the manual alias table, strip punctuation and
    # accents, then turn spaces into hyphens.
    artist = song['artist'].lower()
    artist = check_artist(artist)
    artist = replace_accent_letter(artist).replace(' ','-')

    # Title: same pipeline, but hyphens and commas are dropped before spaces
    # become hyphens.
    title = song['title'].lower()
    title = check_title(title)
    title = replace_accent_letter(title).replace('-','').replace(',','').replace(' ','-')

    url = 'http://www.metrolyrics.com/'+title+'-lyrics-'+artist+'.html'
    song['url'] = url

Song 0
Song 100
Song 200
Song 300
Song 400
Song 500
Song 600
Song 700
Song 800
Song 900
Song 1000
Song 1100
Song 1200
Song 1300
Song 1400
Song 1500
Song 1600
Song 1700
Song 1800
Song 1900
Song 2000
Song 2100
Song 2200
Song 2300
Song 2400
Song 2500
Song 2600


## Lyrics collection

### The lyrics are collected by crawling the 'metrolyrics' website. Each song's lyrics are stored in a .txt file, and the mapping (artist, title, url, path) is appended to a .csv file

In [9]:
# Download the lyrics of every happy song, write them to happy/song_<n>.txt
# and append one row per saved song to output.csv.
# Every song must already carry a 'url' key, i.e. the URL-building cell has to
# have been run on happy_songs first.
count = 0
# enumerate() supplies the position directly; calling list.index() for each
# song rescanned the whole list on every iteration.
for position, song in enumerate(happy_songs):
    if position % 50 == 0:
        print('Song %s' % position)

    url = song['url']
    data = list(song.values())
    html = requests.get(url)
    if html.status_code != 200:
        # No lyrics page at the guessed URL: skip without parsing the body.
        continue
    page = BeautifulSoup(html.text, 'lxml')
    main_div = page.find('div', 'lyrics-body')
    if main_div is None:
        continue
    div_text = main_div.find('div', {'id' : 'lyrics-body-text'})
    if div_text is None:
        # Lyrics container present but without the text block: nothing to save.
        continue
    par = div_text.find_all('p')
    # Each paragraph is followed by a single space, including the last one.
    txt = ''.join(p.get_text() + " " for p in par)

    count += 1
    filename = "happy/song_"+str(count)+".txt"

    data.append(filename)
    # The context manager closes output.csv even if writing the row fails.
    with open("output.csv",'a') as resultFile:
        wr = csv.writer(resultFile, dialect='excel', lineterminator='\n')
        wr.writerow(data)

    with open(filename, "w") as text_file:
        try:
            text_file.write("%s" % txt)
        except Exception:
            # Best effort: a failed write (e.g. text that cannot be encoded)
            # still leaves a file behind for the later "non-empty" filter.
            # Exception rather than a bare except, so Ctrl-C can still stop
            # the crawl.
            continue


Song 0
Song 50
Song 100
Song 150
Song 200
Song 250
Song 300
Song 350
Song 400
Song 450
Song 500
Song 550
Song 600
Song 650
Song 700
Song 750
Song 800
Song 850
Song 900
Song 950
Song 1000
Song 1050
Song 1100
Song 1150
Song 1200
Song 1250
Song 1300
Song 1350
Song 1400
Song 1450
Song 1500
Song 1550
Song 1600
Song 1650
Song 1700
Song 1750
Song 1800


In [10]:
# Download the lyrics of every sad song, write them to sad/song_<n>.txt
# and append one row per saved song to output.csv.
# Every song must already carry a 'url' key, i.e. the URL-building cell has to
# have been run on sad_songs first.
count = 0
# enumerate() supplies the position directly; calling list.index() for each
# song rescanned the whole list on every iteration.
for position, song in enumerate(sad_songs):
    if position % 50 == 0:
        print('Song %s' % position)

    url = song['url']
    data = list(song.values())
    html = requests.get(url)
    if html.status_code != 200:
        # No lyrics page at the guessed URL: skip without parsing the body.
        continue
    page = BeautifulSoup(html.text, 'lxml')
    main_div = page.find('div', 'lyrics-body')
    if main_div is None:
        continue
    div_text = main_div.find('div', {'id' : 'lyrics-body-text'})
    if div_text is None:
        # Lyrics container present but without the text block: nothing to save.
        continue
    par = div_text.find_all('p')
    # Each paragraph is followed by a single space, including the last one.
    txt = ''.join(p.get_text() + " " for p in par)

    count += 1
    filename = "sad/song_"+str(count)+".txt"

    data.append(filename)
    # The context manager closes output.csv even if writing the row fails.
    with open("output.csv",'a') as resultFile:
        wr = csv.writer(resultFile, dialect='excel', lineterminator='\n')
        wr.writerow(data)

    with open(filename, "w") as text_file:
        try:
            text_file.write("%s" % txt)
        except Exception:
            # Best effort: a failed write (e.g. text that cannot be encoded)
            # still leaves a file behind for the later "non-empty" filter.
            # Exception rather than a bare except, so Ctrl-C can still stop
            # the crawl.
            continue


Song 0
Song 50
Song 100
Song 150
Song 200
Song 250
Song 300
Song 350
Song 400
Song 450
Song 500
Song 550
Song 600
Song 650
Song 700
Song 750
Song 800
Song 850
Song 900
Song 950
Song 1000
Song 1050
Song 1100
Song 1150
Song 1200
Song 1250
Song 1300
Song 1350
Song 1400
Song 1450
Song 1500
Song 1550
Song 1600
Song 1650
Song 1700
Song 1750
Song 1800
Song 1850
Song 1900
Song 1950
Song 2000
Song 2050
Song 2100
Song 2150
Song 2200
Song 2250
Song 2300
Song 2350
Song 2400
Song 2450
Song 2500
Song 2550
Song 2600


### Check whether each .txt file is empty (because the lyrics are not in English or use a different encoding) and copy all non-empty files into a folder that is then used as the corpus directory

In [6]:
import os
import shutil

# Copy every non-empty lyrics file from happy/ and sad/ into dataset/,
# prefixing the file name with its source folder so the two sets cannot
# overwrite each other.
os.makedirs('dataset', exist_ok=True)  # copy2 fails if the target folder is missing
for mood in ('happy', 'sad'):
    for file in os.listdir(mood):
        source_path = os.path.join(mood, file)
        # listdir() also returns sub-directories; only regular files with
        # content are copied.
        if os.path.isfile(source_path) and os.path.getsize(source_path) > 0:
            shutil.copy2(source_path, os.path.join('dataset', mood + '-' + file))