In [3]:
import requests
import re #regular expressions

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import pandas as pd

# Lyrics Text Classification NLP Project

## Project Description

In this Project a Model for text classification on song lyrics will be build.<br>
Following topics will be covered:<br>
Web Scraping will to extract lyrics of two artists from the web.<br>
Apply NLP technics like a Beautiful Soup, Regex, Bag of Words.<br>
Linear Regression and Term Frequency and Inverse Document Frequency to build the model.<br>
The goal of the Model will be to recognize the artist given a piece of text.<br>

### First Artist: PJ Harvey

#### First steps, extracting the artist page

In [6]:
url  = 'http://www.lyrics.com/artist/PJ-Harvey'

In [7]:
response = requests.get(url)

In [8]:
response

<Response [200]>

In [9]:
response.status_code

200

In [10]:
pjh_html = response.text

In [11]:
with open('../data/pj_harvey/pjh.txt', "w") as f:
    f.write(pjh_html)

Using Beautiful Soup to extract a list with the album names

In [12]:
with open('../data/pj_harvey/pjh.txt', "r") as fArtist:
    fContent = fArtist.read()

In [13]:
pjh_soup = BeautifulSoup(fContent, "html.parser")

Show the structure of the beautiful soup:

In [14]:
print(pjh_soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title>
   PJ Harvey Lyrics, Songs and Albums | Lyrics.com
  </title>
  <meta content="PJ Harvey Lyrics - All the great songs and their lyrics from PJ Harvey on Lyrics.com" name="description"/>
  <meta content="PJ Harvey lyrics, PJ Harvey song lyrics, PJ Harvey lyric" name="keywords"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <base href="https://www.lyrics.com/"/>
  <script>
   s4Prefix = 'https://static.stands4.com';
version = '1.3.87';
  </script>
  <!-- Bootstrap compiled and minified CSS -->
  <link href="https://maxcdn.bo

In [15]:
album_labels = pjh_soup.find_all(class_='artist-album-label')

In [16]:
len(album_labels)

132

In [17]:
album_labels

[<h3 class="artist-album-label"><a href="/album/3611504/Women-With-Attitude">Women With Attitude</a> <span class="year">[2017]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3305033/Hope-Six-Demolition-Project-%5BLP%5D">Hope Six Demolition Project [LP]</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3326676/The-Community-of-Hope">The Community of Hope</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3302621/The-Hope-Six-Demolition-Project">The Hope Six Demolition Project</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3337705/The-Orange-Monkey">The Orange Monkey</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3298410/The-Wheel">The Wheel</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3201104/She-Who-Rocks">She Who Rocks</a> <span class="year">[2015]</span></h3>

In [23]:
album_labels[0]

<h3 class="artist-album-label"><a href="/album/3611504/Women-With-Attitude">Women With Attitude</a> <span class="year">[2017]</span></h3>

In [24]:
album_labels[0].a

<a href="/album/3611504/Women-With-Attitude">Women With Attitude</a>

In [29]:
album_labels[0].attrs

{'class': ['artist-album-label']}

In [31]:
print(album_labels[0].string)

None


In [32]:
album_labels[0].a.attrs

{'href': '/album/3611504/Women-With-Attitude'}

In [26]:
album_labels[0].a.string

'Women With Attitude'

In [8]:
'Women With Attitude' in album_labels[0].a

True

Extracting all links

In [36]:
pjh_links = pjh_soup.find_all('a')
pjh_links

[<a href="login.php">Login</a>,
 <a href="https://www.abbreviations.com/">ABBREVIATIONS</a>,
 <a href="https://www.anagrams.net/">ANAGRAMS</a>,
 <a href="https://www.biographies.net/">BIOGRAPHIES</a>,
 <a href="https://www.calculators.net/">CALCULATORS</a>,
 <a href="https://www.convert.net/">CONVERSIONS</a>,
 <a href="https://www.definitions.net/">DEFINITIONS</a>,
 <a href="https://www.grammar.com/">GRAMMAR</a>,
 <a href="https://www.literature.com/">LITERATURE</a>,
 <a href="https://www.lyrics.com/">LYRICS</a>,
 <a href="https://www.phrases.com/">PHRASES</a>,
 <a href="https://www.poetry.com/">POETRY</a>,
 <a href="https://www.quotes.net/">QUOTES</a>,
 <a href="https://www.references.net/">REFERENCES</a>,
 <a href="https://www.rhymes.com/">RHYMES</a>,
 <a href="https://www.scripts.com/">SCRIPTS</a>,
 <a href="https://www.symbols.com/">SYMBOLS</a>,
 <a href="https://www.synonyms.com/">SYNONYMS</a>,
 <a href="https://www.uszip.com/">USZIP</a>,
 <a href="/artists/0">#</a>,
 <a href="/ar

Link elements to "Let England Shake" and "The Last Living Rose"

In [None]:
<a href="/lyric/23407951/PJ+Harvey/The+Last+Living+Rose">The Last Living Rose</a>

In [None]:
<a href="/lyric/23407945/PJ+Harvey/In+The+Dark+Places">In The Dark Places</a>

Let's try to get The Last Living Rose

In [12]:
url = ('http://www.lyrics.com/lyric/23407951/PJ+Harvey/The+Last+Living+Rose')

In [13]:
response = requests.get(url)

In [14]:
response

<Response [200]>

In [15]:
last_liv_html = response.text

In [16]:
with open('last_liv.txt', "w") as f:
    f.write(last_liv_html)

1.1 From the first Generalization: 12 songs out of Let England Shake

To the extraction of 5 albums with help of a dictionary

Select Artist and read html into fContent

In [17]:
with open('pj_harvey/pjh.txt', 'r') as fArtist:
    fContent = fArtist.read()

Call Artist dictionary

In [18]:
dict_pjh = {'folder_name':['hope_six_demolition_project','let_england_shake','stories_from_the_city','rid_of_me','dry'],
                          'album_id':['338219','234079','42436','273097','272648']}

In [19]:
dict_pjh['album_id']

['338219', '234079', '42436', '273097', '272648']

In [20]:
dict_pjh['folder_name']

['hope_six_demolition_project',
 'let_england_shake',
 'stories_from_the_city',
 'rid_of_me',
 'dry']

In [21]:
len(dict_pjh['album_id'])

5

In [22]:
#Album: All 5
#as a function it needs: dict{'album_id','folder_name'}, artist name for song link extraction

file_name_list = []

for n in range(0,len(dict_pjh['album_id'])):
    folder_name = dict_pjh['folder_name'][n]
    album_id = dict_pjh['album_id'][n]   
    
    #flist is an array with the song link in 1 col and song name in col 2    
    flist = re.findall(fr'<a href="(/lyric/{album_id}\d\d/PJ\+Harvey/[^"]+)">([^<]+)</a>', fContent) 
    #alternative <a href="/lyric/234079\d{2}/PJ\+Harvey/[\w+\+]{1,50}">[\w\s]{1,50}</a>
    
    for i in range(0,len(flist)) :
        song_name = flist [i][1]    
        file_name = song_name.lower().replace(' ','_')
        path = folder_name+'/'+file_name
        file_name_list.append(path)

        #print(song_name)
        #print(file_name)

        url = (f'http://www.lyrics.com{flist[i][0]}')
        #print(url)
        #response = requests.get(url)
        #song_html = response.text
        #with open (f'pj_harvey/{folder_name}/{file_name}.txt', 'w' ) as fSong :
            #fSong.write(song_html)

Beautiful Soup: extracting the lyrics out of song html as saved in a .txt

Let's read In The Dark Places

In [23]:
with open('pj_harvey/let_england_shake/in_the_dark_places.txt') as fSong :
    fContent = fSong.read()
    song_soup = BeautifulSoup(fContent, "html.parser")
    lyrics = (song_soup.pre.text)    
    print (lyrics)

We got up early,

Washed our faces,

Walked the fields

Put up crosses.

Passed through

The damned mountains,

Went hellwards,

And some of us returned,

And some of us did not.



In the fields and in the forests,

Under the moon and under the sun

Another summer has passed before us,

And not one man has,

And not one woman has 

Revealed the secrets of this world.



So our young men hid

With guns, in the dirt

And in the dark places.

Our young men hid with guns, 

In the dirt in the dark

Our young men hid

With guns, in the forest

And in the dark places.



And not one man has,

And not one woman has 

Revealed the secrets of this world.

And in the dark places.


Generalization: Loop extracting songs of a list with the names of the song files:

With help of the dictionary containing the folder names

In [24]:
file_name_list # alternative: with os.listdir or os.walk

['hope_six_demolition_project/the_community_of_hope',
 'hope_six_demolition_project/the_ministry_of_defence',
 'hope_six_demolition_project/a_line_in_the_sand',
 'hope_six_demolition_project/chain_of_keys',
 'hope_six_demolition_project/river_anacostia',
 'hope_six_demolition_project/near_the_memorials_to_vietnam_and_lincoln',
 'hope_six_demolition_project/the_orange_monkey',
 'hope_six_demolition_project/medicinals',
 'hope_six_demolition_project/the_ministry_of_social_affairs',
 'hope_six_demolition_project/the_wheel',
 'hope_six_demolition_project/dollar_dollar',
 'let_england_shake/let_england_shake',
 'let_england_shake/the_last_living_rose',
 'let_england_shake/the_glorious_land',
 'let_england_shake/the_words_that_maketh_murder',
 'let_england_shake/all_and_everyone',
 'let_england_shake/on_battleship_hill',
 'let_england_shake/england',
 'let_england_shake/in_the_dark_places',
 'let_england_shake/bitter_branches',
 'let_england_shake/hanging_in_the_wire',
 'let_england_shake/th

In [25]:
lyrics_list = []
for file_name in file_name_list:
    print(file_name)
    with open(f'pj_harvey/{file_name}.txt') as fSong :
        fContent = fSong.read()

    song_soup = BeautifulSoup(fContent, "html.parser")
    try :
        lyrics = (song_soup.pre.text)   
    except :
        continue
    #lyrics_tidy = re.sub(r'\\n',' ', lyrics)
    lyrics_tidy = lyrics.replace ('\n',' ')
    lyrics_list.append (lyrics_tidy)

hope_six_demolition_project/the_community_of_hope
hope_six_demolition_project/the_ministry_of_defence
hope_six_demolition_project/a_line_in_the_sand
hope_six_demolition_project/chain_of_keys
hope_six_demolition_project/river_anacostia
hope_six_demolition_project/near_the_memorials_to_vietnam_and_lincoln
hope_six_demolition_project/the_orange_monkey
hope_six_demolition_project/medicinals
hope_six_demolition_project/the_ministry_of_social_affairs
hope_six_demolition_project/the_wheel
hope_six_demolition_project/dollar_dollar
let_england_shake/let_england_shake
let_england_shake/the_last_living_rose
let_england_shake/the_glorious_land
let_england_shake/the_words_that_maketh_murder
let_england_shake/all_and_everyone
let_england_shake/on_battleship_hill
let_england_shake/england
let_england_shake/in_the_dark_places
let_england_shake/bitter_branches
let_england_shake/hanging_in_the_wire
let_england_shake/the_colour_of_the_earth
let_england_shake/written_on_the_forehead
stories_from_the_city/

In [26]:
for elem in lyrics_list :
    print (elem, '\n')

Here's the hope six demolition project  Stretching down to Benning Road  A well-known "pathway of death"  At least that's what I'm told  And here's the one sit-down restaurant  In Ward seven, nice  Okay, now this is just Drug-Town, just zombies  But that's just life    In the community of hope  The community of hope  The community of hope  The community of hope, hope, hope, hope    Here's the highway to death and destruction  South capitol is its name  And the school just looks like shit-hole  Does that look like a nice place?  Here's the old mental institution  Now the homeland security base  And here's god's deliverance center  A deli called M.L.K    And the community of hope  The community of hope  The community of hope  The community of hope, hope, hope, hope    They're gonna put a Walmart here  They're gonna put a Walmart here  They're gonna put a Walmart here  They're gonna put a Walmart here  They're gonna put a Walmart here  They're gonna put a Walmart here  They're gonna put a

In [27]:
len(lyrics_list)

59

In [28]:
lyrics_list_pjh = lyrics_list

In [29]:
lyrics_list_pjh

['Here\'s the hope six demolition project  Stretching down to Benning Road  A well-known "pathway of death"  At least that\'s what I\'m told  And here\'s the one sit-down restaurant  In Ward seven, nice  Okay, now this is just Drug-Town, just zombies  But that\'s just life    In the community of hope  The community of hope  The community of hope  The community of hope, hope, hope, hope    Here\'s the highway to death and destruction  South capitol is its name  And the school just looks like shit-hole  Does that look like a nice place?  Here\'s the old mental institution  Now the homeland security base  And here\'s god\'s deliverance center  A deli called M.L.K    And the community of hope  The community of hope  The community of hope  The community of hope, hope, hope, hope    They\'re gonna put a Walmart here  They\'re gonna put a Walmart here  They\'re gonna put a Walmart here  They\'re gonna put a Walmart here  They\'re gonna put a Walmart here  They\'re gonna put a Walmart here  Th

In [30]:
file_name_list_pjh = file_name_list

#### Second Artist: Portishead

and now let's do the same with a second artist,
working with beautiful soup from the beginning

First: Create a .txt file with the html of page cointaing the links to the song lyrics

In [22]:
url = 'https://www.lyrics.com/artist/Portishead/45223'

In [23]:
response = requests.get(url)

In [24]:
response

<Response [200]>

In [25]:
portishead_html = response.text

In [26]:
with open('portishead/portishead.txt', "w") as f:
    f.write(portishead_html)

now let's try to find the links to the album or links to the songs

Artist: Portishead, Album: Dummy

Extraction of the songs links with Beautiful Soup:

In [27]:
with open('portishead/portishead.txt', 'r') as fArtist:
    fContent = fArtist.read()

In [28]:
class_="album-artist-label"

In [29]:
fContent

'\n<!doctype html>\n<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--><head>\n<meta charset="utf-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n<title>Portishead Lyrics, Songs and Albums | Lyrics.com</title>\n<meta name="description" content="Portishead Lyrics - All the great songs and their lyrics from Portishead on Lyrics.com">\n<meta name="keywords" content="Portishead lyrics, Portishead song lyrics, Portishead lyric">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">\n<base href="https://www.lyrics.com/">\n\n<script>\ns4Prefix = \'https://static.stands4.com\';\nversion = \'1.3.87\';\n</script>\n\n<!-- Bootstrap compiled and minified CSS -->\n<link rel="stylesheet" href="https:/

In [30]:
portishead_soup = BeautifulSoup(fContent, 'html.parser')
portishead_soup.find_all('h3')

#flist = re.findall('<a href="(/lyric/234079\d{2}/PJ\+Harvey/[^"]+)">([^<]+)</a>', fContent)

[<h3><a href="artist/Portishead/45223">Famous lyrics by »</a></h3>,
 <h3>Albums by <strong>Portishead</strong><span class="sort"><strong>Sort:</strong><a class="rc3 s" href="javascript:void(0);">By Album</a><a class="rc3" href="artist.php?name=Portishead&amp;aid=45223&amp;o=1">A - Z</a></span></h3>,
 <h3 class="artist-album-label"><a href="/album/3755060/Chill-Out-%5BUniversal%5D">Chill Out [Universal]</a> <span class="year">[2018]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3200130/Bristol">Bristol</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3347468/Urgent-Turquoise">Urgent Turquoise</a> <span class="year">[2015]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3094283/BBC-Radio-6-Music%E2%80%99s-Alternative-Jukebox">BBC Radio 6 Music’s Alternative Jukebox</a> <span class="year">[2014]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3090040/Wild-%5BOriginal-Motion-Picture-Soundtrack%5D">Wild [Orig

Extraction of the songs links with RegEx

For all albums with help of a dictionary

In [31]:
dict_portishead = {'folder_name':['dummy','portishead','third'],
                          'album_id':['272989','336353','276726']}

In [32]:
#Album: All 3
#as a function it needs: dict{'album_id','folder_name'}, artist name for song link extraction,
#fcontent created by reading the artist .txt file

file_name_list = []

for n in range(0,len(dict_portishead['album_id'])):
    folder_name = dict_portishead['folder_name'][n]
    album_id = dict_portishead['album_id'][n]   
    #flist is an array with the song link in 1 col and song name in col 2
    
    flist = re.findall(fr'<a href="(/lyric/{album_id}\d\d/Portishead/[^"]+)">([^<]+)</a>', fContent) 
    #alternative <a href="/lyric/234079\d{2}/PJ\+Harvey/[\w+\+]{1,50}">[\w\s]{1,50}</a>
    for i in range(0,len(flist)) :
        song_name = flist [i][1]    
        file_name = song_name.lower().replace(' ','_')
        path = folder_name+'/'+file_name
        file_name_list.append(path)

        url = (f'http://www.lyrics.com{flist[i][0]}')
        #print(url)
        #response = requests.get(url)
        #song_html = response.text
        #with open (f'portishead/{folder_name}/{file_name}.txt', 'w' ) as fSong :
            #fSong.write(song_html)

In [33]:
file_name_list

['dummy/mysterons',
 'dummy/sour_times',
 'dummy/strangers',
 'dummy/it_could_be_sweet',
 'dummy/wandering_star',
 "dummy/it's_a_fire",
 'dummy/numb',
 'dummy/roads',
 'dummy/pedestal',
 'dummy/biscuit',
 'dummy/glory_box',
 'portishead/cowboys',
 'portishead/undenied',
 'portishead/half_day_closing',
 'portishead/over',
 'portishead/humming',
 'portishead/mourning_air',
 'portishead/only_you',
 'portishead/elysium',
 'portishead/western_eyes',
 'third/silence',
 'third/hunter',
 'third/nylon_smile',
 'third/the_rip',
 'third/plastic',
 'third/we_carry_on',
 'third/deep_water',
 'third/machine_gun',
 'third/small',
 'third/magic_doors',
 'third/threads']

Beautiful Soup

Let's read Wandering Star off the album Dummy

In [34]:
with open ('portishead/dummy/wandering_star.txt') as fSong :
    fContent = fSong.read()
    song_soup = BeautifulSoup(fContent, "html.parser")
    lyrics = (song_soup.pre.text)    
    print (lyrics)

Please could you stay awhile to share my grief
For it's such a lovely day
To have to always feel this way
And the time that I will suffer less
Is when I never have to wake

Wandering stars, for whom it is reserved
The blackness of darkness forever
Wandering stars, for whom it is reserved
The blackness of darkness forever

Those who have seen the needles eye, now tread
Like a husk, from which all that was, now has fled
And the masks, that the monsters wear
To feed, upon their prey

Wandering stars, for whom it is reserved
The blackness of darkness forever
Wandering stars, for whom it is reserved
The blackness of darkness forever

Doubled up inside
Take a while to shed my grief
Always doubled up inside
Taunted, cruel

Wandering stars, for whom it is reserved
The blackness of darkness forever
Wandering stars, for whom it is reserved
The blackness of darkness forever


And now the all 3 Portishead albums:

In [35]:
lyrics_list = []
for file_name in file_name_list:
    with open(f'portishead/{file_name}.txt') as fSong :
        fContent = fSong.read()
    song_soup = BeautifulSoup(fContent, "html.parser")
    
    print(file_name)
    try :
        lyrics = (song_soup.pre.text) 
        #lyrics_tidy = re.sub(r'\\n',' ', lyrics)
        lyrics_tidy = lyrics.replace ('\n',' ')
        lyrics_list.append(lyrics_tidy)
    except :
        continue

dummy/mysterons
dummy/sour_times
dummy/strangers
dummy/it_could_be_sweet
dummy/wandering_star
dummy/it's_a_fire
dummy/numb
dummy/roads
dummy/pedestal
dummy/biscuit
dummy/glory_box
portishead/cowboys
portishead/undenied
portishead/half_day_closing
portishead/over
portishead/humming
portishead/mourning_air
portishead/only_you
portishead/elysium
portishead/western_eyes
third/silence
third/hunter
third/nylon_smile
third/the_rip
third/plastic
third/we_carry_on
third/deep_water
third/machine_gun
third/small
third/magic_doors
third/threads


In [36]:
print(lyrics_list);



In [37]:
len(lyrics_list)

31

In [38]:
lyrics_list_portishead = lyrics_list

In [39]:
file_name_list_portishead = file_name_list

In [40]:
for elem in lyrics_list_portishead :
    print(elem, '\n')

Inside your pretending Crimes have been swept aside Somewhere where they can forget  Divine upper reaches Still holding on This ocean will not be grasped  All for nothing Did you really want Did you really want Did you really want Did you really want  Refuse to surrender Strung out until ripped apart Who dares, who dares to condemn  All for nothing Did you really want Did you really want Did you really want Did you really want 

To pretend no one can find The fallacies of morning rose Forbidden fruit, hidden eyes Curtises that I despise in me Take a ride, take a shot now 'Cause nobody loves me It's true Not like you do  Covered by the blind belief That fantasies of sinful screens Bear the facts, assume the dye End the vows no need to lie, enjoy Take a ride, take a shot now 'Cause nobody loves me It's true Not like you do  Who am I, what and why 'Cause all I have left Is my memories of yesterday Oh these sour times 'Cause nobody loves me It's true Not like you do  After time the bitter 

#### Beth Gibbons Portishead

In [69]:
url = 'https://www.lyrics.com/artist/Beth-Gibbons-%26-Rustin-Man/2138007541'

In [70]:
response = requests.get(url)

In [71]:
response

<Response [200]>

In [72]:
beth_gibbons_html = response.text

In [73]:
with open('../data/data_re/beth_gibbons/beth_gibbons.txt', "w") as f:
    f.write(beth_gibbons_html)

In [74]:
with open('../data/data_re/beth_gibbons/beth_gibbons.txt', 'r') as fArtist:
    fContent = fArtist.read()

In [75]:
fContent

'\n<!doctype html>\n<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--><head>\n<meta charset="utf-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n<title>Beth Gibbons & Rustin Man Lyrics, Songs and Albums | Lyrics.com</title>\n<meta name="description" content="Beth Gibbons &amp; Rustin Man Lyrics - All the great songs and their lyrics from Beth Gibbons &amp; Rustin Man on Lyrics.com">\n<meta name="keywords" content="Beth Gibbons &amp; Rustin Man lyrics, Beth Gibbons &amp; Rustin Man song lyrics, Beth Gibbons &amp; Rustin Man lyric">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">\n<base href="https://www.lyrics.com/">\n\n<script>\ns4Prefix = \'https://static.stands4.com\';\nversion 

In [76]:
flist = re.findall('<a href="(/lyric-lf/3813\d+/Beth\+Gibbons\+%26\+Rustin\+Man/[^"]+)">([^<]+)</a>', fContent) 
file_name_list=[]
for i in range(0,len(flist)) :
    song_name = flist [i][1]    
    file_name = song_name.lower().replace(' ','_')
    path = 'out_of_season/'+file_name
    file_name_list.append(path)

    url = (f'http://www.lyrics.com{flist[i][0]}')
    print(url)
    response = requests.get(url)
    song_html = response.text
    with open (f'../data/data_re/beth_gibbons/out_of_season/{file_name}.txt', 'w' ) as fSong :
        fSong.write(song_html)

http://www.lyrics.com/lyric-lf/3813701/Beth+Gibbons+%26+Rustin+Man/Tom+The+Model
http://www.lyrics.com/lyric-lf/3813695/Beth+Gibbons+%26+Rustin+Man/Resolve
http://www.lyrics.com/lyric-lf/3813692/Beth+Gibbons+%26+Rustin+Man/Drake
http://www.lyrics.com/lyric-lf/3813700/Beth+Gibbons+%26+Rustin+Man/Spider+Monkey
http://www.lyrics.com/lyric-lf/3813693/Beth+Gibbons+%26+Rustin+Man/Funny+Time+Of+Year
http://www.lyrics.com/lyric-lf/3813696/Beth+Gibbons+%26+Rustin+Man/Romance
http://www.lyrics.com/lyric-lf/3813694/Beth+Gibbons+%26+Rustin+Man/Mysteries
http://www.lyrics.com/lyric-lf/3813698/Beth+Gibbons+%26+Rustin+Man/Sand+River


In [77]:
flist

[('/lyric-lf/3813701/Beth+Gibbons+%26+Rustin+Man/Tom+The+Model',
  'Tom The Model'),
 ('/lyric-lf/3813695/Beth+Gibbons+%26+Rustin+Man/Resolve', 'Resolve'),
 ('/lyric-lf/3813692/Beth+Gibbons+%26+Rustin+Man/Drake', 'Drake'),
 ('/lyric-lf/3813700/Beth+Gibbons+%26+Rustin+Man/Spider+Monkey',
  'Spider Monkey'),
 ('/lyric-lf/3813693/Beth+Gibbons+%26+Rustin+Man/Funny+Time+Of+Year',
  'Funny Time Of Year'),
 ('/lyric-lf/3813696/Beth+Gibbons+%26+Rustin+Man/Romance', 'Romance'),
 ('/lyric-lf/3813694/Beth+Gibbons+%26+Rustin+Man/Mysteries', 'Mysteries'),
 ('/lyric-lf/3813698/Beth+Gibbons+%26+Rustin+Man/Sand+River', 'Sand River')]

In [78]:
file_name_list

['out_of_season/tom_the_model',
 'out_of_season/resolve',
 'out_of_season/drake',
 'out_of_season/spider_monkey',
 'out_of_season/funny_time_of_year',
 'out_of_season/romance',
 'out_of_season/mysteries',
 'out_of_season/sand_river']

In [79]:
lyrics_list = []
for file_name in file_name_list:
    with open(f'../data/data_re/beth_gibbons/{file_name}.txt') as fSong:
        fContent = fSong.read()
    song_soup = BeautifulSoup(fContent, "html.parser")

    print(file_name)
    lyrics = (song_soup.pre.text) 
    #lyrics_tidy = re.sub(r'\\n',' ', lyrics)
    lyrics_tidy = lyrics.replace ('\n',' ')
    lyrics_list.append(lyrics_tidy)

out_of_season/tom_the_model
out_of_season/resolve
out_of_season/drake
out_of_season/spider_monkey
out_of_season/funny_time_of_year
out_of_season/romance
out_of_season/mysteries
out_of_season/sand_river


In [81]:
len (lyrics_list)

8

In [82]:
lyrics_list_beth = lyrics_list
file_name_list_beth = file_name_list

#### Bags of Words and Naive Bayes

CountVectorizer to transform Corpus into matrix

Corpus should be a list where each element is a song, called document

In [300]:
#the corpus is just a list where each element is a song
#here the songs are sorted by two considered artists
corpus = lyrics_list_pjh[:39] + lyrics_list_portishead[:31] + lyrics_list_beth

vectorizer = CountVectorizer(stop_words='english') #instantiate the model
X_sp = vectorizer.fit_transform(corpus) #vectorization generates a sparse matrix

In [301]:
len(corpus)

78

Normalization with Tf-Idf

In [302]:
tf = TfidfTransformer() #instantiate the model
transformed = tf.fit_transform(X_sp)
transformed

<78x1504 sparse matrix of type '<class 'numpy.float64'>'
	with 2854 stored elements in Compressed Sparse Row format>

Now we can convert the normalized sparse matrix into the complete matrix and then into a data frame to apply a regression model on it.

In [303]:
#from list to corpus to sparse matrix to normalized matrix to dense matrix to df
tdf = pd.DataFrame(transformed.todense(), columns=vectorizer.get_feature_names())
tdf.head(2).round(2)

Unnamed: 0,000,abandoned,absurd,accuse,aching,acres,acting,adore,adorn,adulation,...,yards,yeah,year,years,yes,yesterday,york,young,zombies,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Classification model

In [304]:
X = tdf.values #from df to numpy array
y = ['PJ Harvey'] * 39 + ['Beth Gibbons'] * 39

In [305]:
x_train, x_test, y_train, y_test = train_test_split(X,y)

In [306]:
m = LogisticRegression()
m.fit(X, y)
m.score(x_test, y_test), m.score(X, y)

(1.0, 1.0)

In [307]:
m.predict(X)

array(['PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbo

In [308]:
m.predict_proba(X)#.round(2)

array([[0.37680429, 0.62319571],
       [0.35802967, 0.64197033],
       [0.36346546, 0.63653454],
       [0.36332249, 0.63667751],
       [0.36543735, 0.63456265],
       [0.36983243, 0.63016757],
       [0.373889  , 0.626111  ],
       [0.36481327, 0.63518673],
       [0.34394155, 0.65605845],
       [0.37416005, 0.62583995],
       [0.37596441, 0.62403559],
       [0.35353853, 0.64646147],
       [0.36257051, 0.63742949],
       [0.32385561, 0.67614439],
       [0.40099707, 0.59900293],
       [0.35103031, 0.64896969],
       [0.37545458, 0.62454542],
       [0.36980027, 0.63019973],
       [0.36089326, 0.63910674],
       [0.37333164, 0.62666836],
       [0.36244533, 0.63755467],
       [0.36552592, 0.63447408],
       [0.36740784, 0.63259216],
       [0.40031178, 0.59968822],
       [0.43627308, 0.56372692],
       [0.40591257, 0.59408743],
       [0.42135672, 0.57864328],
       [0.35588044, 0.64411956],
       [0.34413388, 0.65586612],
       [0.38437543, 0.61562457],
       [0.

In [309]:
m.classes_

array(['Beth Gibbons', 'PJ Harvey'], dtype='<U12')

Predictions, new songs, who said that?

In [310]:
songs = ["I can't wait for the night to come nothing else but waves of love",
         "Give me a reason",
         "for I am guilty for the voice that I obey",
         "Inside your pretending Crimes have been swept aside Somewhere where they can forget  Divine upper reaches Still holding on This ocean will not be grasped  All for nothing Did you really want Did you really want Did you really want Did you really want  Refuse to surrender Strung out until ripped apart Who dares, who dares to condemn  All for nothing Did you really want Did you really want Did you really want Did you really want"]

In [311]:
counts = vectorizer.transform(songs)
tfcounts = tf.transform(counts)

In [312]:
df_tf_counts = pd.DataFrame(tfcounts.todense(), columns=vectorizer.get_feature_names())

In [313]:
df_tf_counts

Unnamed: 0,000,abandoned,absurd,accuse,aching,acres,acting,adore,adorn,adulation,...,yards,yeah,year,years,yes,yesterday,york,young,zombies,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [314]:
m.predict(df_tf_counts)

array(['PJ Harvey', 'Beth Gibbons', 'PJ Harvey', 'Beth Gibbons'],
      dtype='<U12')

In [315]:
m.predict_proba(df_tf_counts)

array([[0.43822064, 0.56177936],
       [0.59446002, 0.40553998],
       [0.49379531, 0.50620469],
       [0.57391   , 0.42609   ]])

Without obtaining the complete matrix and converting to data frame:

In [283]:
m.predict(tfcounts)

array(['PJ Harvey', 'Beth Gibbons', 'PJ Harvey', 'Beth Gibbons'],
      dtype='<U12')

In [284]:
m.predict_proba(tfcounts)

array([[0.46591187, 0.53408813],
       [0.60311711, 0.39688289],
       [0.49625213, 0.50374787],
       [0.59697615, 0.40302385]])

accuracy: positive or negative
recall score 
precision 
f1-score: prec*recall/(prec+recall)

confussion matrix

Naive Bayes

Decision Tree

In [None]:
Random Forest

In [None]:
rf = RandomForest
rf = m_estimators, max_depth


Class Imbalance

In [None]:
random undersample
NearMiss1, 2
fit_resample(X_train, y_train)

random oversample