In [44]:
import pandas as pd
import os

import requests
import re #regular expressions

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Text Classification Project with PJ Harvey

## Project Description

In this Project a Model for text classification on song lyrics will be build.<br>
Web Scraping lyrics of two artists.<br>
Apply visualization and NLP technics like a Beautiful Soup, Regex, wordcloud, Bag of Words.<br>
Naive Bayes and Term Frequency and Inverse Document Frequency to build the model.<br>
The goal of the Model will be to recognize the artist given a piece of text.<br>

### First Artist: PJ Harvey

### First steps, extract the artist page

In [72]:
base_url = 'www.lyrics.com'

In [4]:
url  = 'http://www.lyrics.com/artist/PJ-Harvey'

In [5]:
response = requests.get(url)

In [6]:
response

<Response [200]>

In [7]:
response.status_code

200

In [8]:
pjh_html = response.text

In [10]:
with open('../data/pj_harvey/pjh.txt', "w") as f:
    f.write(pjh_html)

In [11]:
with open('../data/pj_harvey/pjh.txt', "r") as fArtist:
    fContent = fArtist.read()

### Define a Beautiful Soup of the Artist Page

In [12]:
pjh_soup = BeautifulSoup(fContent, "html.parser")

In [13]:
print(pjh_soup.prettify())

<!DOCTYPE html>
<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->
<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->
<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->
<!--[if gt IE 8]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
  <title>
   PJ Harvey Lyrics, Songs and Albums | Lyrics.com
  </title>
  <meta content="PJ Harvey Lyrics - All the great songs and their lyrics from PJ Harvey on Lyrics.com" name="description"/>
  <meta content="PJ Harvey lyrics, PJ Harvey song lyrics, PJ Harvey lyric" name="keywords"/>
  <meta content="width=device-width, initial-scale=1, maximum-scale=1" name="viewport"/>
  <base href="https://www.lyrics.com/"/>
  <script>
   s4Prefix = 'https://static.stands4.com';
version = '1.3.87';
  </script>
  <!-- Bootstrap compiled and minified CSS -->
  <link href="https://maxcdn.bo

### Extract the links to the particular albums


These links are always inside an \<h3> tag with the attribute class="artist-album-label".

In [14]:
album_labels = pjh_soup.find_all(class_='artist-album-label')

In [15]:
len(album_labels)

132

List of all elements with album links:

In [16]:
album_labels

[<h3 class="artist-album-label"><a href="/album/3611504/Women-With-Attitude">Women With Attitude</a> <span class="year">[2017]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3305033/Hope-Six-Demolition-Project-%5BLP%5D">Hope Six Demolition Project [LP]</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3326676/The-Community-of-Hope">The Community of Hope</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3302621/The-Hope-Six-Demolition-Project">The Hope Six Demolition Project</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3337705/The-Orange-Monkey">The Orange Monkey</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3298410/The-Wheel">The Wheel</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3201104/She-Who-Rocks">She Who Rocks</a> <span class="year">[2015]</span></h3>

Examining the elements on the list:

In [17]:
album_labels[0]

<h3 class="artist-album-label"><a href="/album/3611504/Women-With-Attitude">Women With Attitude</a> <span class="year">[2017]</span></h3>

In [18]:
album_labels[0].a

<a href="/album/3611504/Women-With-Attitude">Women With Attitude</a>

In [19]:
album_labels[0].attrs

{'class': ['artist-album-label']}

In [20]:
print(album_labels[0].string)

None


In [21]:
album_labels[0].a.attrs

{'href': '/album/3611504/Women-With-Attitude'}

In [22]:
album_labels[0].a.string

'Women With Attitude'

In [23]:
'Women With Attitude' in album_labels[0].a

True

### Transform the labels list into a list with link elements

The link elements \<a> are child elements of the previous \<h3> elements:

In [24]:
album_links=[]
for label in album_labels:
    if label.a !=None:
        album_links.append(label.a)    

In [25]:
len(album_links)

131

In [26]:
album_links

[<a href="/album/3611504/Women-With-Attitude">Women With Attitude</a>,
 <a href="/album/3305033/Hope-Six-Demolition-Project-%5BLP%5D">Hope Six Demolition Project [LP]</a>,
 <a href="/album/3326676/The-Community-of-Hope">The Community of Hope</a>,
 <a href="/album/3302621/The-Hope-Six-Demolition-Project">The Hope Six Demolition Project</a>,
 <a href="/album/3337705/The-Orange-Monkey">The Orange Monkey</a>,
 <a href="/album/3298410/The-Wheel">The Wheel</a>,
 <a href="/album/3201104/She-Who-Rocks">She Who Rocks</a>,
 <a href="/album/3293549/Triple-J%3A-40-Years-of-Music">Triple J: 40 Years of Music</a>,
 <a href="/album/3094283/BBC-Radio-6-Music%E2%80%99s-Alternative-Jukebox">BBC Radio 6 Music’s Alternative Jukebox</a>,
 <a href="/album/2754261/Bernard-lenoir-l%27inrockuptible">Bernard lenoir l'inrockuptible</a>,
 <a href="/album/2459986/RMF-Styl">RMF Styl</a>,
 <a href="/album/2103701/Let-England-Shake">Let England Shake</a>,
 <a href="/album/2334073/Let-England-Shake%3A-12-Short-Films-b

The following would extract just all the links on the page:

In [51]:
pjh_links = pjh_soup.find_all('a')
pjh_links

[<a href="login.php">Login</a>,
 <a href="https://www.abbreviations.com/">ABBREVIATIONS</a>,
 <a href="https://www.anagrams.net/">ANAGRAMS</a>,
 <a href="https://www.biographies.net/">BIOGRAPHIES</a>,
 <a href="https://www.calculators.net/">CALCULATORS</a>,
 <a href="https://www.convert.net/">CONVERSIONS</a>,
 <a href="https://www.definitions.net/">DEFINITIONS</a>,
 <a href="https://www.grammar.com/">GRAMMAR</a>,
 <a href="https://www.literature.com/">LITERATURE</a>,
 <a href="https://www.lyrics.com/">LYRICS</a>,
 <a href="https://www.phrases.com/">PHRASES</a>,
 <a href="https://www.poetry.com/">POETRY</a>,
 <a href="https://www.quotes.net/">QUOTES</a>,
 <a href="https://www.references.net/">REFERENCES</a>,
 <a href="https://www.rhymes.com/">RHYMES</a>,
 <a href="https://www.scripts.com/">SCRIPTS</a>,
 <a href="https://www.symbols.com/">SYMBOLS</a>,
 <a href="https://www.synonyms.com/">SYNONYMS</a>,
 <a href="https://www.uszip.com/">USZIP</a>,
 <a href="/artists/0">#</a>,
 <a href="/ar

### Determining Artist Albums 

List of albums: we only want to extract main albums<br>
Extra option: extract this list from Wikipedia

In [28]:
pjh_album_list=['Dry','Rid of Me','To Bring You My Love','Is This Desire?','Stories from the City, Stories from the Sea',
                'Uh Huh Her','White Chalk','Let England Shake','The Hope Six Demolition Project']

In [29]:
#pjh_album_list = list(map(str.title,pjh_album_list))

In [30]:
pjh_album_list

['Dry',
 'Rid of Me',
 'To Bring You My Love',
 'Is This Desire?',
 'Stories from the City, Stories from the Sea',
 'Uh Huh Her',
 'White Chalk',
 'Let England Shake',
 'The Hope Six Demolition Project']

### Filter album links list with the selected albums

In [54]:
album_links[103].string

'To Bring You My Love'

In [56]:
pjh_album_list[2]

'To Bring You My Love'

In [33]:
album_links_red=[]

In [37]:
# match defined albums with the albums in the links list
for album in pjh_album_list:
    for link in album_links:
        if link.string==album:
            album_links_red.append(link)

This is now the working list for the artist:

In [38]:
album_links_red

[<a href="/album/79155/Dry">Dry</a>,
 <a href="/album/170348/Rid-of-Me">Rid of Me</a>,
 <a href="/album/209249/To-Bring-You-My-Love">To Bring You My Love</a>,
 <a href="/album/373571/Is-This-Desire%3F">Is This Desire?</a>,
 <a href="/album/503552/Stories-from-the-City%2C-Stories-from-the-Sea">Stories from the City, Stories from the Sea</a>,
 <a href="/album/692552/Uh-Huh-Her">Uh Huh Her</a>,
 <a href="/album/1208861/White-Chalk">White Chalk</a>,
 <a href="/album/2103701/Let-England-Shake">Let England Shake</a>,
 <a href="/album/3302621/The-Hope-Six-Demolition-Project">The Hope Six Demolition Project</a>]

In [105]:
album_links_red[3].get('href')

'/album/373571/Is-This-Desire%3F'

Here are the links we need:

In [40]:
links_pjh = list(map(lambda link: link.get('href'),album_links_red))
links_pjh

['/album/79155/Dry',
 '/album/170348/Rid-of-Me',
 '/album/209249/To-Bring-You-My-Love',
 '/album/373571/Is-This-Desire%3F',
 '/album/503552/Stories-from-the-City%2C-Stories-from-the-Sea',
 '/album/692552/Uh-Huh-Her',
 '/album/1208861/White-Chalk',
 '/album/2103701/Let-England-Shake',
 '/album/3302621/The-Hope-Six-Demolition-Project']

In [164]:
album_links_red[0].get('href')

'/album/79155/Dry'

### Create album folder and download html of each album page

In [153]:
#function: gets one album link as a link element <a>
def download_album(link, artist_path):
    """
    Creates an album folder
    Requests album page html
    Saves it as .txt in the album folder
    """
    album_address = link.get('href')
    album_name = link.string

    folder_name = album_name.lower().replace(' ','_').replace('?','')
    folder_path = artist_path + folder_name + '/'
    if os.path.exists(folder_path) == False:
        os.mkdir(folder_path)
        
    album_html = requests.get('http://'+base_url+album_address).text
    with open(folder_path+folder_name+'.txt',"w") as fAlbum:
        fAlbum.write(album_html)

In [108]:
# PJ Harvey
pjh_path = '../data/pj_harvey/'

for link in album_links_red:
    download_album(link,pjh_path)

In [112]:
os.listdir('../data/pj_harvey')

['dry',
 'is_this_desire',
 'let_england_shake',
 'pjh.txt',
 'rid_of_me',
 'stories_from_the_city,_stories_from_the_sea',
 'the_hope_six_demolition_project',
 'to_bring_you_my_love',
 'uh_huh_her',
 'white_chalk']

In [200]:
one_folder = os.listdir('../data/pj_harvey')[0]
one_folder

'dry'

In [202]:
with open (pjh_path+one_folder+'/'+one_folder+'.txt', 'r') as fAlbum:
    fContent = fAlbum.read()
    album_soup = BeautifulSoup(fContent, "html.parser")
    strong_elements = album_soup.find_all('strong')

In [213]:
strong_elements[1]

<strong><a href="/lyric/1562868/Oh+My+Lover">Oh My Lover</a></strong>

In [212]:
strong_elements[1].a.get('href')

'/lyric/1562868/Oh+My+Lover'

In [210]:
strong_elements[1].a.string

'Oh My Lover'

In [221]:
#function: gets one album folder and the relative path to the folder 
def download_songs(folder, artist_path):    
    """
    Opens album.txt in the folder with album html
    Converts it into a Beautiful Soup to extract song addresses and names
    Iterates through the folder songs to save the song html in the album folder
    """
    folder_path = artist_path + folder
    
    with open (artist_path+folder+'/'+folder+'.txt', 'r') as fAlbum:
        fContent = fAlbum.read()
        album_soup = BeautifulSoup(fContent, "html.parser")
        strong_elements = album_soup.find_all('strong')
    
    for element in strong_elements:
        if element.a != None:
            if element.a.get('href')[1:6]=='lyric':
                song_address = element.a.get('href')
                song_name = element.a.string.lower().replace(' ','_').replace('?','')
                
                song_html = requests.get('http://'+base_url+song_address).text
                with open(folder_path+'/'+song_name+'.txt',"w") as fSong:
                    fSong.write(song_html)

In [231]:
# PJ Harvey
pjh_path = '../data/pj_harvey/'
next(os.walk(pjh_path))[1]

['dry',
 'is_this_desire',
 'let_england_shake',
 'rid_of_me',
 'stories_from_the_city,_stories_from_the_sea',
 'the_hope_six_demolition_project',
 'to_bring_you_my_love',
 'uh_huh_her',
 'white_chalk']

In [232]:
for folder in next(os.walk(pjh_path))[1]:
    download_songs(folder,pjh_path)

### Second Artist: Portishead

and now let's do the same with a second artist,
working with beautiful soup from the beginning

First: Create a .txt file with the html of page cointaing the links to the song lyrics

In [31]:
url = 'https://www.lyrics.com/artist/Portishead/45223'

In [32]:
response = requests.get(url)

In [33]:
response

<Response [200]>

In [92]:
portishead_html = response.text

In [93]:
with open('portishead/portishead.txt', "w") as f:
    f.write(portishead_html)

now let's try to find the links to the album or links to the songs

Artist: Portishead, Album: Dummy

Extraction of the songs links with Beautiful Soup:

In [35]:
with open('portishead/portishead.txt', 'r') as fArtist:
    fContent = fArtist.read()

In [36]:
class_="album-artist-label"

In [37]:
fContent

'\n<!doctype html>\n<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--><head>\n<meta charset="utf-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n<title>Portishead Lyrics, Songs and Albums | Lyrics.com</title>\n<meta name="description" content="Portishead Lyrics - All the great songs and their lyrics from Portishead on Lyrics.com">\n<meta name="keywords" content="Portishead lyrics, Portishead song lyrics, Portishead lyric">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">\n<base href="https://www.lyrics.com/">\n\n<script>\ns4Prefix = \'https://static.stands4.com\';\nversion = \'1.3.73\';\n</script>\n\n<!-- Bootstrap compiled and minified CSS -->\n<link rel="stylesheet" href="https:/

In [38]:
portishead_soup = BeautifulSoup(fContent, 'html.parser')
portishead_soup.find_all('h3')

#flist = re.findall('<a href="(/lyric/234079\d{2}/PJ\+Harvey/[^"]+)">([^<]+)</a>', fContent)

[<h3><a href="artist/Portishead/45223">Famous lyrics by »</a></h3>,
 <h3>Albums by <strong>Portishead</strong><span class="sort"><strong>Sort:</strong><a class="rc3 s" href="javascript:void(0);">By Album</a><a class="rc3" href="artist.php?name=Portishead&amp;aid=45223&amp;o=1">A - Z</a></span></h3>,
 <h3 class="artist-album-label"><a href="/album/3755060/Chill-Out-%5BUniversal%5D">Chill Out [Universal]</a> <span class="year">[2018]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3200130/Bristol">Bristol</a> <span class="year">[2016]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3347468/Urgent-Turquoise">Urgent Turquoise</a> <span class="year">[2015]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3094283/BBC-Radio-6-Music%E2%80%99s-Alternative-Jukebox">BBC Radio 6 Music’s Alternative Jukebox</a> <span class="year">[2014]</span></h3>,
 <h3 class="artist-album-label"><a href="/album/3090040/Wild-%5BOriginal-Motion-Picture-Soundtrack%5D">Wild [Orig

Extraction of the songs links with RegEx

For all albums with help of a dictionary

In [39]:
dict_portishead = {'folder_name':['dummy','portishead','third'],
                          'album_id':['272989','336353','276726']}

In [40]:
#Album: All 3
#as a function it needs: dict{'album_id','folder_name'}, artist name for song link extraction,
#fcontent created by reading the artist .txt file

file_name_list = []

for n in range(0,len(dict_portishead['album_id'])):
    folder_name = dict_portishead['folder_name'][n]
    album_id = dict_portishead['album_id'][n]   
    #flist is an array with the song link in 1 col and song name in col 2
    
    flist = re.findall(fr'<a href="(/lyric/{album_id}\d\d/Portishead/[^"]+)">([^<]+)</a>', fContent) 
    #alternative <a href="/lyric/234079\d{2}/PJ\+Harvey/[\w+\+]{1,50}">[\w\s]{1,50}</a>
    for i in range(0,len(flist)) :
        song_name = flist [i][1]    
        file_name = song_name.lower().replace(' ','_')
        path = folder_name+'/'+file_name
        file_name_list.append(path)

        url = (f'http://www.lyrics.com{flist[i][0]}')
        #print(url)
        #response = requests.get(url)
        #song_html = response.text
        #with open (f'portishead/{folder_name}/{file_name}.txt', 'w' ) as fSong :
            #fSong.write(song_html)

In [41]:
file_name_list

['dummy/mysterons',
 'dummy/sour_times',
 'dummy/strangers',
 'dummy/it_could_be_sweet',
 'dummy/wandering_star',
 "dummy/it's_a_fire",
 'dummy/numb',
 'dummy/roads',
 'dummy/pedestal',
 'dummy/biscuit',
 'dummy/glory_box',
 'portishead/cowboys',
 'portishead/undenied',
 'portishead/half_day_closing',
 'portishead/over',
 'portishead/humming',
 'portishead/mourning_air',
 'portishead/only_you',
 'portishead/elysium',
 'portishead/western_eyes',
 'third/silence',
 'third/hunter',
 'third/nylon_smile',
 'third/the_rip',
 'third/plastic',
 'third/we_carry_on',
 'third/deep_water',
 'third/machine_gun',
 'third/small',
 'third/magic_doors',
 'third/threads']

Beautiful Soup

Let's read Wandering Star off the album Dummy

In [42]:
with open ('portishead/dummy/wandering_star.txt') as fSong :
    fContent = fSong.read()
    song_soup = BeautifulSoup(fContent, "html.parser")
    lyrics = (song_soup.pre.text)    
    print (lyrics)

Please could you stay awhile to share my grief
For it's such a lovely day
To have to always feel this way
And the time that I will suffer less
Is when I never have to wake

Wandering stars, for whom it is reserved
The blackness of darkness forever
Wandering stars, for whom it is reserved
The blackness of darkness forever

Those who have seen the needles eye, now tread
Like a husk, from which all that was, now has fled
And the masks, that the monsters wear
To feed, upon their prey

Wandering stars, for whom it is reserved
The blackness of darkness forever
Wandering stars, for whom it is reserved
The blackness of darkness forever

Doubled up inside
Take a while to shed my grief
Always doubled up inside
Taunted, cruel

Wandering stars, for whom it is reserved
The blackness of darkness forever
Wandering stars, for whom it is reserved
The blackness of darkness forever


And now the all 3 Portishead albums:

In [43]:
lyrics_list = []
for file_name in file_name_list:
    with open(f'portishead/{file_name}.txt') as fSong :
        fContent = fSong.read()
    song_soup = BeautifulSoup(fContent, "html.parser")
    
    print(file_name)
    try :
        lyrics = (song_soup.pre.text) 
        #lyrics_tidy = re.sub(r'\\n',' ', lyrics)
        lyrics_tidy = lyrics.replace ('\n',' ')
        lyrics_list.append(lyrics_tidy)
    except :
        continue

dummy/mysterons
dummy/sour_times
dummy/strangers
dummy/it_could_be_sweet
dummy/wandering_star
dummy/it's_a_fire
dummy/numb
dummy/roads
dummy/pedestal
dummy/biscuit
dummy/glory_box
portishead/cowboys
portishead/undenied
portishead/half_day_closing
portishead/over
portishead/humming
portishead/mourning_air
portishead/only_you
portishead/elysium
portishead/western_eyes
third/silence
third/hunter
third/nylon_smile
third/the_rip
third/plastic
third/we_carry_on
third/deep_water
third/machine_gun
third/small
third/magic_doors
third/threads


In [44]:
print(lyrics_list);



In [45]:
len(lyrics_list)

31

In [46]:
lyrics_list_portishead = lyrics_list

In [47]:
file_name_list_portishead = file_name_list

In [48]:
for elem in lyrics_list_portishead :
    print(elem, '\n')

Inside your pretending Crimes have been swept aside Somewhere where they can forget  Divine upper reaches Still holding on This ocean will not be grasped  All for nothing Did you really want Did you really want Did you really want Did you really want  Refuse to surrender Strung out until ripped apart Who dares, who dares to condemn  All for nothing Did you really want Did you really want Did you really want Did you really want 

To pretend no one can find The fallacies of morning rose Forbidden fruit, hidden eyes Curtises that I despise in me Take a ride, take a shot now 'Cause nobody loves me It's true Not like you do  Covered by the blind belief That fantasies of sinful screens Bear the facts, assume the dye End the vows no need to lie, enjoy Take a ride, take a shot now 'Cause nobody loves me It's true Not like you do  Who am I, what and why 'Cause all I have left Is my memories of yesterday Oh these sour times 'Cause nobody loves me It's true Not like you do  After time the bitter 

#### Beth Gibbons Portishead

In [257]:
url = 'https://www.lyrics.com/artist/Beth-Gibbons-%26-Rustin-Man/2138007541'

In [258]:
response = requests.get(url)

In [259]:
response

<Response [200]>

In [260]:
beth_gibbons_html = response.text

In [261]:
with open('beth_gibbons/beth_gibbons.txt', "w") as f:
    f.write(beth_gibbons_html)

In [293]:
with open('beth_gibbons/beth_gibbons.txt', 'r') as fArtist:
    fContent = fArtist.read()

In [294]:
fContent

'\n<!doctype html>\n<!--[if lt IE 7]> <html class="no-js lt-ie9 lt-ie8 lt-ie7" lang="en"> <![endif]-->\n<!--[if IE 7]>    <html class="no-js lt-ie9 lt-ie8" lang="en"> <![endif]-->\n<!--[if IE 8]>    <html class="no-js lt-ie9" lang="en"> <![endif]-->\n<!--[if gt IE 8]><!--> <html class="no-js" lang="en"> <!--<![endif]--><head>\n<meta charset="utf-8">\n<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1">\n<title>Beth Gibbons & Rustin Man Lyrics, Songs and Albums | Lyrics.com</title>\n<meta name="description" content="Beth Gibbons &amp; Rustin Man Lyrics - All the great songs and their lyrics from Beth Gibbons &amp; Rustin Man on Lyrics.com">\n<meta name="keywords" content="Beth Gibbons &amp; Rustin Man lyrics, Beth Gibbons &amp; Rustin Man song lyrics, Beth Gibbons &amp; Rustin Man lyric">\n<meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=1">\n<base href="https://www.lyrics.com/">\n\n<script>\ns4Prefix = \'https://static.stands4.com\';\nversion 

In [295]:
flist = re.findall('<a href="(/lyric-lf/3813\d+/Beth\+Gibbons\+%26\+Rustin\+Man/[^"]+)">([^<]+)</a>', fContent) 
file_name_list=[]
for i in range(0,len(flist)) :
    song_name = flist [i][1]    
    file_name = song_name.lower().replace(' ','_')
    path = 'out_of_season/'+file_name
    file_name_list.append(path)

    url = (f'http://www.lyrics.com{flist[i][0]}')
    print(url)
    #response = requests.get(url)
    #song_html = response.text
    #with open (f'beth_gibbons/out_of_season/{file_name}.txt', 'w' ) as fSong :
        #fSong.write(song_html)

http://www.lyrics.com/lyric-lf/3813701/Beth+Gibbons+%26+Rustin+Man/Tom+The+Model
http://www.lyrics.com/lyric-lf/3813695/Beth+Gibbons+%26+Rustin+Man/Resolve
http://www.lyrics.com/lyric-lf/3813692/Beth+Gibbons+%26+Rustin+Man/Drake
http://www.lyrics.com/lyric-lf/3813700/Beth+Gibbons+%26+Rustin+Man/Spider+Monkey
http://www.lyrics.com/lyric-lf/3813693/Beth+Gibbons+%26+Rustin+Man/Funny+Time+Of+Year
http://www.lyrics.com/lyric-lf/3813696/Beth+Gibbons+%26+Rustin+Man/Romance
http://www.lyrics.com/lyric-lf/3813694/Beth+Gibbons+%26+Rustin+Man/Mysteries
http://www.lyrics.com/lyric-lf/3813698/Beth+Gibbons+%26+Rustin+Man/Sand+River


In [296]:
file_name_list

['out_of_season/tom_the_model',
 'out_of_season/resolve',
 'out_of_season/drake',
 'out_of_season/spider_monkey',
 'out_of_season/funny_time_of_year',
 'out_of_season/romance',
 'out_of_season/mysteries',
 'out_of_season/sand_river']

In [297]:
lyrics_list = []
for file_name in file_name_list:
    with open(f'beth_gibbons/{file_name}.txt') as fSong :
        fContent = fSong.read()
    song_soup = BeautifulSoup(fContent, "html.parser")

    print(file_name)
    lyrics = (song_soup.pre.text) 
    #lyrics_tidy = re.sub(r'\\n',' ', lyrics)
    lyrics_tidy = lyrics.replace ('\n',' ')
    lyrics_list.append(lyrics_tidy)

out_of_season/tom_the_model
out_of_season/resolve
out_of_season/drake
out_of_season/spider_monkey
out_of_season/funny_time_of_year
out_of_season/romance
out_of_season/mysteries
out_of_season/sand_river


In [298]:
len (lyrics_list)

8

In [299]:
lyrics_list_beth = lyrics_list
file_name_list_beth = file_name_list

#### Bags of Words and Naive Bayes

CountVectorizer to transform Corpus into matrix

Corpus should be a list where each element is a song, called document

In [300]:
#the corpus is just a list where each element is a song
#here the songs are sorted by two considered artists
corpus = lyrics_list_pjh[:39] + lyrics_list_portishead[:31] + lyrics_list_beth

vectorizer = CountVectorizer(stop_words='english') #instantiate the model
X_sp = vectorizer.fit_transform(corpus) #vectorization generates a sparse matrix

In [301]:
len(corpus)

78

Normalization with Tf-Idf

In [302]:
tf = TfidfTransformer() #instantiate the model
transformed = tf.fit_transform(X_sp)
transformed

<78x1504 sparse matrix of type '<class 'numpy.float64'>'
	with 2854 stored elements in Compressed Sparse Row format>

Now we can convert the normalized sparse matrix into the complete matrix and then into a data frame to apply a regression model on it.

In [303]:
#from list to corpus to sparse matrix to normalized matrix to dense matrix to df
tdf = pd.DataFrame(transformed.todense(), columns=vectorizer.get_feature_names())
tdf.head(2).round(2)

Unnamed: 0,000,abandoned,absurd,accuse,aching,acres,acting,adore,adorn,adulation,...,yards,yeah,year,years,yes,yesterday,york,young,zombies,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.05,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Classification model

In [304]:
X = tdf.values #from df to numpy array
y = ['PJ Harvey'] * 39 + ['Beth Gibbons'] * 39

In [305]:
x_train, x_test, y_train, y_test = train_test_split(X,y)

In [306]:
m = LogisticRegression()
m.fit(X, y)
m.score(x_test, y_test), m.score(X, y)

(1.0, 1.0)

In [307]:
m.predict(X)

array(['PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey',
       'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'PJ Harvey', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons',
       'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbons', 'Beth Gibbo

In [308]:
m.predict_proba(X)#.round(2)

array([[0.37680429, 0.62319571],
       [0.35802967, 0.64197033],
       [0.36346546, 0.63653454],
       [0.36332249, 0.63667751],
       [0.36543735, 0.63456265],
       [0.36983243, 0.63016757],
       [0.373889  , 0.626111  ],
       [0.36481327, 0.63518673],
       [0.34394155, 0.65605845],
       [0.37416005, 0.62583995],
       [0.37596441, 0.62403559],
       [0.35353853, 0.64646147],
       [0.36257051, 0.63742949],
       [0.32385561, 0.67614439],
       [0.40099707, 0.59900293],
       [0.35103031, 0.64896969],
       [0.37545458, 0.62454542],
       [0.36980027, 0.63019973],
       [0.36089326, 0.63910674],
       [0.37333164, 0.62666836],
       [0.36244533, 0.63755467],
       [0.36552592, 0.63447408],
       [0.36740784, 0.63259216],
       [0.40031178, 0.59968822],
       [0.43627308, 0.56372692],
       [0.40591257, 0.59408743],
       [0.42135672, 0.57864328],
       [0.35588044, 0.64411956],
       [0.34413388, 0.65586612],
       [0.38437543, 0.61562457],
       [0.

In [309]:
m.classes_

array(['Beth Gibbons', 'PJ Harvey'], dtype='<U12')

Predictions, new songs, who said that?

In [310]:
songs = ["I can't wait for the night to come nothing else but waves of love",
         "Give me a reason",
         "for I am guilty for the voice that I obey",
         "Inside your pretending Crimes have been swept aside Somewhere where they can forget  Divine upper reaches Still holding on This ocean will not be grasped  All for nothing Did you really want Did you really want Did you really want Did you really want  Refuse to surrender Strung out until ripped apart Who dares, who dares to condemn  All for nothing Did you really want Did you really want Did you really want Did you really want"]

In [311]:
counts = vectorizer.transform(songs)
tfcounts = tf.transform(counts)

In [312]:
df_tf_counts = pd.DataFrame(tfcounts.todense(), columns=vectorizer.get_feature_names())

In [313]:
df_tf_counts

Unnamed: 0,000,abandoned,absurd,accuse,aching,acres,acting,adore,adorn,adulation,...,yards,yeah,year,years,yes,yesterday,york,young,zombies,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [314]:
m.predict(df_tf_counts)

array(['PJ Harvey', 'Beth Gibbons', 'PJ Harvey', 'Beth Gibbons'],
      dtype='<U12')

In [315]:
m.predict_proba(df_tf_counts)

array([[0.43822064, 0.56177936],
       [0.59446002, 0.40553998],
       [0.49379531, 0.50620469],
       [0.57391   , 0.42609   ]])

Without obtaining the complete matrix and converting to data frame:

In [283]:
m.predict(tfcounts)

array(['PJ Harvey', 'Beth Gibbons', 'PJ Harvey', 'Beth Gibbons'],
      dtype='<U12')

In [284]:
m.predict_proba(tfcounts)

array([[0.46591187, 0.53408813],
       [0.60311711, 0.39688289],
       [0.49625213, 0.50374787],
       [0.59697615, 0.40302385]])

accuracy: positive or negative
recall score 
precision 
f1-score: prec*recall/(prec+recall)

confussion matrix

Naive Bayes

Decision Tree

In [None]:
Random Forest

In [None]:
rf = RandomForest
rf = m_estimators, max_depth


Class Imbalance

In [None]:
random undersample
NearMiss1, 2
fit_resample(X_train, y_train)

random oversample