In [None]:
# 5.02.2021

### Project 4: "Text Classification"

In [1]:
import pandas as pd
import requests
import re
import os
import spacy
from bs4 import BeautifulSoup
from typing import Tuple, List
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB

### 1. Download HTML pages and get a list of song URLs:

In [2]:
# Number of songs to scrape per artist. LABELS is built from this value as
# well, so it has to match the number of lyrics actually collected per artist.
num = 20

def collect_song_links(artist:str, site:str='metrolyrics') -> Tuple[List, str]:
    """Collect the song-page URLs listed on an artist's overview page.

    Parameters
    ----------
    artist : str
        Artist name as typed, e.g. ``'frank sinatra'``.
    site : str, default 'metrolyrics'
        Lyrics site to scrape. Only ``'metrolyrics'`` is implemented;
        ``'lyrics'`` prints a notice and yields no links.

    Returns
    -------
    Tuple[List, str]
        ``(links, artist)``. ``links`` holds the ``href`` of the first anchor
        of every table cell on the artist page; it is empty when the page
        could not be fetched or the site is not supported. ``artist`` is the
        name as used in the URL (spaces replaced by hyphens for metrolyrics,
        unchanged otherwise).
    """
    links = []
    if site == 'metrolyrics':
        artist = artist.replace(' ', '-')
        url = "https://www.metrolyrics.com/" + artist + "-lyrics.html"
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=10)
        if response.status_code != 200:
            print('Sorry try again.')
            # Never parse an error page: any links on it are not song links.
            return links, artist

        # Name the parser explicitly so the result does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(markup=response.text, features='html.parser')
        for td in soup.find_all('td'):
            if td.a is not None:
                href = td.a.get('href')
                if href:  # skip anchors that carry no target
                    links.append(href)

    elif site == 'lyrics':
        print('Feature not available.')

    return links, artist

### 2. Extracting lyrics from song URLs:

In [3]:
def get_songs_lyrics(links:list, artist_name:str, num:int) -> List[str]:
    """Download song pages and extract the lyrics text of each one.

    Parameters
    ----------
    links : list
        Song-page URLs; only the first ``num`` are fetched.
    artist_name : str
        Currently unused; kept so existing callers keep working.
    num : int
        Maximum number of songs to download.

    Returns
    -------
    List[str]
        One string per fetched link, in link order: the page's verses joined
        by single spaces. The string is empty when the page has no lyrics
        container or no verses, so positions always line up with ``links``.
    """
    lyrics = []
    for li in links[:num]:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(li, timeout=10)
        # Explicit parser: results must not depend on installed parser libs.
        soup = BeautifulSoup(markup=response.text, features='html.parser')
        lyrics_section = soup.find(attrs={'id':'lyrics-body-text'})
        if lyrics_section is None:
            # No lyrics container (error page, changed layout): keep an empty
            # placeholder instead of crashing on `None.find_all`.
            lyrics.append('')
            continue

        verses = lyrics_section.find_all('p', class_='verse')
        lyrics.append(' '.join(verse.text for verse in verses))

    return lyrics

### 3. Combining two functions together:

In [4]:
def main(artist, num):
    """Scrape up to ``num`` song lyrics for ``artist``.

    Looks up the artist's song links first, then downloads the lyrics behind
    the first ``num`` of them and returns them as a list.
    """
    song_links, url_name = collect_song_links(artist)
    return get_songs_lyrics(song_links, url_name, num)

In [5]:
# One label per scraped song, in the order the corpus is assembled:
# `num` Frank Sinatra entries followed by `num` Chris Rea entries.
# NOTE(review): assumes each artist really yields exactly `num` lyrics - verify.
LABELS = ['frank sinatra'] * num + ['chris rea'] * num

In [6]:
# Scrape both artists into one flat list of lyrics (Sinatra first, then Rea).
CORPUS = [*main('frank sinatra', num), *main('chris rea', num)]
CORPUS

["L is for the way you look at me\nO is for the only one I see\nV is very, very extraordinary\nE is even more than anyone that you adore can Love is all that I can give to you\nLove is more than just a game for two\nTwo in love can make it\nTake my heart and please don't break it\nLove was made for me and you L is for the way you look at me\nO is for the only one I see\nV is very, very extraordinary\nE is even more than anyone that you adore can Love is all that I can give to you\nLove is more than just a game for two\nTwo in love can make it\nTake my heart and please don't break it\nLove was made for me and you\nLove was made for me and you\nLove was made for me and you\nLove was made for me and you",
 "Start spreading the news\nI am leaving today\nI want to be a part of it\nNew York, New York These vagabond shoes\nAre longing to stray\nRight through the very heart of it\nNew York, New York I wanna wake up, in a city\nThat doesn't sleep\nAnd find I'm king of the hill\nTop of the heap 

In [7]:
# Wrap the lyrics in a pandas Series so the vectorised `.str` methods can be
# used for cleaning.
CORPUS = pd.Series(CORPUS)
CORPUS

0     L is for the way you look at me\nO is for the ...
1     Start spreading the news\nI am leaving today\n...
2                                                      
3     And now the end is near\nAnd so I face the fin...
4     Next time you're found\nWith your chin on the ...
5     You're just too good to be true\nI can't take ...
6     Somewhere beyond the sea\nSomewhere waitin' fo...
7     I know I stand in line,\nuntil you think you h...
8     Some day, when I'm awfully low\nWhen the world...
9     That's life (that's life), that's what all the...
10    Oh, it's a long, long while from May to Decemb...
11    Have yourself a merry little Christmas\nLet yo...
12    She gets too hungry for dinner at eight\nShe l...
13    It had to be you\nIt had to be you\nI wandered...
14    The summer wind came blowin' in\nFrom across t...
15    Come fly with me, let's fly, let's fly away\nI...
16    Writer(s): leo robin/ralph rainger Thanks for ...
17    She's a sunflower, she's my one flower,\nS

In [13]:
# Cleaning the data
def clean_data(column:pd.Series) -> pd.Series:
    """Normalise raw lyrics text before vectorisation.

    Deletes digits and the punctuation characters parentheses, comma, colon,
    exclamation mark, question mark, at sign, ampersand, single quote,
    backtick, double quote, underscore, period, hyphen and square brackets;
    turns every newline into a space; lower-cases the result.

    Parameters
    ----------
    column : pd.Series
        Series of strings. It is not modified.

    Returns
    -------
    pd.Series
        New Series with the cleaned text, same index as ``column``.
    """
    # Everything that is deleted outright, as a single character class.
    # `regex=True` is passed explicitly: since pandas 2.0 `Series.str.replace`
    # treats its pattern as a literal string by default, which would turn the
    # substitution into a silent no-op.
    unwanted = r"""[(),:!?@&'`"_.\d\[\]-]"""
    cleaned = column.str.replace(unwanted, "", regex=True)
    # Newlines separate lyric lines; keep the word boundary as a space.
    cleaned = cleaned.str.replace("\n", " ", regex=False)
    return cleaned.str.lower()

In [14]:
# Replace the raw lyrics with their cleaned, lower-cased form.
CORPUS = clean_data(CORPUS)
CORPUS

0     l is for the way you look at me o is for the o...
1     start spreading the news i am leaving today i ...
2                                                      
3     and now the end is near and so i face the fina...
4     next time youre found with your chin on the gr...
5     youre just too good to be true i cant take my ...
6     somewhere beyond the sea somewhere waitin for ...
7     i know i stand in line until you think you hav...
8     some day when im awfully low when the world is...
9     thats life thats life thats what all the peopl...
10    oh its a long long while from may to december ...
11    have yourself a merry little christmas let you...
12    she gets too hungry for dinner at eight she li...
13    it had to be you it had to be you i wandered a...
14    the summer wind came blowin in from across the...
15    come fly with me lets fly lets fly away if you...
16    writers leo robin/ralph rainger thanks for the...
17    shes a sunflower shes my one flower shes t

### 4. Converting text to numbers by applying the Bag Of Words method:

In [10]:
# Bag of words: one row per song, one column per (non-stop-word) token.
cv = CountVectorizer(stop_words='english', ngram_range=(1, 1))
vec = cv.fit_transform(CORPUS)
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate it.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
pd.DataFrame(vec.toarray(), columns=feature_names, index=LABELS).head(5)

Unnamed: 0,aah,aand,acapulco,ache,ado,adore,ah,ahead,aint,air,...,years,yes,yore,york,youd,youll,young,younger,youre,yuletide
frank sinatra,0,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
frank sinatra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,13,0,0,0,0,0,0
frank sinatra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
frank sinatra,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
frank sinatra,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,0


### 5. Applying Tf-Idf Transformation (Normalization):

In [12]:
tf = TfidfTransformer()
# Recompute the counts from `cv` instead of feeding `vec` back into itself, so
# re-running this cell never applies Tf-Idf on top of an earlier Tf-Idf result.
vec = tf.fit_transform(cv.transform(CORPUS))

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement and
# fall back only on releases that predate it.
feature_names = (cv.get_feature_names_out() if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
pd.DataFrame(vec.toarray(), columns=feature_names, index=LABELS).head(5)

Unnamed: 0,aah,aand,acapulco,ache,ado,adore,ah,ahead,aint,air,...,years,yes,yore,york,youd,youll,young,younger,youre,yuletide
frank sinatra,0.0,0.0,0.0,0.0,0.0,0.442568,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
frank sinatra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.753371,0.0,0.0,0.0,0.0,0.0,0.0
frank sinatra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
frank sinatra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.140059,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
frank sinatra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.065312,0.0


### 6. Putting everything together in a pipeline:

In [16]:
# TfidfVectorizer does the token counting and the Tf-Idf weighting of
# sections 4 and 5 in a single step.
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(CORPUS)

### 7. Applying the model: 

### Multinomial Naive Bayes Classifier

In [17]:
# Fit a multinomial Naive Bayes model on the Tf-Idf features of the corpus.
clf = MultinomialNB()
clf.fit(X, LABELS)

MultinomialNB()

In [18]:
# Vectoriser and classifier chained into one estimator that accepts raw text.
pipeline = make_pipeline(
    TfidfVectorizer(stop_words='english'),
    MultinomialNB(),
)

pipeline.fit(CORPUS, LABELS)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(stop_words='english')),
                ('multinomialnb', MultinomialNB())])

In [19]:
# Sanity check only: this line is the start of one of the Sinatra lyrics the
# pipeline was trained on, so it is not an out-of-sample test.
pipeline.predict(['come fly with me lets fly lets fly away if you'])

array(['frank sinatra'], dtype='<U13')

In [20]:
# Class probabilities for the same line.
# NOTE(review): columns presumably follow `pipeline.classes_` - confirm.
pipeline.predict_proba(['come fly with me lets fly lets fly away if you'])

array([[0.44671259, 0.55328741]])

In [21]:
# Classify a second sample line.
# NOTE(review): not visible here whether this line is part of the training corpus.
pipeline.predict(['on the hard fast train on the road to gain something gets right'])

array(['chris rea'], dtype='<U13')