# Using Classification to Evaluate

In [1]:
import numpy as np 
import pandas as pd 
import sys
import json
import os
import re

from sklearn.feature_extraction import text      
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
#Code from Chenghui
lyrics = pd.read_csv('lyrics-data.csv') 
lyrics=lyrics.query("language=='en'")
artists = pd.read_csv("artists-data.csv")

partists = (artists[artists["Genres"]=="Pop"])
psongs = pd.merge(lyrics,partists,left_on="ALink",right_on="Link")

#Getting the categories we want
psongs = (psongs[["Artist","Genres","SName","Lyric"]])
#removing empty and songs with no lyrics
psongs = psongs.dropna()
psongs = psongs[psongs["Lyric"]!='Instrumental']

psongs.head

<bound method NDFrame.head of               Artist Genres                            SName  \
0       Shawn Mendes    Pop                    It'll Be Okay   
1       Shawn Mendes    Pop  There's Nothing Holdin' Me Back   
2       Shawn Mendes    Pop                 Treat You Better   
3       Shawn Mendes    Pop                         Stitches   
4       Shawn Mendes    Pop                   Never Be Alone   
...              ...    ...                              ...   
2539      Dima Bilan    Pop                          Trouble   
2540  Lene Alexandra    Pop                 Hot Boy Hot Girl   
2541  Lene Alexandra    Pop                  My Boobs Are Ok   
2542  Lene Alexandra    Pop           Sexy Naughty Bitchy Me   
2543  Lene Alexandra    Pop                 Sillycone Valley   

                                                  Lyric  
0     Are we gonna make it?\nIs this gonna hurt?\nOh...  
1     I wanna follow where she goes\nI think about h...  
2     I won't lie to you\nI

In [3]:
rartists = (artists[artists["Genres"]=="Rap"])
rsongs = pd.merge(lyrics,rartists,left_on="ALink",right_on="Link")

#Getting the categories we want
rsongs = (rsongs[["Artist","Genres","SName","Lyric"]])
#removing empty and songs with no lyrics
rsongs = rsongs.dropna()
rsongs = rsongs[rsongs["Lyric"]!='Instrumental']

rsongs.head

<bound method NDFrame.head of                     Artist Genres                                  SName  \
0                   Fugees    Rap        Killing Me Softly With His Song   
1                   Fugees    Rap                          How Many Mics   
2                   Fugees    Rap                           Ready Or Not   
3                   Fugees    Rap                     Vocab (LP Version)   
4                   Fugees    Rap                                Zealots   
...                    ...    ...                                    ...   
2007  Dipset/The Diplomats    Rap                               Who I Am   
2008  Dipset/The Diplomats    Rap                                Worried   
2009  Dipset/The Diplomats    Rap  Wouldn't You Like To Be A Gangsta Too   
2010  Dipset/The Diplomats    Rap              Ya'll Can't Live His Life   
2011  Dipset/The Diplomats    Rap                        You Make Me Say   

                                                  Lyric  

In [4]:
print("Shape of Pop:")
print((psongs).shape)
print("Shape of Rap:")
print((rsongs).shape)

Shape of Pop:
(2544, 4)
Shape of Rap:
(2012, 4)


In [5]:
#Creating out training data
train_psongs_len = int(len(psongs)*0.4)
train_rsongs_len = int(len(rsongs)*0.4)

train_data = pd.concat([psongs.head(train_psongs_len),rsongs.head(train_rsongs_len)])
print(train_data['Lyric'].values)

["Are we gonna make it?\nIs this gonna hurt?\nOh, we can try to sedate it\nBut that never works\nYeah\n\nI start to imagine a world where we don't collide\nIt's making me sick but we'll heal and the sun will rise\n\nIf you tell me you're leaving, I'll make it easy\nIt'll be okay\nIf we can't stop the bleeding\nWe don't have to fix it, we don't have to stay\nI will love you either way\nOoh-ooh, it'll be oh, be okay\nOoh-ooh\n\nOh, the future we dreamed of is fading to black\nOh-oh, oh-oh, oh\nOh, thеre's nothing more painful\nNothing more painful, oh-woah (Oh-woah)\n\nI start to imaginе a world where we don't collide\nAnd it's making me sick but we'll heal and the sun will rise\n\nIf you tell me you're leaving, I'll make it easy\nIt'll be okay (It'll be okay)\nAnd if we can't stop the bleeding\nWe don't have to fix it, we don't have to stay (Don't have to stay)\nI will love you either way\nOoh-ooh, it'll be oh, be okay\nOoh-ooh\n\nI will love you either way\nIt might be so sweet\nIt mig

### Vocab

In [6]:
#Creating our Vocab
cv = CountVectorizer(strip_accents='ascii', lowercase=True, stop_words='english', analyzer='word')
cv.fit(train_data['Lyric'].values)
#bag of words
bow = cv.transform(train_data['Lyric'].values) 

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

models = {'Logistic Regression':LogisticRegression(max_iter=500),
          'Decision Tree': DecisionTreeClassifier(),
          'Random Forest': RandomForestClassifier(),
          'SVC': SVC()}

for i,m in enumerate(models.keys()):
    print(i+1, ":", m)
    models[m].fit(bow.toarray(), train_data["Genres"])

1 : Logistic Regression
2 : Decision Tree
3 : Random Forest
4 : SVC


# Accuracy of Models

In [8]:
#Creating our test data
test_data = pd.concat([psongs.iloc[train_psongs_len:len(psongs)],rsongs.iloc[train_rsongs_len:len(rsongs)]])
print(test_data['Lyric'].values[0][0:200])

test_bow = cv.transform(test_data['Lyric'].values) 

She was my once in a lifetime
Happy ending come true
Oh I guess I should have told her
I thought she knew

She said I took her for granted
That's the last thing I would do
Whoa I'll never understand i


In [9]:
#Get the scores of each model
accuracy = {}
test_data = test_data[['Artist', 'SName','Genres']] #[["Artist","Genres","SName","Lyric"]])
for m in models.keys():
    print(m)
    pred_genre= models[m].predict(test_bow.toarray())
    test_data[m]=pred_genre
    accuracy[m]=accuracy_score(test_data['Genres'],pred_genre)
print ("---Accuracy Scores---")
print(accuracy)

Logistic Regression
Decision Tree
Random Forest
SVC
---Accuracy Scores---
{'Logistic Regression': 0.8486288848263254, 'Decision Tree': 0.8106032906764168, 'Random Forest': 0.8745886654478976, 'SVC': 0.8288848263254114}


Seems like Random Forest is the best. Let's see if we can predit the genre
of our generated text. 

In [10]:
generated = pd.read_csv('some_RNN_generated_data.csv') 
print(generated.head())
generated.dropna()


                                                 RNN  \
0  Hah, Yeah, what, oh, so.. what you can kang to...   
1  Anohownerely fills flickin' all my love when y...   
2  And we got nothin' to rish high and I play\nI ...   
3  Loveg “tell ya\nSlopt to have me\nniggas be fr...   
4  Lover you"who knows what lies a numberod\nThey...   

                                                LSTM  \
0  Bring me the night and day\nAll the pain , tha...   
1  You be there for me to\nNow we don't make me w...   
2  \nYou be there for me to\nNow we don't make me...   
3  And we still in me the - Boy\nI don't know whe...   
4  I just won't be so true\nYou make me one more ...   

                                               GPT-2  
0  Even though you do,I'm not dead. You just come...  
1  Fifty-five times I'm going out for love and wa...  
2  Don't tell anyone else what to do is a lie  Bu...  
3                                                NaN  
4                                                Na

Unnamed: 0,RNN,LSTM,GPT-2
0,"Hah, Yeah, what, oh, so.. what you can kang to...","Bring me the night and day\nAll the pain , tha...","Even though you do,I'm not dead. You just come..."
1,Anohownerely fills flickin' all my love when y...,You be there for me to\nNow we don't make me w...,Fifty-five times I'm going out for love and wa...
2,And we got nothin' to rish high and I play\nI ...,\nYou be there for me to\nNow we don't make me...,Don't tell anyone else what to do is a lie Bu...


In [11]:

RNN_real_genre = ["Rap","Pop","Pop","Pop","Pop"]
RNN_bow = cv.transform(generated['RNN'].values)
for m in models.keys():
    print(m)
    pred_genre= models[m].predict(RNN_bow.toarray())
    print(pred_genre)
    print("Percent match: ", np.mean( pred_genre == RNN_real_genre ))


Logistic Regression
['Pop' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  0.6
Decision Tree
['Rap' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  0.8
Random Forest
['Pop' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  0.6
SVC
['Pop' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  0.6


In [12]:
LSTM_real_genre = ["Pop","Pop","Pop","Rap","Pop"]
LSTM_bow = cv.transform(generated['LSTM'].values)
for m in models.keys():
    print(m)
    pred_genre= models[m].predict(LSTM_bow.toarray())
    print(pred_genre)
    print("Percent match: ", np.mean( pred_genre == LSTM_real_genre ))

Logistic Regression
['Pop' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  1.0
Decision Tree
['Pop' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  1.0
Random Forest
['Pop' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  1.0
SVC
['Pop' 'Pop' 'Pop' 'Rap' 'Pop']
Percent match:  1.0


In [13]:
GPT_real_genre = ["Pop","Pop","Pop"]
temp=(generated['GPT-2'].dropna())
GPT_bow = cv.transform(temp.values)
for m in models.keys():
    print(m)
    pred_genre= models[m].predict(GPT_bow.toarray())
    print(pred_genre)
    print("Percent match: ", np.mean( pred_genre == GPT_real_genre ))

Logistic Regression
['Pop' 'Pop' 'Pop']
Percent match:  1.0
Decision Tree
['Pop' 'Pop' 'Pop']
Percent match:  1.0
Random Forest
['Pop' 'Pop' 'Pop']
Percent match:  1.0
SVC
['Pop' 'Pop' 'Pop']
Percent match:  1.0


# More Metrics

### BERT SCORE 

In [14]:
#!pip install bert_score
from datasets import load_metric

bertscore_metric = load_metric('bertscore')

In [15]:
bert_scores = bertscore_metric.compute(predictions=generated['RNN'].values, references=psongs["Lyric"][:5].values, lang="en")
# Normally, we use the f1-score attribute
print("RNN:")
print(bert_scores['f1'])

bert_scores = bertscore_metric.compute(predictions=generated['LSTM'].values, references=psongs["Lyric"][:5].values, lang="en")
# Normally, we use the f1-score attribute
print("LSTM:")
print(bert_scores['f1'])


artist =  "Lana Del Rey"

# code from Chenghui
lyrics = pd.read_csv('lyrics-data.csv') 
lyrics=lyrics.query("language=='en'")
artists = pd.read_csv("artists-data.csv")
lyrics_df = pd.merge(lyrics,artists,left_on="ALink",right_on="Link")
lyrics_df = lyrics_df[["Artist","Genres","Popularity","Songs","SName","Lyric"]]
lyrics_popular=lyrics_df.query(f"Artist=='{artist}'")
lyrics_popular=lyrics_popular.sort_values(['Songs','Artist'], ascending=[False,True])

bert_scores = bertscore_metric.compute(predictions=temp.values, references=lyrics_popular["Lyric"][:3].values, lang="en")
# Normally, we use the f1-score attribute
print("GPT-2:")
print(bert_scores['f1'])

#Using first x songs as referencce

RNN:
[0.7927799224853516, 0.8087143301963806, 0.8047899603843689, 0.7679463028907776, 0.7915160655975342]
LSTM:
[0.8123180866241455, 0.8202456831932068, 0.8177557587623596, 0.7911052107810974, 0.7901195287704468]
GPT-2:
[0.7938002347946167, 0.7782478332519531, 0.7792690396308899]


In [16]:
flattened = psongs["Lyric"]
flattened = np.reshape(flattened.values, (1,flattened.shape[0]))
flattened = [flattened,flattened,flattened,flattened,flattened]

lana = lyrics_popular["Lyric"]
lana = np.reshape(lana.values, (1,lana.shape[0]))
lana = [lana,lana,lana]

bert_scores = bertscore_metric.compute(predictions=generated['RNN'].values, references=flattened, lang="en")
# Normally, we use the f1-score attribute
print("RNN:")
print(bert_scores['f1'])

bert_scores = bertscore_metric.compute(predictions=generated['LSTM'].values, references=flattened, lang="en")
# Normally, we use the f1-score attribute
print("LSTM:")
print(bert_scores['f1'])

temp=(generated['GPT-2'].dropna())

bert_scores = bertscore_metric.compute(predictions=temp.values, references=lana, lang="en")
# Normally, we use the f1-score attribute
print("GPT-2 (Lana Del Rey):")
print(bert_scores['f1'])

RNN:
[0.7642351388931274, 0.7681809663772583, 0.7660707831382751, 0.7490460276603699, 0.7736159563064575]
LSTM:
[0.7846565842628479, 0.7853808403015137, 0.7853808403015137, 0.7669862508773804, 0.7778748869895935]
GPT-2 (Lana Del Rey):
[0.7807826995849609, 0.7799481749534607, 0.778856635093689]


#### BERT SCORE RESULTS

Our results are okay. However, the problem with BERT Score is that you need references for the generation to compare to. Our models just generate text from training without matching reference. 

We also tried using the entire lyric corpus as a reference but it also does not perform well.

Therefore, this is not a really good evaluation.

### Bleurt Score

In [17]:
#!pip install git+https://github.com/google-research/bleurt.git

bleurt = load_metric("bleurt", module_type="metric")

results = bleurt.compute(predictions=generated['RNN'].values, references=psongs["Lyric"][:5].values)
# Normally, we use the f1-score attribute
print("RNN:")
print(results.values())

results = bleurt.compute(predictions=generated['LSTM'].values, references=psongs["Lyric"][:5].values)
# Normally, we use the f1-score attribute
print("LSTM:")
print(results.values())


artist =  "Lana Del Rey"

# code from Chenghui
lyrics = pd.read_csv('lyrics-data.csv') 
lyrics=lyrics.query("language=='en'")
artists = pd.read_csv("artists-data.csv")
lyrics_df = pd.merge(lyrics,artists,left_on="ALink",right_on="Link")
lyrics_df = lyrics_df[["Artist","Genres","Popularity","Songs","SName","Lyric"]]
lyrics_popular=lyrics_df.query(f"Artist=='{artist}'")
lyrics_popular=lyrics_popular.sort_values(['Songs','Artist'], ascending=[False,True])

results = bleurt.compute(predictions=temp.values, references=lyrics_popular["Lyric"][:3].values)
# Normally, we use the f1-score attribute
print("GPT-2:")
print(results.values())

#Using first x songs as referencce

2022-12-03 18:15:52.474289: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/slurm/18.08.9/lib64/slurm:/cm/shared/apps/slurm/18.08.9/lib64
2022-12-03 18:15:52.474330: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Using default BLEURT-Base checkpoint for sequence maximum length 128. You can use a bigger model for better results with e.g.: datasets.load_metric('bleurt', 'bleurt-large-512').


INFO:tensorflow:Reading checkpoint /home/kyi/.cache/huggingface/metrics/bleurt/default/downloads/extracted/04db1fcea10999e5cc231dbfc408fcc03c8f60e13f3fac524f02511381ec77e0/bleurt-base-128.
INFO:tensorflow:Config file found, reading.
INFO:tensorflow:Will load checkpoint bert_custom
INFO:tensorflow:Loads full paths and checks that files exists.
INFO:tensorflow:... name:bert_custom
INFO:tensorflow:... vocab_file:vocab.txt
INFO:tensorflow:... bert_config_file:bert_config.json
INFO:tensorflow:... do_lower_case:True
INFO:tensorflow:... max_seq_length:128
INFO:tensorflow:Creating BLEURT scorer.
INFO:tensorflow:Creating WordPiece tokenizer.
INFO:tensorflow:WordPiece tokenizer instantiated.
INFO:tensorflow:Creating Eager Mode predictor.
INFO:tensorflow:Loading model.


2022-12-03 18:16:21.054444: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /cm/shared/apps/slurm/18.08.9/lib64/slurm:/cm/shared/apps/slurm/18.08.9/lib64
2022-12-03 18:16:21.054541: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-12-03 18:16:21.054563: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (c0169): /proc/driver/nvidia/version does not exist
2022-12-03 18:16:21.055087: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


INFO:tensorflow:BLEURT initialized.
RNN:
dict_values([[-1.5755281448364258, -1.3506577014923096, -1.3700361251831055, -1.4300510883331299, -1.416689395904541]])
LSTM:
dict_values([[-1.398155927658081, -1.4942516088485718, -1.4351022243499756, -1.5420317649841309, -1.3202416896820068]])
GPT-2:
dict_values([[-1.5025784969329834, -1.3701467514038086, -1.4091140031814575]])


# Manual Evaluation
Seems like the best way to evaluate something like lyric generation is manually evaluating it. 