In [1071]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from sklearn import feature_selection, feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K
import collections
import pathlib
from collections import defaultdict
from pathlib import Path
import pandas as df
import os
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

## Text preprocessing: stemming and lemmatization

In [1072]:
# this text preprocessor function was originally written by Mauro Di Pietro

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    # clean (convert to lowercase and remove punctuations and   
    # characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()
    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [1159]:
# sorting through and extracting clustered files from their corresponding folders
# i.e. path = '/Users/alina/desktop/475/proj/dataset/Lyrics/Cluster*/[corresponding mood folders]/*.txt'
def sort_files(path):
    results = defaultdict(list)
    for folder in Path(path).iterdir():
        for file in folder.iterdir():
            with open(file, "r") as file_open:
                results["mood_file"].append(file)
                results["filename"].append(file.name)
                results["cluster"].append(path.name)
                results["mood"].append(folder.name)
                results["text"].append(file_open.read())
        df = pd.DataFrame(results)
    return df

In [1074]:
p = "/Users/alina/desktop/475/proj/dataset/Lyrics"

df = pd.DataFrame()
for folder in Path(p).iterdir():
    if os.path.isdir(folder):
        temp = sort_files(folder)
        df = pd.concat([df, temp])

df = df.sort_values("filename")
df.to_csv('lyric_bag.csv', index=False)

In [1075]:
df = pd.read_csv("lyric_bag.csv")

## Cleaning Text

In [1076]:
lst_stopwords = nltk.corpus.stopwords.words("english")

df["text_clean"] = df["text"].apply(lambda x: 
          utils_preprocess_text(x, flg_stemm=False, flg_lemm=True, 
          lst_stopwords=lst_stopwords))
df.head()


Unnamed: 0,mood_file,filename,cluster,mood,text,text_clean
0,/Users/alina/desktop/475/proj/dataset/Lyrics/C...,001.txt,Cluster1,Boisterous,Mama he treats your daughter mean \nMama he tr...,mama treat daughter mean mama treat daughter m...
1,/Users/alina/desktop/475/proj/dataset/Lyrics/C...,003.txt,Cluster1,Boisterous,I plopped down in my easy chair and turned on ...,plopped easy chair turned channel 2 bad gunsli...
2,/Users/alina/desktop/475/proj/dataset/Lyrics/C...,004.txt,Cluster1,Boisterous,Back in black \nI hit the sack \nIt's been too...,back black hit sack long im glad back yes im l...
3,/Users/alina/desktop/475/proj/dataset/Lyrics/C...,007.txt,Cluster1,Boisterous,"Woo, I gave you my money, I gave you my time.\...",woo gave money gave time wanna hurt girl serio...
4,/Users/alina/desktop/475/proj/dataset/Lyrics/C...,008.txt,Cluster1,Boisterous,Is it my imagination \nOr have I finally found...,imagination finally found something worth livi...


## Splitting Dataset

In [1110]:
text = pd.DataFrame()

text['cluster'] = df['cluster']
text['text_clean'] = df['text_clean']

#df_train, df_test = model_selection.train_test_split(text, test_size=0.3)
df_train, df_test = model_selection.train_test_split(text, test_size=0.4)

print(df_train.shape, df_test.shape)

(458, 2) (306, 2)


In [1111]:
train_X = df_train['text_clean']
train_y = df_train['cluster']

test_X = df_test['text_clean']
test_y = df_test['cluster']

In [1112]:
tf_idf = TfidfVectorizer()
X_train_tf = tf_idf.fit_transform(train_X)
X_train_tf = tf_idf.transform(train_X)
print("n_samples: %d, n_features: %d" % X_train_tf.shape)

n_samples: 458, n_features: 6557


In [1113]:
X_test_tf = tf_idf.transform(test_X)
print("n_samples: %d, n_features: %d" % X_test_tf.shape)

n_samples: 306, n_features: 6557


In [1114]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

## Using Naive Bayes

In [1115]:
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train_tf, train_y)

y_pred = naive_bayes_classifier.predict(X_test_tf)

accuracy = metrics.accuracy_score(test_y, y_pred)
auc = metrics.roc_auc_score(y_test, predicted_prob, multi_class="ovr")
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print(metrics.classification_report(test_y, y_pred, target_names=
                                    ['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5']))

Accuracy: 0.373
Auc: 0.671
              precision    recall  f1-score   support

    Cluster1       0.45      0.09      0.16        53
    Cluster2       0.26      0.31      0.28        45
    Cluster3       0.39      0.89      0.54        88
    Cluster4       0.43      0.21      0.28        72
    Cluster5       0.40      0.04      0.08        48

    accuracy                           0.37       306
   macro avg       0.39      0.31      0.27       306
weighted avg       0.39      0.37      0.30       306



In [1116]:
print("Confusion matrix:")
print(metrics.confusion_matrix(test_y, y_pred))

Confusion matrix:
[[ 5 15 24  7  2]
 [ 2 14 26  3  0]
 [ 1  5 78  3  1]
 [ 1 11 45 15  0]
 [ 2  9 28  7  2]]


In [1117]:
loc = random.choice(test_X.index)
print(loc)
random_test = [test_X[loc]]
random_cluster = test_y[loc]
print(random_cluster)
print(random_test)

316
Cluster3
['im 500 mile away home teardrop fell momma note read thing wrote said miss son love come home well didnt pack right back im 500 mile away home away home away home cold tired alone yes im 500 mile away home know road took day left home sure look different guess look different cause time change everything wonder theyll say see boy looking bad oh wonder theyll say get home cant remember ate thumb walk wait im still 500 mile away home luck right id tonight im 500 mile away home away home away home oh im still 500 mile away home']


In [1118]:
tf_idf2 = TfidfVectorizer()
X_train_tf = tf_idf2.fit_transform(train_X)
X_train_tf = tf_idf2.transform(train_X)
X_test_tf = tf_idf2.transform(test_X)

test_input = tf_idf2.transform(random_test)
test_input.shape

(1, 6557)

In [1119]:
res = naive_bayes_classifier.predict(test_input)[0]
if res == 'Cluster1':
    print("cluster 1")
    
elif res == 'Cluster2':
    print("cluster 2")
    
elif res == 'Cluster3':
    print("cluster 3")
    
elif res == 'Cluster4':
    print("cluster 4")
    
elif res == 'Cluster5':
    print("cluster 5")

cluster 3


### Alternative Naive Bayes approach, using pipelines

In [1120]:
corpus = df_train['text_clean'].values
y_train = df_train['cluster'].values

vec = feature_extraction.text.TfidfVectorizer()
vec = vec.fit(corpus)
X_names = vec.get_feature_names_out()
vec = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)

X_train = vec.fit_transform(corpus)
dic_vocabulary = vec.vocabulary_

classifier = naive_bayes.MultinomialNB()
model = pipeline.Pipeline([('vectorizer', vec),('classifier', classifier)])
model['classifier'].fit(X_train, y_train)

X_test = df_test['text_clean'].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

#### Accuracy, Precision, Recall

In [1121]:
accuracy = metrics.accuracy_score(test_y, predicted)
auc = metrics.roc_auc_score(test_y, predicted_prob, multi_class="ovr")
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print(metrics.classification_report(test_y, predicted, target_names= 
                                    ['Cluster1', 'Cluster2', 'Cluster3', 'Cluster4', 'Cluster5']))

Accuracy: 0.373
Auc: 0.692
              precision    recall  f1-score   support

    Cluster1       0.45      0.09      0.16        53
    Cluster2       0.26      0.31      0.28        45
    Cluster3       0.39      0.89      0.54        88
    Cluster4       0.43      0.21      0.28        72
    Cluster5       0.40      0.04      0.08        48

    accuracy                           0.37       306
   macro avg       0.39      0.31      0.27       306
weighted avg       0.39      0.37      0.30       306



## Using bigrams

In [1134]:
df_train, df_test = model_selection.train_test_split(df, test_size=0.2)
y_train = df_train['cluster'].values
y_test = df_test['cluster'].values

vectorizer2 = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))
corpus = df_train['text_clean']
vectorizer2.fit(corpus)
X_train = vectorizer2.transform(corpus)
dic_vocabulary = vectorizer2.vocabulary_

print(X_train.shape)

(611, 10000)


In [1135]:
classifier = naive_bayes.MultinomialNB()
model = pipeline.Pipeline([('vectorizer', vectorizer2),('classifier', classifier)])
model['classifier'].fit(X_train, y_train)

X_test = df_test['text_clean'].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [1136]:
accuracy = metrics.accuracy_score(y_test, predicted)
auc = metrics.roc_auc_score(y_test, predicted_prob, multi_class="ovr")
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print(metrics.classification_report(y_test, predicted))

Accuracy: 0.379
Auc: 0.709
              precision    recall  f1-score   support

    Cluster1       1.00      0.06      0.12        31
    Cluster2       0.29      0.08      0.12        26
    Cluster3       0.39      0.82      0.53        45
    Cluster4       0.36      0.53      0.43        32
    Cluster5       0.00      0.00      0.00        19

    accuracy                           0.38       153
   macro avg       0.41      0.30      0.24       153
weighted avg       0.44      0.38      0.29       153



### Chi-square test

In [1148]:
y = df_train['cluster']
X_names = vectorizer2.get_feature_names_out()
p_value_limit = 0.70
df_features = pd.DataFrame()
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    df_features = df_features.append(pd.DataFrame({'feature':X_names, 'score':1-p, 'cluster':cat}))
    df_features = df_features.sort_values(['cluster','score'], ascending=[True,False])
    df_features = df_features[df_features['score']>p_value_limit]
X_names = df_features['feature'].unique().tolist()

print(df_features.shape)
df_features.head(5)

(669, 3)


Unnamed: 0,feature,score,cluster
1056,ch,0.966676,Cluster1
1523,daughter,0.96509,Cluster1
6599,rock,0.964756,Cluster1
5399,mickey,0.961259,Cluster1
4515,kindsa,0.960823,Cluster1


In [1149]:
y_train = df_train['cluster'].values

vectorizer3 = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
vectorizer3.fit(corpus)
X_train = vectorizer3.transform(corpus)
dic_vocabulary = vectorizer3.vocabulary_

classifier = naive_bayes.MultinomialNB()

model = pipeline.Pipeline([('vectorizer', vectorizer3), ('classifier', classifier)])
model['classifier'].fit(X_train, y_train)

X_test = df_test['text_clean'].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [1150]:
accuracy = metrics.accuracy_score(y_test, predicted)
auc = metrics.roc_auc_score(y_test, predicted_prob, multi_class="ovr")
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print(metrics.classification_report(y_test, predicted))

Accuracy: 0.379
Auc: 0.664
              precision    recall  f1-score   support

    Cluster1       0.33      0.23      0.27        31
    Cluster2       0.24      0.27      0.25        26
    Cluster3       0.41      0.67      0.50        45
    Cluster4       0.48      0.34      0.40        32
    Cluster5       0.50      0.16      0.24        19

    accuracy                           0.38       153
   macro avg       0.39      0.33      0.33       153
weighted avg       0.39      0.38      0.36       153



In [1140]:
for cat in np.unique(y):
   print(cat)
   print("  . selected features:", len(df_features[df_features['cluster']==cat]))
   print("  . top features:", ", ".join(df_features[df_features['cluster']==cat]['feature'].values[:10]))
   print(" ")

Cluster1
  . selected features: 334
  . top features: rock rock, ch, mickey, im back, jenny, lucille, get enough, daughter, gonna rock, rock
 
Cluster2
  . selected features: 289
  . top features: soul man, wah, good time, sunshine, alfie, im soul, happy birthday, roll baby, hold hold, hear music
 
Cluster3
  . selected features: 163
  . top features: always something, something remind, wait, prayer, everybody know, pink, daniel, miss, fall piece, angel
 
Cluster4
  . selected features: 202
  . top features: purple, judy, drug rock, sex drug, jane, mary, eater, purple people, shirley, plug
 
Cluster5
  . selected features: 297
  . top features: cant cant, fa, personality, fa fa, niggaz, daddy come, aint aint, gloria, fuck, attitude
 


## Using trigrams

In [1156]:
df_train, df_test = model_selection.train_test_split(df, test_size=0.2)
y_train = df_train['cluster'].values
y_test = df_test['cluster'].values

vectorizer2 = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,3))
corpus = df_train['text_clean']
vectorizer2.fit(corpus)
X_train = vectorizer2.transform(corpus)
dic_vocabulary = vectorizer2.vocabulary_

In [1157]:
classifier = naive_bayes.MultinomialNB()
model = pipeline.Pipeline([('vectorizer', vectorizer2),('classifier', classifier)])
model['classifier'].fit(X_train, y_train)

X_test = df_test['text_clean'].values
predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [1158]:
accuracy = metrics.accuracy_score(y_test, predicted)
auc = metrics.roc_auc_score(y_test, predicted_prob, multi_class="ovr")
print("Accuracy:",  round(accuracy,3))
print("Auc:", round(auc,3))
print(metrics.classification_report(y_test, predicted))

Accuracy: 0.405
Auc: 0.708
              precision    recall  f1-score   support

    Cluster1       1.00      0.10      0.18        31
    Cluster2       0.67      0.09      0.15        23
    Cluster3       0.38      0.91      0.54        45
    Cluster4       0.43      0.42      0.43        38
    Cluster5       0.00      0.00      0.00        16

    accuracy                           0.41       153
   macro avg       0.50      0.30      0.26       153
weighted avg       0.52      0.41      0.32       153

