In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Load the three splits, keeping only the columns used below: both emotion
# label variants and the raw lyrics.
# NOTE(review): the training CSV presumably also has 'title', 'artist', 'genre'
# and 'general_genre' columns; add them to usecols if they are ever needed.
train_df, val_df, test_df = [
    pd.read_csv(csv_name, usecols=['emotion_4Q', 'emotion_2Q', 'lyrics'])
    for csv_name in ("train_dataset.csv", "val_dataset.csv", "test_dataset.csv")
]

In [None]:
#train_df.head()

In [None]:
# Data Preprocessing
# Convert emotion labels to one-hot encoding.
# The training split defines the class columns. Validation and test are
# re-indexed onto exactly those columns so that column k denotes the same
# emotion in every split, even when a split happens to lack one of the classes
# (independent get_dummies calls would then produce fewer or shifted columns
# and silently mislabel the evaluation data).
# float32 is requested explicitly so the label dtype does not depend on the
# installed pandas version.
train_labels = pd.get_dummies(train_df['emotion_4Q'], dtype='float32')
val_labels = pd.get_dummies(val_df['emotion_4Q'], dtype='float32').reindex(
    columns=train_labels.columns, fill_value=0.0)
test_labels = pd.get_dummies(test_df['emotion_4Q'], dtype='float32').reindex(
    columns=train_labels.columns, fill_value=0.0)

# Convert lyrics to numpy arrays
train_lyrics = np.array(train_df['lyrics'])
val_lyrics = np.array(val_df['lyrics'])
test_lyrics = np.array(test_df['lyrics'])

In [None]:
# Creating TensorFlow Dataset Objects
def df_to_dataset(dataframe, labels, shuffle=True, batch_size=1024):
    """Wrap inputs and labels in a batched, prefetching ``tf.data.Dataset``.

    Args:
        dataframe: Sliceable collection of model inputs; its ``len()`` is used
            as the shuffle buffer size.
        labels: Targets aligned element-wise with ``dataframe``.
        shuffle: When true, shuffle the elements before batching.
        batch_size: Number of elements per batch.

    Returns:
        The dataset of ``(inputs, labels)`` batches.
    """
    pipeline = tf.data.Dataset.from_tensor_slices((dataframe, labels))
    if shuffle:
        # A buffer as large as the input means every element can be drawn.
        pipeline = pipeline.shuffle(buffer_size=len(dataframe))
    return pipeline.batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Build the batched pipelines from the raw (uncleaned) lyrics.
# Only the training split is shuffled. Validation and test keep their original
# row order so that predictions can be matched back to the source rows and no
# shuffle buffer is filled for data that is only evaluated.
train_data = df_to_dataset(train_lyrics, train_labels)
val_data = df_to_dataset(val_lyrics, val_labels, shuffle=False)
test_data = df_to_dataset(test_lyrics, test_labels, shuffle=False)

In [None]:
import re
import nltk
from nltk.corpus import stopwords

# Download the NLTK stopwords corpus (required by stopwords.words below)
nltk.download('stopwords')

# Get English stopwords as a set; clean_text uses it for membership tests
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one lyric string and drop English stopwords.

    Steps: remove ``[...]`` annotations, lowercase, split on whitespace, drop
    stopwords, strip every character that is not a letter or digit, and join
    the surviving tokens with single spaces.

    Stopwords are matched per token *before* apostrophes are stripped.
    Stripping first turns "don't" into "dont", which is not in the stopword
    list and would therefore survive the filter. The stripped form is checked
    as well, so everything the strip-then-filter order removed is still removed.

    Args:
        text: Raw lyric text. Non-string values (e.g. NaN for a missing lyric)
            are treated as empty.

    Returns:
        The cleaned, space-separated lowercase string (possibly empty).
    """
    if not isinstance(text, str):
        return ''

    # Remove text within square brackets, then convert to lowercase
    text = re.sub(r'\[.*?\]', '', text).lower()

    kept = []
    for token in text.split():
        # Typographic apostrophes appear in lyrics; the stopword list uses "'".
        token = token.replace('\u2019', "'")
        # Trim surrounding punctuation so "know," or "(you're)" can match.
        core = re.sub(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$', '', token)
        # Remove the remaining non-alphanumeric characters
        bare = re.sub(r'[^a-zA-Z0-9]', '', core)
        if not bare:
            continue
        if core in stop_words or bare in stop_words:
            continue
        kept.append(bare)
    return ' '.join(kept)

# Apply the cleaning function to each lyric in the dataset
train_df['clean_lyrics'] = train_df['lyrics'].apply(clean_text)
val_df['clean_lyrics'] = val_df['lyrics'].apply(clean_text)
test_df['clean_lyrics'] = test_df['lyrics'].apply(clean_text)

# Re-create TensorFlow Dataset objects with cleaned lyrics.
# Only the training split is shuffled; validation and test keep their row
# order so predictions can be matched back to the source rows.
train_data = df_to_dataset(train_df['clean_lyrics'], train_labels)
val_data = df_to_dataset(val_df['clean_lyrics'], val_labels, shuffle=False)
test_data = df_to_dataset(test_df['clean_lyrics'], test_labels, shuffle=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Show the first five cleaned lyrics, each followed by a 50-dash separator
print("Sample cleaned lyrics from training dataset:")
for sample_index in range(5):
    print(train_df['clean_lyrics'].iloc[sample_index], '-' * 50, sep='\n')

In [None]:
# Text Embedding with TensorFlow Hub
# Universal Sentence Encoder v4, wrapped as a Keras layer that takes raw
# strings (dtype=tf.string). trainable=True requests that the encoder's
# weights be updated during model.fit instead of being kept frozen.
embedding = "https://tfhub.dev/google/universal-sentence-encoder/4"
hub_layer = hub.KerasLayer(embedding, dtype=tf.string, trainable=True)

In [None]:
# Model Definition and Compilation
# Sentence-embedding layer, two small Dense+Dropout blocks, and a softmax head
# with one unit per one-hot label column (multi-class classification).
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dropout(0.4),
    tf.keras.layers.Dense(train_labels.shape[1], activation='softmax'),
])

# Categorical cross-entropy pairs with the one-hot labels and softmax output.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.CategoricalCrossentropy(),
    metrics=['accuracy'],
)

In [None]:
# Model Training  (model version 1)
# Train for 8 epochs on the training batches, evaluating on the validation
# data after each epoch; the value returned by fit() is kept in `history`.
history = model.fit(train_data, epochs=8, validation_data=val_data)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Model Evaluation
# Evaluate on the test split; the result is [loss, accuracy], matching the
# loss and the single metric passed to model.compile().
model.evaluate(test_data)



[1.2744420766830444, 0.523809552192688]

In [None]:
# Same evaluation on the validation split ([loss, accuracy]).
model.evaluate(val_data)



[1.2766435146331787, 0.4948805570602417]

In [None]:
# Same evaluation on the training split ([loss, accuracy]); comparing it with
# the validation/test numbers shows how far the fit generalises.
model.evaluate(train_data)



[1.2381280660629272, 0.6245434880256653]

LSTM

In [None]:
import re

def clean_text(text):
    """Normalise one lyric string, keeping stopwords.

    This definition replaces the earlier ``clean_text`` in the notebook and,
    unlike it, performs no stopword filtering.

    Args:
        text: Raw lyric text.

    Returns:
        Lowercased text with ``[...]`` annotations removed, every character
        other than letters, digits and whitespace dropped, whitespace runs
        collapsed to single spaces, and no leading or trailing whitespace.
    """
    # Drop bracketed annotations, then lowercase what remains.
    lowered = re.sub(r'\[.*?\]', '', text).lower()
    # Keep only letters, digits and whitespace.
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    # Collapse whitespace runs and trim the ends.
    collapsed = re.sub(r'\s+', ' ', alnum_only)
    return collapsed.strip()

# Apply the cleaning function to each lyric in the dataset.
# This overwrites the 'clean_lyrics' column produced by the earlier
# stopword-filtering cell with the stopword-preserving version.
train_df['clean_lyrics'] = train_df['lyrics'].apply(clean_text)
val_df['clean_lyrics'] = val_df['lyrics'].apply(clean_text)
test_df['clean_lyrics'] = test_df['lyrics'].apply(clean_text)

# Re-create TensorFlow Dataset objects with cleaned lyrics.
# Only the training split is shuffled; validation and test keep their row
# order so predictions can be matched back to the source rows.
train_data = df_to_dataset(train_df['clean_lyrics'], train_labels)
val_data = df_to_dataset(val_df['clean_lyrics'], val_labels, shuffle=False)
test_data = df_to_dataset(test_df['clean_lyrics'], test_labels, shuffle=False)

In [None]:
# LSTM Model
# Text-to-integer encoder limited to a 2000-entry vocabulary.
encoder = tf.keras.layers.TextVectorization(max_tokens=2000)
# Learn the vocabulary from the training lyrics only; the map() drops the
# label from each (text, label) batch so adapt() sees just the text.
encoder.adapt(train_data.map(lambda text, label: text))

In [None]:
# Fetch the learned vocabulary once and reuse it for both the size and the
# preview of its first 50 entries.
vocab = np.array(encoder.get_vocabulary())
vocab_size = len(vocab)
vocab[:50]

array(['', '[UNK]', 'the', 'you', 'i', 'and', 'to', 'a', 'me', 'it', 'my',
       'in', 'of', 'your', 'on', 'im', 'that', 'is', 'all', 'we', 'for',
       'be', 'so', 'dont', 'its', 'no', 'like', 'with', 'just', 'up',
       'but', 'what', 'love', 'oh', 'this', 'know', 'now', 'got', 'can',
       'if', 'when', 'out', 'do', 'go', 'youre', 'down', 'yeah', 'get',
       'are', 'come'], dtype='<U13')

In [None]:
# Rebinds `model` to the LSTM classifier, built layer by layer:
# raw string -> token ids -> 32-d embeddings -> LSTM -> dense head.
model = tf.keras.Sequential()
model.add(encoder)
# mask_zero=True treats token id 0 as padding to be masked downstream.
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=32, mask_zero=True))
model.add(tf.keras.layers.LSTM(32))
model.add(tf.keras.layers.Dense(32, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
# Output layer for multi-class classification: one unit per label column.
model.add(tf.keras.layers.Dense(train_labels.shape[1], activation='softmax'))


In [None]:
# Adam with learning rate 0.001; categorical cross-entropy pairs with the
# one-hot labels and softmax output; accuracy is reported alongside the loss.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVe  (None, None)              0         
 ctorization)                                                    
                                                                 
 embedding (Embedding)       (None, None, 32)          64000     
                                                                 
 lstm (LSTM)                 (None, 32)                8320      
                                                                 
 dense_3 (Dense)             (None, 32)                1056      
                                                                 
 dropout_2 (Dropout)         (None, 32)                0         
                                                                 
 dense_4 (Dense)             (None, 4)                 132       
                                                      

In [None]:
# Baseline: evaluate the LSTM model on the training data before fit() is
# called on it. Returns [loss, accuracy]; the recorded loss of about 1.386
# (~ln 4) is what near-uniform predictions over 4 classes would give.
model.evaluate(train_data)



[1.3861325979232788, 0.23666910827159882]

In [None]:
# Same pre-training baseline on the validation data ([loss, accuracy]).
model.evaluate(val_data)



[1.3869799375534058, 0.20136518776416779]

In [None]:
# Model Training
# 8 epochs on the training batches with validation after each epoch. This
# rebinds `history`, replacing the value kept from the earlier model's fit().
history = model.fit(train_data, epochs=8, validation_data=val_data)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Model Evaluation
# Evaluate the trained LSTM model on the test split; returns [loss, accuracy].
model.evaluate(test_data)



[1.3779428005218506, 0.28911563754081726]