In [None]:
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from utils import Attention
from keras.layers import Embedding

import matplotlib.pyplot as plt
import seaborn as sns 
plt.style.use('fivethirtyeight')
%matplotlib inline

# The Data

In [None]:
df = pd.read_csv('train-lyrics.csv')
test_df = pd.read_csv('test-lyrics.csv')

# Integer class id for each genre string found in the 'genre' column.
GENRE_TO_LABEL = {"country": 0, "pop": 1, "r-b": 2, "rock": 3, "rap": 4}

joined_genres = pd.concat([df['genre'], test_df['genre']]).reset_index(drop=True)

# Encode every genre and fail loudly on anything unexpected. Silently skipping
# an unknown/missing genre would leave `cat_labels` shorter than `texts` and
# shift every following label onto the wrong lyric.
encoded_genres = joined_genres.map(GENRE_TO_LABEL)
if encoded_genres.isna().any():
    unknown_genres = sorted(set(joined_genres[encoded_genres.isna()].astype(str)))
    raise ValueError(f"Unrecognised genre value(s) in the lyrics data: {unknown_genres}")

cat_labels = encoded_genres.astype(int).tolist()

# Lyrics in the same (train, then test) order as `cat_labels`.
texts = pd.concat([df['input texts'], test_df['input texts']]).reset_index(drop=True)

# Loading in Pre-trained Word Vectors (GloVe)

In [None]:
def load_glove(file):
    """Read word vectors from a whitespace-separated text file.

    Each usable line holds a token followed by its vector components.
    Lines with fewer than two whitespace-separated fields are skipped.

    Parameters
    ----------
    file : str or path-like
        Path to the UTF-8 encoded vectors file.

    Returns
    -------
    dict
        Maps each word to a 1-D numpy array of floats.

    Raises
    ------
    ValueError
        If a vector component cannot be parsed as a float.
    """
    glove_model = {}

    # The context manager guarantees the handle is closed, even when a line
    # fails to parse part-way through the file.
    with open(file, 'r', encoding='utf8') as f:
        for line in f:
            split_lines = line.split()
            if len(split_lines) > 1:
                word = split_lines[0]
                word_embedding = np.array([float(value) for value in split_lines[1:]])

                glove_model[word] = word_embedding

    print(len(glove_model), " words loaded")
    return glove_model

# Relative location of the 200-dimensional GloVe vectors file.
glove_path = '../glove.6B.200d.txt'

# word -> vector lookup used below to seed the embedding matrix
glove_pretrained = load_glove(file=glove_path)

## Creating Vocabulary and Tokenizer

In [None]:
NUM_WORDS = 200

# Fit the tokenizer's vocabulary on the combined train + test lyrics.
tokenizer = Tokenizer(num_words=NUM_WORDS, filters='’()``''')
tokenizer.fit_on_texts(texts)

# word -> integer id lookup, and every lyric converted to a list of ids
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)

# NOTE(review): `word_index` appears to list every token seen in `texts`, not
# only the NUM_WORDS most frequent ones - confirm against the Keras Tokenizer docs.
vocab_size = len(word_index)
vocab_size

In [None]:
MAX_SEQUENCE_LENGTH = 100

# Data matrix: one row of MAX_SEQUENCE_LENGTH token ids per lyric
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Label matrix: integer genre ids expanded into one-hot rows
labels = keras.utils.to_categorical(cat_labels)
labels

### Creating Embedding Matrix from GloVe Embeddings

In [None]:
# Number of distinct tokens in the fitted tokenizer's word index
len(word_index)

In [None]:
# Creating our embeddings matrix 

EMBEDDING_DIM = 200

# One row per word id, plus row 0, which `word_index` never points at
# (presumably the padding id - confirm against the Keras Tokenizer docs).
embeddings_matrix = np.zeros((vocab_size + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = glove_pretrained.get(word)

    # Words without a pre-trained vector keep their all-zero row.
    if embedding_vector is None:
        continue

    # Explicit shape check instead of a bare `except:`: a vector of the wrong
    # size is reported and skipped, while unrelated errors (including
    # KeyboardInterrupt) are no longer swallowed.
    if np.shape(embedding_vector) != (EMBEDDING_DIM,):
        print(word)
        continue

    embeddings_matrix[i] = embedding_vector

In [None]:
# Display the embedding matrix (wrapped in a one-element list, the same form passed as `weights=` below)
[embeddings_matrix]

# Data Splitting 

In [None]:
# Fixed seed so the split, and every metric computed from it, is reproducible
# after a kernel restart.
SPLIT_SEED = 42

# Stratify on the one-hot labels so both splits keep the same genre balance.
X_train, X_test, y_train, y_test = train_test_split(data, 
                                                    labels,
                                                    stratify=labels,
                                                    random_state=SPLIT_SEED)

In [None]:
# Column sums of the one-hot training labels, i.e. number of training samples per class
y_train.sum(axis=0)

In [None]:
# Column sums of the one-hot test labels, i.e. number of test samples per class
y_test.sum(axis=0)

In [None]:
# Sanity check: shapes of the four arrays produced by the split
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Number of output classes = width of the one-hot label matrix
num_labels = y_train.shape[1]
num_labels

# Baseline Model 1 - LSTM 

In [None]:
def create_model(rnn_units=200,
                 optimizer=None, 
                 loss=tf.keras.losses.CategoricalCrossentropy(),
                 metrics=["accuracy"]):
    """Build and compile the baseline LSTM classifier.

    Architecture: non-trainable embedding layer initialised from the
    notebook-level ``embeddings_matrix`` -> one LSTM -> softmax over
    ``num_labels`` classes. Also reads the notebook-level ``vocab_size``,
    ``EMBEDDING_DIM`` and ``MAX_SEQUENCE_LENGTH``.

    Parameters
    ----------
    rnn_units : int
        Number of units in the LSTM layer.
    optimizer : optional
        Optimizer passed to ``model.compile``. When ``None`` (the default) a
        new ``tf.keras.optimizers.Adam()`` is created for this model.
    loss :
        Loss passed to ``model.compile``.
    metrics : list
        Metrics passed to ``model.compile``.

    Returns
    -------
    keras.Model
        The compiled model.
    """
    # An optimizer instance written in the signature is created once, when the
    # function is defined, and would be shared by every model this function
    # builds. Create a fresh one per call instead.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam()
    
    # Embedding layer
    embedding_layer = Embedding(vocab_size + 1, 
                            EMBEDDING_DIM, 
                            weights=[embeddings_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH, 
                            trainable=False, 
                            name="embedding_layer") 
    
    # Named "input_layer" (no space) to match the other layer names;
    # whitespace in layer names may be rejected by some TensorFlow versions.
    sequence_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), 
                                  dtype="int32", 
                                  name="input_layer")
    
    embeddings = embedding_layer(sequence_input)
    
    rnn_output = layers.LSTM(rnn_units,
                             name="LSTM")(embeddings)
    
    output = layers.Dense(num_labels, 
                          activation="softmax", 
                          name="output_layer")(rnn_output)
    
    model = keras.Model(inputs=sequence_input, outputs=output)
    
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that would add a stray "None" line).
    model.summary()
    
    model.compile(optimizer=optimizer, 
                  loss=loss, 
                  metrics=metrics)
    
    return model

In [None]:
def create_model_training_df(history_dict, num_epochs):
    """Turn a training-history mapping into a DataFrame indexed by epoch.

    Parameters
    ----------
    history_dict : dict
        Maps a metric name to its list of per-epoch values.
    num_epochs : int
        Number of epochs recorded; the index runs from 1 to ``num_epochs``.

    Returns
    -------
    pandas.DataFrame
        One column per metric, indexed by an integer index named "epoch".
    """
    epoch_numbers = [epoch for epoch in range(1, num_epochs + 1)]
    history_frame = pd.DataFrame(history_dict)
    return history_frame.set_index(pd.Index(data=epoch_numbers, name="epoch"))

def plot_model_performance(df, model_name):
    """Draw two line charts from a training-history frame: loss, then accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'loss', 'val_loss', 'accuracy' and
        'val_accuracy'.
    model_name : str
        Prefix used in both figure titles.
    """
    panels = (
        (['loss', 'val_loss'], " Loss by Epochs"),
        (['accuracy', 'val_accuracy'], " Accuracy by Epochs"),
    )

    for metric_columns, title_suffix in panels:
        # Select the columns first so a missing metric fails before a figure opens.
        metric_frame = df[metric_columns]
        plt.figure(figsize=(7, 7))
        sns.lineplot(data=metric_frame)
        plt.title(model_name + title_suffix)
        plt.show()
    
    
def evaluate_model(model):
    """Evaluate ``model`` on the notebook-level ``X_test`` / ``y_test`` split.

    Returns whatever ``model.evaluate`` returns for that split.
    """
    test_metrics = model.evaluate(X_test, y_test)
    return test_metrics

In [None]:
def fit_model(create_fn, epochs=10, batch_size=20):
    """Build a model with ``create_fn`` and train it on the notebook's training split.

    Parameters
    ----------
    create_fn : callable
        Zero-argument factory returning a compiled model.
    epochs : int, optional
        Number of training epochs (default 10, the previously hard-coded value).
    batch_size : int, optional
        Mini-batch size (default 20, the previously hard-coded value).

    Returns
    -------
    tuple
        ``(model, history)``: the trained model and the object returned by
        ``model.fit``.
    """
    model = create_fn()
    
    # NOTE(review): the test split doubles as validation data, so the
    # `val_*` curves are computed on the same samples later used for the final
    # evaluation.
    history = model.fit(X_train, y_train, 
                        epochs=epochs, 
                        batch_size=batch_size, 
                        validation_data=(X_test, y_test),
                        verbose=1)
    
    return model, history

In [None]:
# Build and train the baseline LSTM; keep both the model and its fit history
baseline_model_one, history = fit_model(create_model)

In [None]:
# Tabulate the baseline run's per-epoch metrics, then chart loss and accuracy.
model_one_history_df = create_model_training_df(history.history, num_epochs=10)
plot_model_performance(df=model_one_history_df, model_name="LSTM Baseline Model")

In [None]:
# Loss and accuracy of the baseline model on the test split
baseline_model_one.evaluate(X_test, y_test)

# Improving Model 1

In [None]:
def create_model_imp(rnn_units=200,
                 optimizer=None, 
                 loss=tf.keras.losses.CategoricalCrossentropy(),
                 metrics=["accuracy"]):
    """Build and compile the LSTM classifier with trainable embeddings and dropout.

    Architecture: trainable embedding layer initialised from the
    notebook-level ``embeddings_matrix`` -> one LSTM -> Dropout(0.5) ->
    softmax over ``num_labels`` classes.

    Parameters
    ----------
    rnn_units : int
        Number of units in the LSTM layer.
    optimizer : optional
        Optimizer passed to ``model.compile``. When ``None`` (the default) a
        new ``tf.keras.optimizers.Adam()`` is created for this model.
    loss :
        Loss passed to ``model.compile``.
    metrics : list
        Metrics passed to ``model.compile``.

    Returns
    -------
    keras.Model
        The compiled model.
    """
    # An optimizer instance written in the signature is created once, when the
    # function is defined, and would be shared by every model this function
    # builds. Create a fresh one per call instead.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam()
    
    # Embedding layer
    embedding_layer = Embedding(vocab_size + 1, 
                            EMBEDDING_DIM, 
                            weights=[embeddings_matrix], 
                            input_length=MAX_SEQUENCE_LENGTH, 
                            trainable=True) # Changed weights to be trainable 
    
    sequence_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), 
                                  dtype="int32")
    
    embeddings = embedding_layer(sequence_input)
    
    rnn_output = layers.LSTM(rnn_units)(embeddings)
    
    dropout = layers.Dropout(0.5)(rnn_output) # Added a dropout layer to reduce overfitting 
    
    output = layers.Dense(num_labels, 
                          activation="softmax")(dropout)
    
    model = keras.Model(inputs=sequence_input, outputs=output)
    
    model.compile(optimizer=optimizer, 
                  loss=loss, 
                  metrics=metrics)
    
    return model

In [None]:
# Build and train the trainable-embedding + dropout variant
improved_model_one, improved_model_one_history = fit_model(create_model_imp)

In [None]:
# Tabulate this run's per-epoch metrics, then chart loss and accuracy.
improved_model_history_df = create_model_training_df(improved_model_one_history.history,
                                                     num_epochs=10)
plot_model_performance(df=improved_model_history_df,
                       model_name="LSTM Model (trainable weights)")

In [None]:
# Loss and accuracy of the trainable-embedding model on the test split
improved_model_one.evaluate(X_test, y_test)

# Improving Model 1 Pt. 2

In [None]:
def create_model_impt_pt2(rnn_units=200,
                 optimizer=None, 
                 loss=tf.keras.losses.CategoricalCrossentropy(),
                 metrics=["accuracy"]):
    """Build and compile the stacked (two-layer) LSTM classifier.

    Architecture: trainable embedding layer initialised from the
    notebook-level ``embeddings_matrix`` -> LSTM (returning the full
    sequence) -> Dropout(0.5) -> LSTM -> Dropout(0.5) -> softmax over
    ``num_labels`` classes.

    Parameters
    ----------
    rnn_units : int
        Number of units in each LSTM layer.
    optimizer : optional
        Optimizer passed to ``model.compile``. When ``None`` (the default) a
        new ``tf.keras.optimizers.Adam()`` is created for this model.
    loss :
        Loss passed to ``model.compile``.
    metrics : list
        Metrics passed to ``model.compile``.

    Returns
    -------
    keras.Sequential
        The compiled model.
    """
    # An optimizer instance written in the signature is created once, when the
    # function is defined, and would be shared by every model this function
    # builds. Create a fresh one per call instead.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam()
    
    # Embedding layer
    embedding_layer = Embedding(vocab_size + 1, 
                                EMBEDDING_DIM, 
                                weights=[embeddings_matrix], 
                                input_length=MAX_SEQUENCE_LENGTH, 
                                trainable=True) 
    
    
    model = keras.Sequential()
    
    model.add(layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), 
                                  dtype="int32"))
    
    model.add(embedding_layer)
    
    # return_sequences=True so the second LSTM receives one vector per timestep
    model.add(layers.LSTM(rnn_units,
                          return_sequences=True))
    
    model.add(layers.Dropout(0.5))
    
    # Added a second LSTM layer (the original line was missing its closing
    # parenthesis, which made the whole cell a SyntaxError)
    model.add(layers.LSTM(rnn_units))
    
    model.add(layers.Dropout(0.5)) # And a second dropout layer
    
    
    model.add(layers.Dense(num_labels,
                           activation="softmax"))
    
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that would add a stray "None" line).
    model.summary()
    
    model.compile(optimizer=optimizer, 
                  loss=loss, 
                  metrics=metrics)
    
    return model

In [None]:
# Build and train the stacked-LSTM variant
improved_model_one_pt2, improved_model_one_pt2_history = fit_model(create_model_impt_pt2)

In [None]:
# Tabulate this run's per-epoch metrics, then chart loss and accuracy.
improved_model_pt2_history_df = create_model_training_df(improved_model_one_pt2_history.history,
                                                         num_epochs=10)
plot_model_performance(df=improved_model_pt2_history_df,
                       model_name="Stacked LSTM Model")

In [None]:
# Loss and accuracy of the stacked-LSTM model on the test split
improved_model_one_pt2.evaluate(X_test, y_test)

# Improving Model 1 Pt. 3

In [None]:
def create_model_pt3(rnn_units=200,
                     a_units=200, 
                     optimizer=None, 
                     loss=tf.keras.losses.CategoricalCrossentropy(),
                     metrics=["accuracy"]):
    """Build and compile the LSTM classifier with an attention layer.

    Architecture: trainable embedding layer initialised from the
    notebook-level ``embeddings_matrix`` -> LSTM (returning the full sequence
    and its final states) -> ``Attention`` (imported from ``utils``) ->
    Dense(50) -> softmax over ``num_labels`` classes.

    Parameters
    ----------
    rnn_units : int
        Number of units in the LSTM layer.
    a_units : int
        Argument passed to the ``Attention`` constructor.
    optimizer : optional
        Optimizer passed to ``model.compile``. When ``None`` (the default) a
        new ``tf.keras.optimizers.Adam(learning_rate=0.001)`` is created for
        this model.
    loss :
        Loss passed to ``model.compile``.
    metrics : list
        Metrics passed to ``model.compile``.

    Returns
    -------
    keras.Model
        The compiled model.
    """
    # An optimizer instance written in the signature is created once, when the
    # function is defined, and would be shared by every model this function
    # builds. Create a fresh one per call instead.
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    
    sequence_input = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), 
                                  dtype="int32")
    
    # Embedded sequence (this is the layer's output tensor, not the layer object)
    embeddings = Embedding(vocab_size + 1, 
                           EMBEDDING_DIM, 
                           weights=[embeddings_matrix], 
                           input_length=MAX_SEQUENCE_LENGTH, 
                           trainable=True)(sequence_input) 
    
    # return_state=True additionally yields the final hidden and cell states;
    # only the hidden state is used below.
    lstm_out, hidden_h, _hidden_c = layers.LSTM(rnn_units, 
                                                name="LSTM", 
                                                return_sequences=True, 
                                                return_state=True)(embeddings)
    
    
    # `Attention` comes from the local `utils` module; it is called with the
    # per-timestep outputs and the final hidden state and returns a pair.
    # NOTE(review): presumably (context vector, attention weights) - confirm in utils.
    x, attn_weights = Attention(a_units)(lstm_out, hidden_h)
    
    
    # NOTE(review): no activation is given, so this Dense layer is linear -
    # confirm that is intended.
    x = layers.Dense(50)(x)
    
    
    output = layers.Dense(num_labels,
                          activation="softmax")(x)
    
    model = keras.Model(inputs=sequence_input, outputs=output)
    
    # summary() prints the table itself and returns None, so it is not wrapped
    # in print() (that would add a stray "None" line).
    model.summary()
    
    model.compile(optimizer=optimizer, 
                  loss=loss, 
                  metrics=metrics)
    
    return model

In [None]:
# Build and train the attention variant
improved_model_pt3, improved_model_pt3_history = fit_model(create_model_pt3)

In [None]:
# Tabulate this run's per-epoch metrics, then chart loss and accuracy.
improved_model_pt3_df = create_model_training_df(improved_model_pt3_history.history,
                                                 num_epochs=10)
plot_model_performance(df=improved_model_pt3_df,
                       model_name="LSTM Model (w Attention)")

In [None]:
# Loss and accuracy of the attention model on the test split
evaluate_model(improved_model_pt3)

In [None]:
from sklearn.metrics import confusion_matrix

# Genre names in the order of their integer class ids (0..4) used for `cat_labels`.
genre_lst = ["country", "pop", "r-b", "rock", "rap"]

y_pred = improved_model_pt3.predict(X_test)

# Pass `labels` explicitly so the matrix always has one row/column per genre,
# in `genre_lst` order, even if some class never appears in the truth or the
# predictions. Otherwise the tick labels below could be misaligned.
cm = confusion_matrix(y_test.argmax(axis=1),
                      y_pred.argmax(axis=1),
                      labels=list(range(len(genre_lst))))

# Rows are the true genres and columns the predicted genres; annotate the
# cells with the raw counts so the figure can be read on its own.
ax = sns.heatmap(cm, cmap="Blues", 
                 annot=True, fmt="d",
                 xticklabels=genre_lst, 
                 yticklabels=genre_lst)
ax.set_xlabel("Predicted genre")
ax.set_ylabel("True genre")

plt.title("Confusion Matrix");

# Comparing all versions of Model 1

## Baseline Model 1

In [None]:
# Architecture diagram of the baseline model, with tensor shapes shown
keras.utils.plot_model(baseline_model_one, show_shapes=True, dpi=90)

In [None]:
# Test-split loss and accuracy of the baseline model, repeated here for side-by-side comparison
baseline_model_one.evaluate(X_test, y_test)

## Improved Model 1 Version 1

In [None]:
# Architecture diagram of the trainable-embedding + dropout model, with tensor shapes shown
keras.utils.plot_model(improved_model_one, show_shapes=True, dpi=90)

In [None]:
# Test-split loss and accuracy of the trainable-embedding model, repeated here for side-by-side comparison
improved_model_one.evaluate(X_test, y_test)

## Improved Model 1 Version 2

In [None]:
# Architecture diagram of the stacked-LSTM model, with tensor shapes shown
keras.utils.plot_model(improved_model_one_pt2, show_shapes=True, dpi=90)

In [None]:
# Test-split loss and accuracy of the stacked-LSTM model, repeated here for side-by-side comparison
improved_model_one_pt2.evaluate(X_test, y_test)

## Improved Model 1 Version 3

In [None]:
# Architecture diagram of the attention model, with tensor shapes shown
keras.utils.plot_model(improved_model_pt3, show_shapes=True, dpi=90)

In [None]:
# Test-split loss and accuracy of the attention model, repeated here for side-by-side comparison
improved_model_pt3.evaluate(X_test, y_test)