In [None]:
import pandas as pd
import numpy as np

from transformers import BertTokenizer, TFBertModel
from tensorflow.keras import layers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from utils import Attention
from keras.layers import Embedding

from sklearn.model_selection import train_test_split
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns 
plt.style.use('fivethirtyeight')
%matplotlib inline

In [None]:
def genre_to_numbers(arr):
    """Translate genre names into their integer class ids.

    Ids follow the fixed order country=0, pop=1, r-b=2, rock=3, rap=4.
    Entries that are not one of these five names are skipped, so the
    returned list can be shorter than ``arr``.

    Args:
        arr: Iterable of genre names.

    Returns:
        list[int]: Class ids of the recognised entries, in input order.
    """
    # Position in this tuple is the class id.
    known_genres = ("country", "pop", "r-b", "rock", "rap")
    return [known_genres.index(name) for name in arr if name in known_genres]

def remove_duplicate_words(string):
    """Drop repeated words from a text, keeping each word's first occurrence.

    Words are the whitespace-separated tokens of ``string``; comparison is
    exact (case-sensitive). The surviving words are re-joined with single
    spaces.

    Args:
        string: Text to de-duplicate.

    Returns:
        str: The unique words in first-occurrence order, space separated.
    """
    # dict preserves insertion order, so fromkeys() de-duplicates in a single
    # linear pass (the previous sorted(set(...), key=list.index) was quadratic).
    return " ".join(dict.fromkeys(string.split()))

In [None]:
# Load the two lyric files.
train_df = pd.read_csv('train-lyrics.csv')
test_df = pd.read_csv('test-lyrics.csv')

# Pool both files into single genre / text series with a fresh 0..n-1 index.
joined_genres = pd.concat([train_df['genre'], test_df['genre']]).reset_index(drop=True)

# Integer class ids for the pooled genres; reuses the shared encoder instead
# of repeating its if/elif chain here.
cat_labels = genre_to_numbers(joined_genres)

texts = pd.concat([train_df['input texts'], test_df['input texts']]).reset_index(drop=True)

# Strip quote/bracket artefacts from the lyrics. regex=False forces literal
# matching: "(" and ")" are regex metacharacters and the default of `regex`
# differs between pandas versions.
semicleaned_input_texts = texts
for artefact in ("``", "''", "(", "’", ")"):
    semicleaned_input_texts = semicleaned_input_texts.str.replace(artefact, "", regex=False)

# Number of distinct genres, counted on the test file.
# NOTE(review): this assumes the test file contains every genre — confirm.
num_labels = len(test_df['genre'].value_counts())

new_d = {'genre':joined_genres, 'sem texts':semicleaned_input_texts}
df = pd.DataFrame(new_d)
# 'texts' holds each lyric with repeated words removed.
df['texts'] = df['sem texts'].apply(remove_duplicate_words)
df.head(2)

# Model Utility Functions

In [None]:
def fit_model(create_fn, 
              epochs=20, 
              batch_size=20, 
              **kwargs):
    """Build a model with ``create_fn`` and train it on the tokenized split.

    Training uses the notebook-level ``bert_x_train_tokenized`` / ``y_train``
    and validates on ``bert_x_test_tokenized`` / ``y_test``.

    Args:
        create_fn: Callable returning a compiled model exposing ``fit``.
        epochs: Number of training epochs.
        batch_size: Training batch size.
        **kwargs: Extra keyword arguments forwarded to ``create_fn``.

    Returns:
        tuple: ``(model, history)`` where ``history`` is whatever
        ``model.fit`` returns.
    """
    # Unpack the options as keyword arguments; passing the dict itself would
    # bind it to create_fn's first positional parameter.
    model = create_fn(**kwargs)

    train_inputs = [bert_x_train_tokenized.input_ids,
                    bert_x_train_tokenized.attention_mask]
    validation_inputs = [bert_x_test_tokenized.input_ids,
                         bert_x_test_tokenized.attention_mask]

    history = model.fit(train_inputs,
                        y_train,
                        validation_data=(validation_inputs, y_test),
                        epochs=epochs,
                        batch_size=batch_size)

    return model, history

In [None]:
def create_model_training_df(history_dict, num_epochs):
    """Turn a per-epoch metrics mapping into a frame indexed by epoch number.

    Args:
        history_dict: Mapping of metric name to its per-epoch values.
        num_epochs: Number of epochs recorded; the index runs 1..num_epochs.

    Returns:
        pandas.DataFrame: One column per metric, with an index named
        ``"epoch"``.
    """
    epoch_numbers = list(range(1, num_epochs + 1))
    epoch_axis = pd.Index(data=epoch_numbers, name="epoch")

    metrics_frame = pd.DataFrame(history_dict)
    return metrics_frame.set_index(epoch_axis)

def plot_model_performance(df, model_num):
    """Draw the loss and accuracy curves of a training run, one figure each.

    Args:
        df: Frame with the columns ``loss``, ``val_loss``, ``accuracy`` and
            ``val_accuracy``.
        model_num: Identifier inserted into both figure titles.
    """
    panels = (
        ("Loss", ["loss", "val_loss"]),
        ("Accuracy", ["accuracy", "val_accuracy"]),
    )

    for label, columns in panels:
        curves = df[columns]
        plt.figure(figsize=(7, 7))
        sns.lineplot(data=curves)
        plt.title(f"Model {model_num} {label} by Epochs")
        plt.show()
    
    
def evaluate_model(model):
    """Evaluate ``model`` on the notebook-level tokenized test split.

    Args:
        model: Object exposing a Keras-style ``evaluate(x, y)`` method.

    Returns:
        Whatever ``model.evaluate`` returns for the test inputs and ``y_test``.
    """
    # Both input tensors must travel together as the single `x` argument;
    # passed separately, the attention mask would be taken as the labels and
    # y_test as the batch size.
    test_inputs = [bert_x_test_tokenized.input_ids,
                   bert_x_test_tokenized.attention_mask]

    return model.evaluate(test_inputs, y_test)

## Hugging Face DistilBERT

In [None]:
from transformers import (DistilBertTokenizer, TFDistilBertModel, DistilBertConfig)

In [None]:
# Load the tokenizer published under the 'distilbert-base-cased' checkpoint name.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [None]:
# Drop any previously loaded encoder so this cell can be re-run. On a fresh
# kernel the name does not exist yet, which is the only error worth ignoring.
try:
    del bert
except NameError:
    pass

# return_dict=False makes the encoder return a plain tuple; the model
# builders below index it with [0].
config = DistilBertConfig.from_pretrained("distilbert-base-cased", 
                                          output_hidden_states=True,
                                          output_attentions=True, 
                                          return_dict=False)
bert = TFDistilBertModel.from_pretrained('distilbert-base-cased', config=config)

In [None]:
# Fixed seed so the split, and every metric computed from it, is reproducible
# across kernel restarts.
RANDOM_STATE = 42

# Stratify on genre so both partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(df['texts'],
                                                    df['genre'],
                                                    stratify=df['genre'],
                                                    random_state=RANDOM_STATE)

# The tokenizer is fed plain Python lists of strings.
bert_X_train = X_train.to_list()
bert_X_test = X_test.to_list()
bert_y_train = genre_to_numbers(y_train)
bert_y_test = genre_to_numbers(y_test)

# From here on y_train / y_test are one-hot matrices, not the genre-name
# Series returned by the split above.
y_train = tf.keras.utils.to_categorical(bert_y_train)
y_test = tf.keras.utils.to_categorical(bert_y_test)

In [None]:
# Display the class count; the model builders below use it as the width of
# their softmax output layer.
num_labels

In [None]:
MAX_SEQUENCE_LENGTH = 100

# Both splits are encoded with exactly the same options: no special tokens,
# truncated and padded to MAX_SEQUENCE_LENGTH, returned as TensorFlow tensors.
_encode_options = dict(
    max_length=MAX_SEQUENCE_LENGTH,
    add_special_tokens=False,
    truncation=True,
    padding='max_length',
    return_tensors="tf",
)

bert_x_train_tokenized = tokenizer(bert_X_train, **_encode_options)
bert_x_test_tokenized = tokenizer(bert_X_test, **_encode_options)

In [None]:
def create_model(optimizer=None,
                 loss=tf.keras.losses.CategoricalCrossentropy(),
                 metrics=["accuracy"]):
    """Build and compile the DistilBERT + max-pooling genre classifier.

    Architecture: the notebook-level ``bert`` encoder, then Dense(64, relu)
    applied to its first output, GlobalMaxPool1D, Dropout(0.5) and a softmax
    layer with ``num_labels`` units.

    Args:
        optimizer: Optimizer passed to ``compile``. ``None`` (the default)
            creates a fresh ``Adam(learning_rate=0.0001)`` on every call, so
            no optimizer instance is shared between models.
        loss: Loss passed to ``compile``.
        metrics: Metrics passed to ``compile``.

    Returns:
        tf.keras.Model: Compiled model taking ``[input_ids, attention_mask]``.
    """
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

    input_ids = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ),
                             dtype=tf.int32)
    attention_mask = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ),
                                  dtype=tf.int32)

    # [0] takes the first element of the encoder's output tuple.
    # NOTE(review): presumably the last hidden state, (batch, seq, hidden) — confirm.
    bert_out = bert(input_ids=input_ids,
                    attention_mask=attention_mask)[0]

    x = layers.Dense(64, activation="relu")(bert_out)
    x = layers.GlobalMaxPool1D()(x)
    x = layers.Dropout(0.5)(x)

    # This tensor is the model output; it was previously bound to `x` while
    # the Model below referenced an undefined `classification`.
    classification = layers.Dense(num_labels, activation='softmax')(x)

    model = tf.keras.Model(inputs=[input_ids, attention_mask],
                           outputs=[classification])

    # summary() prints by itself; wrapping it in print() added a stray "None".
    model.summary()

    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metrics)

    return model

In [None]:
# Build the model with create_model and train it for 10 epochs in batches of 50;
# keeps both the fitted model and the object returned by fit().
bert_model, bert_model_history = fit_model(create_model, 
                                           epochs=10,
                                           batch_size=50)

In [None]:
# Take the epoch count from the recorded history rather than repeating the
# literal passed to fit_model, so the two cannot drift apart.
bert_model_df = create_model_training_df(bert_model_history.history,
                                         len(bert_model_history.history['loss']))
plot_model_performance(bert_model_df, 4)

In [None]:
# Evaluate on the tokenized test split; the result is shown as the cell output.
bert_model.evaluate([bert_x_test_tokenized.input_ids, bert_x_test_tokenized.attention_mask], y_test)

# DistilBERT Model with Attention

In [None]:
# Discard the encoder used by the previous model and load the pretrained
# checkpoint again. Only a missing name is ignored; any other error surfaces.
try:
    del bert
except NameError:
    pass

bert = TFDistilBertModel.from_pretrained('distilbert-base-cased', config=config)

def create_model_pt2(hidden_size=100,
                     train_layers=-1,
                     optimizer=None,
                     loss=tf.keras.losses.CategoricalCrossentropy(),
                     metrics=["accuracy"]):
    """Build and compile the DistilBERT + LSTM + attention genre classifier.

    Architecture: the notebook-level ``bert`` encoder, an LSTM(300) returning
    both its sequence and final states, the project's ``Attention(100)``
    layer, Dropout(0.5) and a softmax layer with ``num_labels`` units.

    Args:
        hidden_size: Accepted but not read by this function; the attention
            layer size is fixed at 100. TODO: wire through once callers are
            confirmed to pass an int here.
        train_layers: Accepted but not read by this function.
        optimizer: Optimizer passed to ``compile``. ``None`` (the default)
            creates a fresh ``Adam(learning_rate=0.0001)`` on every call, so
            no optimizer instance is shared between models.
        loss: Loss passed to ``compile``.
        metrics: Metrics passed to ``compile``.

    Returns:
        tf.keras.Model: Compiled model taking ``[input_ids, attention_mask]``.
    """
    if optimizer is None:
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

    input_ids = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)
    attention_mask = layers.Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype=tf.int32)

    # [0] takes the first element of the encoder's output tuple.
    # NOTE(review): presumably the last hidden state, (batch, seq, hidden) — confirm.
    bert_out = bert(input_ids=input_ids,
                    attention_mask=attention_mask)[0]

    # return_sequences + return_state: per-step outputs plus the final hidden
    # and cell states; the cell state is not used further.
    sequence_out, final_hidden, _final_cell = layers.LSTM(300,
                                                          return_sequences=True,
                                                          return_state=True)(bert_out)

    # Attention is imported from the project's utils module.
    # NOTE(review): presumably returns (context vector, attention weights) — confirm.
    context, _attention_weights = Attention(100)(sequence_out, final_hidden)

    x = layers.Dropout(0.5)(context)

    classification = tf.keras.layers.Dense(num_labels,
                                           activation='softmax')(x)

    model = tf.keras.Model(inputs=[input_ids, attention_mask],
                           outputs=[classification])

    # summary() prints by itself; wrapping it in print() added a stray "None".
    model.summary()

    model.compile(optimizer=optimizer,
                  loss=loss,
                  metrics=metrics)

    return model

In [None]:
# Build the model with create_model_pt2 and train it for 10 epochs in batches of 50.
# train_layers=0 is collected by fit_model's **kwargs; create_model_pt2 does
# not read train_layers in its body.
bert_model_pt2, bert_model_pt2_history = fit_model(create_model_pt2, 
                                           train_layers=0,
                                           epochs=10,
                                           batch_size=50)

In [None]:
# Take the epoch count from the recorded history rather than repeating the
# literal passed to fit_model, so the two cannot drift apart.
bert_model_pt2_df = create_model_training_df(bert_model_pt2_history.history,
                                             len(bert_model_pt2_history.history['loss']))
# NOTE(review): the plot for the first DistilBERT model is also titled with
# model number 4 — confirm which label is intended.
plot_model_performance(bert_model_pt2_df, 4)

In [None]:
# Evaluate on the tokenized test split; the result is shown as the cell output.
bert_model_pt2.evaluate([bert_x_test_tokenized.input_ids, bert_x_test_tokenized.attention_mask], y_test)

In [None]:
# Model outputs for the test split; argmax over axis 1 is taken in the
# confusion-matrix cell to get the predicted class id.
y_pred = bert_model_pt2.predict([bert_x_test_tokenized.input_ids, bert_x_test_tokenized.attention_mask])

In [None]:
from sklearn.metrics import confusion_matrix

# Position i in this list is the class id produced by genre_to_numbers.
genre_lst = ["country", "pop", "r-b", "rock", "rap"]

# Pin the label set so the matrix always has one row/column per genre and
# lines up with the tick labels, even if a class is absent from both the true
# and the predicted labels.
cm = confusion_matrix(y_test.argmax(axis=1),
                      y_pred.argmax(axis=1),
                      labels=list(range(len(genre_lst))))

# Rows are true classes, columns are predicted classes; annotate each cell
# with its count so the figure can be read without the colour bar.
ax = sns.heatmap(cm, cmap="Blues",
                 annot=True, fmt="d",
                 xticklabels=genre_lst,
                 yticklabels=genre_lst)
ax.set_xlabel("Predicted genre")
ax.set_ylabel("True genre")

plt.title("Confusion Matrix");