# 8-Genre Deep Models
### **Genre Label: 8**
### **Data: Under sampling (max 7000 for train) || clean_df**
### **Bert Model: Uncase**

In [1]:
#@title Installs
!pip install pydot --quiet
!pip install gensim --quiet
!pip install tensorflow==2.15.0 --quiet #15 13
!pip install tf_keras==2.15.0 --quiet
!pip install tensorflow-datasets==4.8 --quiet #8
!pip install tensorflow-text==2.15.0 --quiet #15
!pip install transformers==4.17 --quiet #4.40.2 #4.37.2



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m475.2/475.2 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m74.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m59.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m442.0/442.0 kB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorstore 0.1.63 requires ml-dtypes>=0.3.1, but you have ml-dtypes 0.2.0 which is incompatible.
tf-ker

In [2]:
import pandas as pd
import numpy as np
import os
import time
import seaborn as sns
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

from tqdm import tqdm
from multiprocessing import Pool, cpu_count


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Pre-Processing

In [4]:
cleaned_df = pd.read_pickle('/content/drive/MyDrive/w266/266 Final Project/Music4all Dataset/cleaned_df_8_genre.pkl')

In [None]:
train_df, remaining_df = train_test_split(cleaned_df, test_size=0.20, random_state=42)
val_df, test_df = train_test_split(remaining_df, test_size=0.50, random_state=42)

print('Training set shape:', train_df.shape)
print('Validation set shape:', val_df.shape)
print('Test set shape:', test_df.shape)

Training set shape: (59256, 8)
Validation set shape: (7407, 8)
Test set shape: (7407, 8)


In [None]:
# random undersample to fit the lowest class which is alternative
from sklearn.utils import resample

max_samples = 7000
undersampled_train_df = pd.DataFrame()

for genre in train_df['broad_genre'].unique():
    genre_df = train_df[train_df['broad_genre'] == genre]
    if len(genre_df) > max_samples:
        genre_df = resample(genre_df, replace=False, n_samples=max_samples, random_state=42)
    undersampled_train_df = pd.concat([undersampled_train_df, genre_df])

print('Training set genre distribution:')
print(undersampled_train_df['broad_genre'].value_counts())
train_df = undersampled_train_df

Training set genre distribution:
broad_genre
Pop                 7000
Jazz and R&B        7000
Rock                7000
Country and Folk    7000
Punk and Metal      7000
Electronic          4324
Hip Hop & Rap       2693
Reggae              2060
Name: count, dtype: int64


In [None]:
# getting the text and tables from train, validation and test
train_text = train_df['lyrics'].tolist()
val_text = val_df['lyrics'].tolist()
test_text = test_df['lyrics'].tolist()

train_label = train_df['broad_genre'].tolist()
val_label = val_df['broad_genre'].tolist()
test_label = test_df['broad_genre'].tolist()

In [None]:
train_df['lyric_length'] = train_df.lyrics.apply(lambda x: len(x.split()))
val_df['lyric_length'] = val_df.lyrics.apply(lambda x: len(x.split()))
test_df['lyric_length'] = test_df.lyrics.apply(lambda x: len(x.split()))

In [None]:
numerical_features = ['release', 'danceability', 'energy', 'valence', 'lyric_length']

# scaling numerical features
scaler = StandardScaler()
X_train_scale = scaler.fit_transform(train_df[numerical_features])
X_val_scale = scaler.transform(val_df[numerical_features])
X_test_scale = scaler.transform(test_df[numerical_features])

In [None]:
# getting rid of \n in the lyrics column
def preprocess_text(text):
  return text.replace('\n', ' ')

train_text = [preprocess_text(text) for text in train_text]
val_text = [preprocess_text(text) for text in val_text]
test_text = [preprocess_text(text) for text in test_text]


In [None]:
encode_label = LabelEncoder()
train_label = encode_label.fit_transform(train_label)
val_label = encode_label.transform(val_label)
test_label = encode_label.transform(test_label)

### Tokenize dataset with expanded genre label

In [None]:
word_len = [len(w.split()) for w in train_text]

sum(word_len) / len(word_len)

192.37491208566826

In [None]:
MAX_SEQUENCE_LENGTH = 250

In [None]:
from transformers import BertTokenizer, TFBertModel
import torch

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# training
train_tokenized = bert_tokenizer(train_text,
                                 max_length=MAX_SEQUENCE_LENGTH,
                                 truncation=True,
                                 padding='max_length',
                                 return_tensors='tf')
train_inputs = [train_tokenized.input_ids, train_tokenized.token_type_ids, train_tokenized.attention_mask]
train_inputs_num = [train_tokenized.input_ids,
                train_tokenized.token_type_ids,
                train_tokenized.attention_mask,
                X_train_scale]
train_labels = np.array(train_label)

# validation
val_tokenized = bert_tokenizer(val_text,
                               max_length=MAX_SEQUENCE_LENGTH,
                               truncation=True,
                               padding='max_length',
                               return_tensors='tf')
val_inputs = [val_tokenized.input_ids, val_tokenized.token_type_ids, val_tokenized.attention_mask]
val_inputs_num = [val_tokenized.input_ids,
              val_tokenized.token_type_ids,
              val_tokenized.attention_mask,
              X_val_scale]
val_labels = np.array(val_label)

# test
test_tokenized = bert_tokenizer(test_text,
                                max_length=MAX_SEQUENCE_LENGTH,
                                truncation=True,
                                padding='max_length',
                                return_tensors='tf')
test_inputs = [test_tokenized.input_ids, test_tokenized.token_type_ids, test_tokenized.attention_mask]
test_inputs_num = [test_tokenized.input_ids,
               test_tokenized.token_type_ids,
               test_tokenized.attention_mask,
               X_test_scale]
test_labels = np.array(test_label)

In [None]:
print('Train input IDs shape:', train_inputs[0].shape)
print('Validation input IDs shape:', val_inputs[0].shape)
print('Test input IDs shape:', test_inputs[0].shape)
print('First tokenized train input:', train_inputs[0][0])
print('First tokenized validation input:', val_inputs[0][0])
print('First tokenized test input:', test_inputs[0][0])

Train input IDs shape: (44077, 250)
Validation input IDs shape: (7407, 250)
Test input IDs shape: (7407, 250)
First tokenized train input: tf.Tensor(
[  101  2514  4558  7200  2113  2272  8307  2963  3275  4558  2617 10436
  4452  3959  2895  6069 12985  2364  8432  5913  3342  2735  2157  3426
  2113  2444  9647  4826  7955 18987  3377  3342 12342  3362  2152  2514
  2907  2292  2066 12342 12342  2051  2088  2210  3553  2146  3201  2157
  2146  2514  2503  2113  4452  3959  2895  6069 12985  2364  8432  5913
  3342  2735  2157  3426  2113  2444  9647  4826  7955 18987  3377  3342
 12342  2425  2360  6211  2051  2689  2568 10587  2215  2342  2066  2342
  7200  4797  2903  4452  3959  2895  6069 12985  2364  8432  5913  3342
  2735  2157  3426  2113  2444  9647  4826  7955 18987  3377  3342 12342
   102     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     

## Model function

In [None]:
#@title Text-Only BERT Model

def text_only_bert(bert_base_model,
                    max_sequence_length=MAX_SEQUENCE_LENGTH,
                    hidden_size = 100,
                    num_hidden = 1,
                    dropout=0.3,
                    learning_rate=2e-5,
                    last_layer_num=11,
                    num_classes=14
                  ):
    # freezing all bert layers except the last transformer block
    for w in bert_base_model.weights:
        if f'layer_._{last_layer_num}' not in w.name and 'pooler' not in w.name:
            w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_base_model(bert_inputs)

    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    for i in range(num_hidden - 1):
      hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name=f'hidden_layer_{i+1}')(hidden)
      hidden = tf.keras.layers.Dropout(dropout)(hidden)


    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(hidden)
    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.0),
                                 loss='sparse_categorical_crossentropy',
                                 metrics=['accuracy'])
    return classification_model

In [None]:
#@title Text & Numerical Feature BERT Model

def text_numerical_bert(bert_base_model,
                          max_sequence_length=MAX_SEQUENCE_LENGTH,
                          hidden_size = 100,
                          num_hidden = 1,
                          dropout=0.3,
                          learning_rate=2e-5,
                          last_layer_num=11,
                          num_classes=14,
                          num_non_text_features=5
                       ):
    # freezing all bert layers except the last transformer block
    for w in bert_base_model.weights:
        if f'layer_._{last_layer_num}' not in w.name and 'pooler' not in w.name:
            w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_base_model(bert_inputs)

    cls_token = bert_out[0][:, 0, :]

    # Non-text features input
    non_text_features_input = tf.keras.layers.Input(shape=(num_non_text_features,), dtype=tf.float32, name='non_text_features')

    # Combine CLS token with non-text features
    combined_features = tf.keras.layers.Concatenate(name='combined_features')([cls_token, non_text_features_input])


    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(combined_features)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    for i in range(num_hidden - 1):
      hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name=f'hidden_layer_{i+1}')(hidden)
      hidden = tf.keras.layers.Dropout(dropout)(hidden)


    classification = tf.keras.layers.Dense(num_classes, activation='softmax', name='classification_layer')(hidden)
    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask, non_text_features_input], outputs=[classification])
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, clipnorm=1.0),
                                 loss='sparse_categorical_crossentropy',
                                 metrics=['accuracy'])
    return classification_model

## Train Models

In [None]:
num_classes = len(encode_label.classes_)
checkpoint_dir = '/content/drive/MyDrive/w266/266 Final Project/model_checkpoints/'
bert_last_layer_num = 11

### Text-Only BERT Model

In [None]:
bert_model_text = text_only_bert(
    bert_model,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    hidden_size=200,
    dropout=0.3,
    learning_rate=2e-5,
    last_layer_num=bert_last_layer_num,
    num_classes = num_classes
)

bert_model_text.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 250)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 250)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 250)]                0         []                            
 tLayer)                                                                                    

In [None]:
#calculate sample weights
sample_weight = np.ones(shape=(len(train_labels),))
for label in range(np.max(train_labels) + 1):
    class_weight = 1 - np.sum(train_labels == label) / len(train_labels)
    sample_weight[train_labels == label] = class_weight

In [None]:
bert_model_text_history = bert_model_text.fit(
    train_inputs,
    train_labels,
    sample_weight=sample_weight,
    validation_data=(val_inputs, val_labels),
    epochs=6,
    batch_size=8
)


Epoch 1/6




Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
val_predictions = bert_model_text.predict(val_inputs)
val_predictions = np.argmax(val_predictions, axis=1)
print(classification_report(val_labels, val_predictions, target_names=encode_label.classes_))


                  precision    recall  f1-score   support

Country and Folk       0.44      0.55      0.49      1002
      Electronic       0.31      0.24      0.27       509
   Hip Hop & Rap       0.78      0.76      0.77       343
    Jazz and R&B       0.47      0.48      0.47      1155
             Pop       0.46      0.47      0.46      1489
  Punk and Metal       0.52      0.74      0.61       989
          Reggae       0.63      0.38      0.48       290
            Rock       0.51      0.35      0.41      1630

        accuracy                           0.49      7407
       macro avg       0.51      0.50      0.50      7407
    weighted avg       0.49      0.49      0.48      7407



### Text & Numerical Feature BERT Model

In [None]:
bert_model_num = text_numerical_bert(
    bert_model,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    hidden_size=200,
    dropout=0.3,
    learning_rate=2e-5,
    last_layer_num=bert_last_layer_num,
    num_classes = num_classes
)

bert_model_num.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask_layer (Inpu  [(None, 250)]                0         []                            
 tLayer)                                                                                          
                                                                                                  
 input_ids_layer (InputLaye  [(None, 250)]                0         []                            
 r)                                                                                               
                                                                                                  
 token_type_ids_layer (Inpu  [(None, 250)]                0         []                            
 tLayer)                                                                                    

In [None]:
bert_model_num_history = bert_model_num.fit(
    train_inputs_num,
    train_labels,
    sample_weight=sample_weight,
    validation_data=(val_inputs_num, val_labels),
    epochs=6,
    batch_size=8
)


Epoch 1/6




Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
val_predictions = bert_model_num.predict(val_inputs_num)
val_predictions = np.argmax(val_predictions, axis=1)
print(classification_report(val_labels, val_predictions, target_names=encode_label.classes_))


                  precision    recall  f1-score   support

Country and Folk       0.46      0.63      0.53      1002
      Electronic       0.31      0.37      0.34       509
   Hip Hop & Rap       0.62      0.82      0.71       343
    Jazz and R&B       0.53      0.43      0.47      1155
             Pop       0.53      0.38      0.44      1489
  Punk and Metal       0.58      0.76      0.66       989
          Reggae       0.54      0.53      0.54       290
            Rock       0.52      0.43      0.47      1630

        accuracy                           0.51      7407
       macro avg       0.51      0.54      0.52      7407
    weighted avg       0.51      0.51      0.50      7407



## Test Models

### Text-Only BERT Model

In [None]:
test_predictions = bert_model_text.predict(test_inputs)
test_predictions = np.argmax(test_predictions, axis=1)
print(classification_report(test_labels, test_predictions, target_names=encode_label.classes_))

                  precision    recall  f1-score   support

Country and Folk       0.44      0.58      0.50      1025
      Electronic       0.34      0.24      0.28       573
   Hip Hop & Rap       0.75      0.77      0.76       335
    Jazz and R&B       0.43      0.45      0.44      1087
             Pop       0.47      0.45      0.46      1536
  Punk and Metal       0.53      0.73      0.61      1051
          Reggae       0.58      0.34      0.42       259
            Rock       0.47      0.33      0.39      1541

        accuracy                           0.48      7407
       macro avg       0.50      0.49      0.48      7407
    weighted avg       0.47      0.48      0.47      7407



### Text & Numerical Feature BERT Model

In [None]:
test_predictions = bert_model_num.predict(test_inputs_num)
test_predictions = np.argmax(test_predictions, axis=1)
print(classification_report(test_labels, test_predictions, target_names=encode_label.classes_))

### Finding Most Common Word Used Between Each Genre

In [5]:
cleaned_df.head()

Unnamed: 0,release,danceability,energy,valence,lyrics,broad_genre,track_name,artist_name
2498,2009,0.635,0.746,0.548,"It's a sunny day, so I got nowhere to hide\nNo...",Pop,rain on me,cheryl
6415,1974,0.319,0.925,0.658,Goering's on the phone to Freiburg\nSay's Will...,Rock,me 262,blue öyster cult
43361,2009,0.357,0.708,0.47,Absorbing your words\nBattles raging within me...,Punk and Metal,nostalgia,the chameleons
21714,2008,0.809,0.913,0.648,"Yeah\nYeah\n\nLove always finds a way, every s...",Pop,one world,the cheetah girls
9623,2008,0.617,0.922,0.6,"Well, it's midnight, damn right\nWe're wound u...",Rock,burn it to the ground,nickelback


In [8]:
import spacy
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_stop]
    return ' '.join(tokens)

cleaned_df['lyrics'] = cleaned_df['lyrics'].apply(preprocess_text)

# getting the genres
unique_genres = cleaned_df['broad_genre'].unique()

genre_vectorizers = {genre: CountVectorizer(stop_words='english') for genre in unique_genres}
genre_word_counts = {}

for genre in unique_genres:
    genre_lyrics = cleaned_df[cleaned_df['broad_genre'] == genre]['lyrics']
    word_count_matrix = genre_vectorizers[genre].fit_transform(genre_lyrics)
    word_counts = word_count_matrix.sum(axis=0)
    word_counts = [(word, word_counts[0, idx]) for word, idx in genre_vectorizers[genre].vocabulary_.items()]
    genre_word_counts[genre] = pd.DataFrame(word_counts, columns=['word', 'count']).sort_values(by='count', ascending=False)

for genre in unique_genres:
    print(f"Top 10 most common words for {genre}:")
    print(genre_word_counts[genre].head(10))
    print("\n")









Top 10 most common words for Pop:
     word  count
46   know  32645
39   love  31696
57     oh  28910
22   like  25624
74   come  17729
106  baby  17434
38   yeah  17346
48   feel  16395
42   time  16392
153  want  16377


Top 10 most common words for Rock:
     word  count
17   know  22974
324  love  18491
88     oh  17523
21   like  16392
184  come  15802
69   time  14785
99   want  12231
164  feel  12012
157  yeah  11408
182  away   9696


Top 10 most common words for Punk and Metal:
     word  count
66   know  11482
12   time   8845
113  like   8039
57   come   7827
91   feel   7430
143  life   7056
96    let   6880
34   away   6224
68    way   6019
71   love   5707


Top 10 most common words for Jazz and R&B:
     word  count
24   know  16190
15   love  12850
153  like  11586
23   baby  11140
138  time  10720
50   come   9984
47   yeah   8546
10   feel   7835
123  want   6919
32     oh   6458


Top 10 most common words for Electronic:
     word  count
72   love   9917
77   know   