In [23]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, SimpleRNN, GRU, LSTM, Bidirectional, Dropout, BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

In [24]:
df = pd.read_csv("tcc_ceds_music.csv")
df

Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.137110,sadness,1.000000
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.647540,0.954819,0.000002,0.325021,0.263240,world/life,1.000000
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.002770,0.002770,0.002770,...,0.002770,0.225422,0.456298,0.585288,0.840361,0.000000,0.351814,0.139112,music,1.000000
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.775350,0.743736,romantic,1.000000
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.001350,0.001350,0.417772,...,0.068800,0.001350,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,82447,mack 10,10 million ways,2019,hip hop,cause fuck leave scar tick tock clock come kno...,78,0.001350,0.001350,0.001350,...,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,obscene,0.014286
28368,82448,m.o.p.,ante up (robbin hoodz theory),2019,hip hop,minks things chain ring braclets yap fame come...,67,0.001284,0.001284,0.035338,...,0.001284,0.001284,0.662082,0.789580,0.004607,0.000002,0.922712,0.797791,obscene,0.014286
28369,82449,nine,whutcha want?,2019,hip hop,get ban get ban stick crack relax plan attack ...,77,0.001504,0.154302,0.168988,...,0.001504,0.001504,0.663165,0.726970,0.104417,0.000001,0.838211,0.767761,obscene,0.014286
28370,82450,will smith,switch,2019,hip hop,check check yeah yeah hear thing call switch g...,67,0.001196,0.001196,0.001196,...,0.001196,0.001196,0.883028,0.786888,0.007027,0.000503,0.508450,0.885882,obscene,0.014286


In [25]:
#remove unneccessary data from dataset
df = df[['lyrics', 'genre']]
df

Unnamed: 0,lyrics,genre
0,hold time feel break feel untrue convince spea...,pop
1,believe drop rain fall grow believe darkest ni...,pop
2,sweetheart send letter goodbye secret feel bet...,pop
3,kiss lips want stroll charm mambo chacha merin...,pop
4,till darling till matter know till dream live ...,pop
...,...,...
28367,cause fuck leave scar tick tock clock come kno...,hip hop
28368,minks things chain ring braclets yap fame come...,hip hop
28369,get ban get ban stick crack relax plan attack ...,hip hop
28370,check check yeah yeah hear thing call switch g...,hip hop


In [26]:
#train test split
# Splitting the data into features (X) and target (y)
X = df.drop(columns=['genre'])
y = df['genre']

# Performing train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Assuming y_train and y_test are your target variables
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.transform(y_test)

In [28]:
#reference: https://www.tensorflow.org/text/guide/word_embeddings
# Vocabulary size and number of words in a sequence.
vocab_size = 10000
sequence_length = 100

vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

# Make a text-only dataset (no labels) and call adapt to build the vocabulary.
vectorize_layer.adapt(X_train)

In [29]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(16, activation='relu'),
  Dense(7)
])

In [30]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [31]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks = [callback])

Epoch 1/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.2022 - loss: 0.4740 - val_accuracy: 0.2615 - val_loss: 0.3833
Epoch 2/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.2747 - loss: 0.3778 - val_accuracy: 0.3004 - val_loss: 0.3734
Epoch 3/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.3268 - loss: 0.3654 - val_accuracy: 0.3369 - val_loss: 0.3652
Epoch 4/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.3828 - loss: 0.3502 - val_accuracy: 0.3625 - val_loss: 0.3586
Epoch 5/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.4325 - loss: 0.3342 - val_accuracy: 0.3676 - val_loss: 0.3558
Epoch 6/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.4693 - loss: 0.3217 - val_accuracy: 0.3693 - val_loss: 0.3547
Epoch 7/15
[1m710/710[0m 

<keras.src.callbacks.history.History at 0x17758ebe440>

In [32]:
model.summary()

Trying RNN:

In [33]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GRU(32, return_sequences=True),
  SimpleRNN(16),
  Dense(7)
])

In [34]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [35]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks= [callback])

Epoch 1/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 27ms/step - accuracy: 0.2279 - loss: 0.4357 - val_accuracy: 0.2463 - val_loss: 0.3896
Epoch 2/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 34ms/step - accuracy: 0.2575 - loss: 0.3847 - val_accuracy: 0.2811 - val_loss: 0.3772
Epoch 3/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 32ms/step - accuracy: 0.3092 - loss: 0.3687 - val_accuracy: 0.3255 - val_loss: 0.3678
Epoch 4/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 62ms/step - accuracy: 0.3699 - loss: 0.3513 - val_accuracy: 0.3433 - val_loss: 0.3640
Epoch 5/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 38ms/step - accuracy: 0.4038 - loss: 0.3373 - val_accuracy: 0.3535 - val_loss: 0.3628
Epoch 6/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 37ms/step - accuracy: 0.4334 - loss: 0.3244 - val_accuracy: 0.3508 - val_loss: 0.3675


<keras.src.callbacks.history.History at 0x17759821bd0>

In [36]:
model.summary()

LSTM RNN:

In [37]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  LSTM(32),
  Dense(7)
])

In [38]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [39]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks= [callback])

Epoch 1/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 40ms/step - accuracy: 0.2268 - loss: 0.4241 - val_accuracy: 0.2442 - val_loss: 0.3883
Epoch 2/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 32ms/step - accuracy: 0.2454 - loss: 0.3864 - val_accuracy: 0.2633 - val_loss: 0.3850
Epoch 3/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 22ms/step - accuracy: 0.2794 - loss: 0.3763 - val_accuracy: 0.3149 - val_loss: 0.3702
Epoch 4/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 43ms/step - accuracy: 0.3378 - loss: 0.3580 - val_accuracy: 0.3343 - val_loss: 0.3670
Epoch 5/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 19ms/step - accuracy: 0.3787 - loss: 0.3448 - val_accuracy: 0.3503 - val_loss: 0.3650
Epoch 6/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 36ms/step - accuracy: 0.4243 - loss: 0.3299 - val_accuracy: 0.3408 - val_loss: 0.3722


<keras.src.callbacks.history.History at 0x17759f33580>

In [40]:
model.summary()

Bidirectional:

In [41]:
embedding_dim=16

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  Bidirectional(LSTM(32, return_sequences=True), input_shape=(5,7)),
  Bidirectional(LSTM(16)),
  Dense(7)
])

  super().__init__(**kwargs)


In [42]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [43]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    min_delta=0,
    patience=0,
    verbose=0,
    mode='auto',
    baseline=None,
    restore_best_weights=False,
    start_from_epoch=0
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=15,
    callbacks= [callback])

Epoch 1/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 87ms/step - accuracy: 0.2241 - loss: 0.4214 - val_accuracy: 0.2463 - val_loss: 0.3885
Epoch 2/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 109ms/step - accuracy: 0.2461 - loss: 0.3850 - val_accuracy: 0.2892 - val_loss: 0.3738
Epoch 3/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 91ms/step - accuracy: 0.3314 - loss: 0.3616 - val_accuracy: 0.3440 - val_loss: 0.3633
Epoch 4/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 139ms/step - accuracy: 0.4253 - loss: 0.3350 - val_accuracy: 0.3651 - val_loss: 0.3564
Epoch 5/15
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 97ms/step - accuracy: 0.4944 - loss: 0.3089 - val_accuracy: 0.3669 - val_loss: 0.3646


<keras.src.callbacks.history.History at 0x1775d0bf4f0>

In [44]:
model.summary()

Test with different params:

In [45]:
embedding_dim=100

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  Bidirectional(LSTM(128, return_sequences=True)),
  Dropout(.5),
  Bidirectional(LSTM(64)),
  BatchNormalization(),
  Dense(64, activation="relu"),
  Dropout(.5),
  Dense(7, activation="softmax")
])

In [46]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

In [48]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,  # Stop after 3 epochs if no improvement
    restore_best_weights=True
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    callbacks= [callback])

Epoch 1/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 176ms/step - accuracy: 0.4965 - loss: 1.3409 - val_accuracy: 0.3570 - val_loss: 1.6571
Epoch 2/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 202ms/step - accuracy: 0.5822 - loss: 1.1426 - val_accuracy: 0.3468 - val_loss: 1.8093
Epoch 3/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 160ms/step - accuracy: 0.6504 - loss: 0.9750 - val_accuracy: 0.3530 - val_loss: 1.9536
Epoch 4/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 164ms/step - accuracy: 0.7138 - loss: 0.8215 - val_accuracy: 0.3344 - val_loss: 2.1880


<keras.src.callbacks.history.History at 0x1777d52f8e0>

In [49]:
model.summary()

Do hyperparameter optimization

In [None]:
from tensorflow import keras
from kerastuner.tuners import RandomSearch

def build_model(hp):
    model = keras.Sequential([
        vectorize_layer,
        Embedding(vocab_size, hp.Int('embedding_dim', min_value=32, max_value=256, step=32), name="embedding"),
        Bidirectional(LSTM(hp.Int('units_1', min_value=32, max_value=256, step=32), return_sequences=True)),
        Dropout(hp.Float('dropout_1', min_value=0.2, max_value=0.5, step=0.1)),
        Bidirectional(LSTM(hp.Int('units_2', min_value=32, max_value=256, step=32))),
        BatchNormalization(),
        Dense(hp.Int('units_3', min_value=32, max_value=256, step=32), activation="relu"),
        Dropout(hp.Float('dropout_2', min_value=0.2, max_value=0.5, step=0.1)),
        Dense(7, activation="softmax")
    ])

    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    return model

  from kerastuner.tuners import RandomSearch


In [None]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,  # Stop after 3 epochs if no improvement
    restore_best_weights=True
)

# Define tuner
tuner = RandomSearch(
    build_model,
    objective='val_accuracy',
    max_trials=10,
    executions_per_trial=2,
    directory='my_dir',
    project_name='text_classification')

# Start the search for the best hyperparameter configuration
tuner.search(X_train, y_train,
             validation_data=(X_test, y_test),
             epochs=50,
             callbacks=[callback])

# Get the best hyperparameters
# best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 10 Complete [00h 40m 41s]
val_accuracy: 0.34140968322753906

Best val_accuracy So Far: 0.35885462164878845
Total elapsed time: 06h 42m 23s


In [None]:
# print(best_hps.values)

{'embedding_dim': 224, 'units_1': 32, 'dropout_1': 0.2, 'units_2': 64, 'units_3': 224, 'dropout_2': 0.2}


Run with new hyperparameters:
{'embedding_dim': 224, 'units_1': 32, 'dropout_1': 0.2, 'units_2': 64, 'units_3': 224, 'dropout_2': 0.2}

In [None]:
embedding_dim=224

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  Bidirectional(LSTM(32, return_sequences=True)),
  Dropout(.2),
  Bidirectional(LSTM(64)),
  BatchNormalization(),
  Dense(224, activation="relu"),
  Dropout(.2),
  Dense(7, activation="softmax")
])

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.CategoricalCrossentropy(),
              metrics=['accuracy'])

In [None]:
callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=3,  # Stop after 3 epochs if no improvement
    restore_best_weights=True
)

model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    callbacks= [callback])

Epoch 1/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m744s[0m 1s/step - accuracy: 0.2750 - loss: 1.7652 - val_accuracy: 0.3381 - val_loss: 1.6885
Epoch 2/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 66ms/step - accuracy: 0.4571 - loss: 1.4206 - val_accuracy: 0.3239 - val_loss: 2.0978
Epoch 3/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 99ms/step - accuracy: 0.5718 - loss: 1.1505 - val_accuracy: 0.3609 - val_loss: 1.8098
Epoch 4/50
[1m710/710[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 139ms/step - accuracy: 0.6775 - loss: 0.8815 - val_accuracy: 0.3350 - val_loss: 2.1807


<keras.src.callbacks.history.History at 0x177436add80>

In [None]:
model.summary()