In [9]:
import numpy as np
import pandas as pd

In [10]:
data = pd.read_csv('data/forModelTraining.csv')

In [11]:
data['Artist'].value_counts()

Eminem             3000
J. Cole            3000
Nicki Minaj        3000
Lil Pump           3000
Taylor Swift       3000
One Direction      3000
Bruno Mars         3000
Freddie Mercury    3000
Billie Eilish      3000
Hozier             3000
Name: Artist, dtype: int64

In [12]:
data.sample(10)

Unnamed: 0,Artist,Title,Lyrics
28826,Hozier,Whole Lotta Love,"Say it in the street, that's a knock-out"
27786,Hozier,Jackie and Wilson,But you'll come back each time you leave
5005,J. Cole,Immortal,You know the greatest films of all time were never made
900,Eminem,The Warning,But all I felt was shame and you held my lifeless frame
211,Eminem,Godzilla,But I knew you
24142,Billie Eilish,The Hill,When you were sixteen at the yogurt shop
1919,Eminem,Lucky You,"Drunk under a streetlight, I"
18330,Bruno Mars,If I Knew,"Baby doll, when it comes to a lover"
21277,Freddie Mercury,Stop All The Fighting (Non-Album B-Side),"Free of women with madness, their men and bad habits"
2671,Eminem,Normal,'Cause there we are again in the middle of the night


In [13]:
data.reset_index(inplace=True, drop=True)

In [14]:
X = data['Lyrics'].astype(str)

In [15]:
arts = dict(enumerate(data['Artist'].unique()))

In [16]:
arts

{0: 'Eminem',
 1: 'J. Cole',
 2: 'Nicki Minaj',
 3: 'Lil Pump',
 4: 'Taylor Swift',
 5: 'One Direction',
 6: 'Bruno Mars',
 7: 'Freddie Mercury',
 8: 'Billie Eilish',
 9: 'Hozier'}

In [54]:
y = data['Artist'].replace(arts.values(), arts.keys())

In [55]:
from sklearn.model_selection import train_test_split

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [57]:
from tensorflow import keras as kr

In [58]:
tok = kr.preprocessing.text.Tokenizer()

In [59]:
tok.fit_on_texts(X_train)

In [60]:
vocsize = len(tok.word_index)

In [61]:
X.apply(lambda x: len(x.split())).nlargest(10)

18135    216
14192    210
23872    161
41324     92
40726     88
41494     87
41379     83
23873     76
41416     75
14067     74
Name: Lyrics, dtype: int64

In [62]:
seqlen = X.apply(lambda x: len(x.split())).max()

In [63]:
seqlen

216

In [64]:
vec = kr.layers.TextVectorization(max_tokens=vocsize+1, output_sequence_length=216)

In [65]:
vec.adapt(X_train)

In [66]:
X_train = vec(X_train)
X_test = vec(X_test)

In [67]:
model = kr.models.Sequential()

model.add(kr.layers.Embedding(vocsize+1, seqlen, input_length=seqlen))
model.add(kr.layers.GlobalAveragePooling1D())
model.add(kr.layers.Dropout(0.5))
model.add(kr.layers.Dense(seqlen*2, activation=kr.activations.relu))
model.add(kr.layers.Dropout(0.5))
model.add(kr.layers.Dense(seqlen*3, activation=kr.activations.relu))
model.add(kr.layers.Dropout(0.5))
model.add(kr.layers.Dense(len(y.unique()), activation=kr.activations.softmax))

model.compile(optimizer=kr.optimizers.Adam(learning_rate=3e-4), loss=kr.losses.sparse_categorical_crossentropy,
              metrics=kr.metrics.sparse_categorical_accuracy)

In [68]:
early = kr.callbacks.EarlyStopping(monitor='val_sparse_categorical_accuracy', mode='max', patience=20, restore_best_weights=True)

In [69]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 216, 216)          3095280   
                                                                 
 global_average_pooling1d_1   (None, 216)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_3 (Dropout)         (None, 216)               0         
                                                                 
 dense_3 (Dense)             (None, 432)               93744     
                                                                 
 dropout_4 (Dropout)         (None, 432)               0         
                                                                 
 dense_4 (Dense)             (None, 648)               280584    
                                                      

In [30]:
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=200, callbacks=[early])

Epoch 1/200


ValueError: in user code:

    File "C:\Users\harsh\miniconda3\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\harsh\miniconda3\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\harsh\miniconda3\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\harsh\miniconda3\lib\site-packages\keras\engine\training.py", line 993, in train_step
        y_pred = self(x, training=True)
    File "C:\Users\harsh\miniconda3\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "C:\Users\harsh\miniconda3\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 216), found shape=(None, 17)


In [None]:
y_hat = model.predict(X_test)

In [None]:
print('Accuracy: {}%'.format(round((y_hat.argmax(axis=1) == y_test).mean()*100, 2)))

In [274]:
hist = pd.DataFrame(history.history)

In [202]:
import plotly.express as px

In [287]:
sr = 'takea'
hat = model.predict(np.reshape(vec(sr), (-1, 17)))
print(arts[hat.argmax()], '\n', round(hat.max()*100, 2))

Billie Eilish 
 86.24


In [335]:
px.line(hist, title='Model Training', hover_name='variable', labels={'value': 'Metrics', 'index': 'Epoch'})

In [343]:
model.save('finalModel.h5', save_format='h5')

In [337]:
ppc.copy(str(arts))

In [338]:
import tensorflow as tf

In [340]:
xmodel = kr.models.Sequential()
xmodel.add(tf.keras.Input(shape=(1,), dtype=tf.string))
xmodel.add(vec)

In [342]:
xmodel.save('vec_model', save_format='tf')

INFO:tensorflow:Assets written to: vec_model\assets
