## LOAD DATA

In [1]:
import collections
import pathlib
import re
import string

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("dataset/tf/main_lines_verses.csv", encoding= 'unicode_escape')

In [3]:
from sklearn.utils import shuffle
df = shuffle(df)

In [4]:
df.head()

Unnamed: 0,artist_name,line
52184,Eminem,"Though what you sacrifice barely is half, neve..."
146363,Gunna,"They flexed on me with rolls, shit don't compa..."
107826,Logic,"I got four cards and they all black, got four ..."
157376,Napalm,She was pissed and I ended up seein' my physician
97785,Kendrick Lamar,Taylor made a career out of music from writing...


In [5]:
df['artist_name'].value_counts().sort_index()

 2Pac & Tyrone Wrice     5
 Drake                  22
 Goldie Loc             79
 King Gordy             32
 Method Man             17
                        ..
iNTeLL                  16
mark curry              18
p. diddy over Hook       8
stic.man, dead prez      9
will.i.am               13
Name: artist_name, Length: 1262, dtype: int64

In [6]:
# df.replace("Jay-Z", "JAY-Z", inplace=True)
# df.replace("KRS-One", "KRS-ONE", inplace=True)
# df.replace("LL Cool J", "L.L. Cool J", inplace=True)
# df.replace("Royce da 5'9", "Royce Da 5'9", inplace=True)
df.replace("Notorious B.I.G.", "The Notorious B.I.G.", inplace=True)
df = df.groupby('artist_name').filter(lambda x : len(x)>1000)
df = df[df['line'].apply(lambda x: len(x.split(" ")) > 3)]

In [7]:
list = ['The Notorious B.I.G.', 'Ice Cube', 'Nas', '2Pac', 'Method Man', 'Eminem',
        'Snoop Dogg', 'DMX', 'Dr. Dre', 'GZA', 'The Notorious B.I.G.', 'RZA']
df = df[df['artist_name'].isin(list)]

In [8]:
df['artist_name'].value_counts()

Eminem                  8377
Ice Cube                4381
Nas                     3821
2Pac                    3775
Method Man              3331
The Notorious B.I.G.    2695
Snoop Dogg              2626
DMX                     2562
Dr. Dre                 2235
GZA                     2033
RZA                     1762
Name: artist_name, dtype: int64

In [9]:
lyrics_features = df['line']
lyrics_labels = df.pop('artist_name')

In [10]:
lyrics_labels

52184         Eminem
70146     Method Man
76335       Ice Cube
34093            DMX
81164         Eminem
             ...    
24910         Eminem
193116    Method Man
47997         Eminem
137243    Method Man
49209         Eminem
Name: artist_name, Length: 37598, dtype: object

In [11]:
set(lyrics_labels)

{'2Pac',
 'DMX',
 'Dr. Dre',
 'Eminem',
 'GZA',
 'Ice Cube',
 'Method Man',
 'Nas',
 'RZA',
 'Snoop Dogg',
 'The Notorious B.I.G.'}

In [12]:
len(set(lyrics_labels))

11

In [13]:
lyrics_labels = lyrics_labels.astype("category")

In [14]:
artist_ids = dict(enumerate(lyrics_labels.cat.categories))
artist_ids

{0: '2Pac',
 1: 'DMX',
 2: 'Dr. Dre',
 3: 'Eminem',
 4: 'GZA',
 5: 'Ice Cube',
 6: 'Method Man',
 7: 'Nas',
 8: 'RZA',
 9: 'Snoop Dogg',
 10: 'The Notorious B.I.G.'}

In [15]:
lyrics_labels = lyrics_labels.cat.codes

In [16]:
lyrics_labels

52184     3
70146     6
76335     5
34093     1
81164     3
         ..
24910     3
193116    6
47997     3
137243    6
49209     3
Length: 37598, dtype: int8

In [17]:
len(set(lyrics_labels))

11

In [18]:
lyrics_ds = tf.data.Dataset.from_tensor_slices((lyrics_features, lyrics_labels))

In [19]:
BUFFER_SIZE = 38000
BATCH_SIZE = 64
VALIDATION_SIZE = 3000

In [20]:
all_labeled_data = lyrics_ds.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

for text, label in all_labeled_data.take(10):
  print("Line: ", text.numpy())
  print("Artist:", label.numpy())

Line:  b'You are your car, what could represent me?'
Artist: 7
Line:  b"Yours is shabby and scab, while mine's glistenin'"
Artist: 7
Line:  b"Some niggas'll get money and pay niggas to back em'"
Artist: 2
Line:  b'Over girls like you with the BIG ol butts'
Artist: 5
Line:  b'Have the biggest dick, but when your shell get hit'
Artist: 10
Line:  b"Slow music, H-Town, no that's down low"
Artist: 7
Line:  b'All I did was give you a style for you to run with'
Artist: 7
Line:  b"They want heat, I'll give it to them burnt and crispy"
Artist: 4
Line:  b"Livin' underage, but he'll blaze on your bitch-ass"
Artist: 0
Line:  b'And up in yo bitch, is where ya might find me'
Artist: 9


In [21]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

### PROCESSING DATA FOR CNN CLASSIFICATION

In [21]:
tokenizer = tf_text.UnicodeScriptTokenizer()

In [22]:
def tokenize(text, unused_label):
  lower_case = tf_text.case_fold_utf8(text)
  return tokenizer.tokenize(lower_case)

In [23]:
tokenized_ds = all_labeled_data.map(tokenize)

Instructions for updating:
`tf.batch_gather` is deprecated, please use `tf.gather` with `batch_dims=-1` instead.


In [24]:
for text_batch in tokenized_ds.take(5):
  print("Tokens: ", text_batch.numpy())

Tokens:  [b'plenty' b'a' b'times' b'a' b'nigga' b'slipped' b'and' b'fell']
Tokens:  [b'talk' b'to' b'you' b'for' b'a' b'minute' b'then' b'my' b'dick' b"'"
 b's' b'in' b'you']
Tokens:  [b'and' b'that' b"'" b's' b'how' b'you' b'get' b'it' b',20' b'years' b'in'
 b'a' b'row']
Tokens:  [b'anybody' b'in' b'your' b'crew' b'when' b'i']
Tokens:  [b'he' b'inhaled' b'so' b'deep' b',' b'shut' b'his' b'eyes' b'like' b'he'
 b'was' b'sleep']


In [26]:
VOCAB_SIZE = 10000

In [27]:
tokenized_ds = configure_dataset(tokenized_ds)

vocab_dict = collections.defaultdict(lambda: 0)
for toks in tokenized_ds.as_numpy_iterator():
  for tok in toks:
    vocab_dict[tok] += 1

vocab = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
vocab = [token for token, count in vocab]
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])

Vocab size:  10000
First five vocab entries: [b"'", b',', b'the', b'i', b'a']


In [28]:
keys = vocab
values = range(2, len(vocab) + 2)  # reserve 0 for padding, 1 for OOV

init = tf.lookup.KeyValueTensorInitializer(
    keys, values, key_dtype=tf.string, value_dtype=tf.int64)

num_oov_buckets = 1
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)

In [29]:
def preprocess_text(text, label):
  standardized = tf_text.case_fold_utf8(text)
  tokenized = tokenizer.tokenize(standardized)
  vectorized = vocab_table.lookup(tokenized)
  return vectorized, label

In [30]:
example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text.numpy())
vectorized_text, example_label = preprocess_text(example_text, example_label)
print("Vectorized sentence: ", vectorized_text.numpy())

Sentence:  b'Plenty a times a nigga slipped and fell'
Vectorized sentence:  [1570    6  417    6   50 2093    9  773]


In [31]:
all_encoded_data = all_labeled_data.map(preprocess_text)

In [32]:
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)

In [33]:
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)

In [34]:
sample_text, sample_labels = next(iter(validation_data))
print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_labels.shape)
print("First text example: ", sample_text[0])
print("First label example: ", sample_labels[0])

Text batch shape:  (64, 19)
Label batch shape:  (64,)
First text example:  tf.Tensor(
[1570    6  417    6   50 2093    9  773    0    0    0    0    0    0
    0    0    0    0    0], shape=(19,), dtype=int64)
First label example:  tf.Tensor(1, shape=(), dtype=int8)


In [35]:
vocab_size += 2

In [36]:
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

## TRAIN MODELS

### CLASSIFIER 1

In [37]:
def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 64, mask_zero=True),
        layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
        layers.GlobalMaxPooling1D(),
        layers.Dense(num_labels)
    ])
    return model

In [38]:
model = create_model(vocab_size=vocab_size, num_labels=11)
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
history = model.fit(train_data, validation_data=validation_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  5.4669389724731445
Accuracy: 36.33%


In [40]:
model.save('lyrics_classificer_1')

INFO:tensorflow:Assets written to: lyrics_classificer_1\assets


### CLASSIFIER 1.1

In [41]:
def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 16),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(num_labels)])
    return model

In [42]:
model = create_model(vocab_size=vocab_size, num_labels=11)
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
history = model.fit(train_data, validation_data=validation_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [43]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  1.7608269453048706
Accuracy: 42.30%


In [44]:
model.save('lyrics_classificer_1_1')

INFO:tensorflow:Assets written to: lyrics_classificer_1_1\assets


### CLASSIFIER 1.2

In [45]:
def create_model(vocab_size, num_labels):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 64, mask_zero=True),
        layers.Dropout(0.2),
        layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
        layers.Dropout(0.2),
        layers.GlobalMaxPooling1D(),
        layers.Dropout(0.2),
        layers.Dense(num_labels)
    ])
    return model

In [46]:
model = create_model(vocab_size=vocab_size, num_labels=11)
model.compile(
    optimizer='adam',
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'])
history = model.fit(train_data, validation_data=validation_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [47]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  3.2865211963653564
Accuracy: 37.83%


In [48]:
model.save('lyrics_classificer_1_2')

INFO:tensorflow:Assets written to: lyrics_classificer_1_2\assets
