## LOAD DATA

In [1]:
import collections
import pathlib
import re
import string

import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

import tensorflow_datasets as tfds
import tensorflow_text as tf_text
import pandas as pd
import numpy as np

In [11]:
df = pd.read_csv("dataset/tf/main_lines_verses.csv", encoding= 'unicode_escape')

In [12]:
from sklearn.utils import shuffle
df = shuffle(df)

In [13]:
df.head()

Unnamed: 0,artist_name,line
89780,Playboi Carti,"Just stay focused on your mission, and don't e..."
59970,Young Buck,Follow the sparkles on the bottles and I appear
38489,Dr. Dre,Why a motherfuckin' brother is hard to find
13491,Big Pun,"I'm like the beast with a warrant, far from a ..."
41055,Drake,"Man, someone just gave you the run-around"


In [14]:
df['artist_name'].value_counts()

Eminem              8530
Ice Cube            4490
Kendrick Lamar      4320
Juice WRLD          3941
Nas                 3872
                    ... 
Dr. Dre & MC Ren       2
Tego Calderon          2
Mae Day                1
T3                     1
Skyzoo                 1
Name: artist_name, Length: 1262, dtype: int64

In [15]:
# df.replace("Jay-Z", "JAY-Z", inplace=True)
# df.replace("KRS-One", "KRS-ONE", inplace=True)
# df.replace("LL Cool J", "L.L. Cool J", inplace=True)
# df.replace("Royce da 5'9", "Royce Da 5'9", inplace=True)
df.replace("Notorious B.I.G.", "The Notorious B.I.G.", inplace=True)
df = df.groupby('artist_name').filter(lambda x : len(x)>1000)
df = df[df['line'].apply(lambda x: len(x.split(" ")) > 3)]

In [16]:
list = ['The Notorious B.I.G.', 'Ice Cube', 'Nas', '2Pac', 'Method Man', 'Eminem',
        'Ghostface Killah', 'Snoop Dogg', 'DMX', 'Dr. Dre', 'GZA', 'RZA']
df = df[df['artist_name'].isin(list)]

In [17]:
df['artist_name'].value_counts()

Eminem                  8377
Ice Cube                4381
Nas                     3821
2Pac                    3775
Method Man              3331
Ghostface Killah        3063
The Notorious B.I.G.    2695
Snoop Dogg              2626
DMX                     2562
Dr. Dre                 2235
GZA                     2033
RZA                     1762
Name: artist_name, dtype: int64

In [19]:
west_coast_list = ['Ice Cube', '2Pac', 'Eminem', 'Snoop Dogg', 'Dr. Dre']

df["coast"] = ["W" if el in west_coast_list else "E" for el in df["artist_name"]]

In [23]:
df.drop(['artist_name'], axis=1, inplace=True)

In [24]:
df.head()

Unnamed: 0,line,coast
38489,Why a motherfuckin' brother is hard to find,W
3834,Come take a journey through my mind's eye,W
72277,"I go somewhere, don't remember how I came",E
51237,"Of rap, to take it to the next level, boost it",W
191965,"The eightball murder verse, freestyle or rehea...",E


In [25]:
lyrics_features = df['line']
lyrics_labels = df.pop('coast')

In [26]:
lyrics_labels

38489     W
3834      W
72277     E
51237     W
191965    E
         ..
186340    E
79793     W
49070     W
3438      W
67590     E
Name: coast, Length: 40661, dtype: object

In [27]:
set(lyrics_labels)

{'E', 'W'}

In [28]:
len(set(lyrics_labels))

2

In [29]:
lyrics_labels = lyrics_labels.astype("category")

In [30]:
artist_ids = dict(enumerate(lyrics_labels.cat.categories))
artist_ids

{0: 'E', 1: 'W'}

In [31]:
lyrics_labels = lyrics_labels.cat.codes

In [32]:
lyrics_labels

38489     1
3834      1
72277     0
51237     1
191965    0
         ..
186340    0
79793     1
49070     1
3438      1
67590     0
Length: 40661, dtype: int8

In [33]:
len(set(lyrics_labels))

2

In [34]:
lyrics_ds = tf.data.Dataset.from_tensor_slices((lyrics_features, lyrics_labels))

In [35]:
BUFFER_SIZE = 38000
BATCH_SIZE = 64
VALIDATION_SIZE = 3000

In [37]:
all_labeled_data = lyrics_ds.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

for text, label in all_labeled_data.take(10):
  print("Line: ", text.numpy())
  print("Coast:", label.numpy())

Line:  b"With wild imaginings that you can't discuss"
Coast: 0
Line:  b'This beat is cray-cray, Ray J, H-A-H-A-H-A'
Coast: 1
Line:  b'The police are gonna have to come and get me'
Coast: 1
Line:  b"And now I see you on a video with Michel'le"
Coast: 1
Line:  b"Eazy-E's, Ice Cube's, and D.O.C.'s"
Coast: 1
Line:  b'Gold around his neck in 14 K heaven'
Coast: 1
Line:  b"How it's Christmas time and my rhyme's steady bumpin'"
Coast: 1
Line:  b"I ain't know whether to cry or just, try to laugh it off"
Coast: 0
Line:  b"You sold me your soul when you didn't say no"
Coast: 0
Line:  b'If I sell a brick I can buy a house'
Coast: 1


In [38]:
AUTOTUNE = tf.data.AUTOTUNE

def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)

### PROCESSING DATA FOR CNN CLASSIFICATION

In [39]:
tokenizer = tf_text.UnicodeScriptTokenizer()

In [40]:
def tokenize(text, unused_label):
  lower_case = tf_text.case_fold_utf8(text)
  return tokenizer.tokenize(lower_case)

In [41]:
tokenized_ds = all_labeled_data.map(tokenize)

Instructions for updating:
`tf.batch_gather` is deprecated, please use `tf.gather` with `batch_dims=-1` instead.


In [42]:
for text_batch in tokenized_ds.take(5):
  print("Tokens: ", text_batch.numpy())

Tokens:  [b'with' b'wild' b'imaginings' b'that' b'you' b'can' b"'" b't' b'discuss']
Tokens:  [b'this' b'beat' b'is' b'cray' b'-' b'cray' b',' b'ray' b'j' b',' b'h'
 b'-' b'a' b'-' b'h' b'-' b'a' b'-' b'h' b'-' b'a']
Tokens:  [b'the' b'police' b'are' b'gonna' b'have' b'to' b'come' b'and' b'get'
 b'me']
Tokens:  [b'and' b'now' b'i' b'see' b'you' b'on' b'a' b'video' b'with' b'michel'
 b"'" b'le']
Tokens:  [b'eazy' b'-' b'e' b"'" b's' b',' b'ice' b'cube' b"'" b's' b',' b'and'
 b'd' b'.' b'o' b'.' b'c' b".'" b's']


In [43]:
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 250

In [44]:
tokenized_ds = configure_dataset(tokenized_ds)

vocab_dict = collections.defaultdict(lambda: 0)
for toks in tokenized_ds.as_numpy_iterator():
  for tok in toks:
    vocab_dict[tok] += 1

vocab = sorted(vocab_dict.items(), key=lambda x: x[1], reverse=True)
vocab = [token for token, count in vocab]
vocab = vocab[:VOCAB_SIZE]
vocab_size = len(vocab)
print("Vocab size: ", vocab_size)
print("First five vocab entries:", vocab[:5])

Vocab size:  10000
First five vocab entries: [b',', b"'", b'the', b'i', b'a']


In [45]:
keys = vocab
values = range(2, len(vocab) + 2)  # reserve 0 for padding, 1 for OOV

init = tf.lookup.KeyValueTensorInitializer(
    keys, values, key_dtype=tf.string, value_dtype=tf.int64)

num_oov_buckets = 1
vocab_table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets)

In [46]:
def preprocess_text(text, label):
  standardized = tf_text.case_fold_utf8(text)
  tokenized = tokenizer.tokenize(standardized)
  vectorized = vocab_table.lookup(tokenized)
  return vectorized, label

In [47]:
example_text, example_label = next(iter(all_labeled_data))
print("Sentence: ", example_text.numpy())
vectorized_text, example_label = preprocess_text(example_text, example_label)
print("Vectorized sentence: ", vectorized_text.numpy())

Sentence:  b"With wild imaginings that you can't discuss"
Vectorized sentence:  [   20   539 10000    16     7    40     3    14  3667]


In [48]:
all_encoded_data = all_labeled_data.map(preprocess_text)

In [49]:
train_data = all_encoded_data.skip(VALIDATION_SIZE).shuffle(BUFFER_SIZE)
validation_data = all_encoded_data.take(VALIDATION_SIZE)

In [50]:
train_data = train_data.padded_batch(BATCH_SIZE)
validation_data = validation_data.padded_batch(BATCH_SIZE)

In [51]:
sample_text, sample_labels = next(iter(validation_data))
print("Text batch shape: ", sample_text.shape)
print("Label batch shape: ", sample_labels.shape)
print("First text example: ", sample_text[0])
print("First label example: ", sample_labels[0])

Text batch shape:  (64, 21)
Label batch shape:  (64,)
First text example:  tf.Tensor(
[   20   539 10000    16     7    40     3    14  3667     0     0     0
     0     0     0     0     0     0     0     0     0], shape=(21,), dtype=int64)
First label example:  tf.Tensor(0, shape=(), dtype=int8)


In [52]:
vocab_size += 2

In [53]:
train_data = configure_dataset(train_data)
validation_data = configure_dataset(validation_data)

## TRAIN MODELS

### CLASSIFIER 1

In [54]:
def create_model(vocab_size):
#     model = tf.keras.Sequential([
#         layers.Embedding(vocab_size, 64, mask_zero=True),
#         layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
#         layers.GlobalMaxPooling1D(),
#         layers.Dense(num_labels)
#     ])
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 64, mask_zero=True),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(1)])
    return model

In [56]:
model = create_model(vocab_size=vocab_size)
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'])
history = model.fit(train_data, validation_data=validation_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [57]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  0.8145785331726074
Accuracy: 68.27%


In [58]:
model.save('coast_classificer_1')

INFO:tensorflow:Assets written to: coast_classificer_1\assets


### CLASSIFIER 1.1

In [59]:
def create_model(vocab_size):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 16),
        layers.Dropout(0.2),
        layers.GlobalAveragePooling1D(),
        layers.Dropout(0.2),
        layers.Dense(1)])
    return model

In [60]:
model = create_model(vocab_size=vocab_size)
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'])
history = model.fit(train_data, validation_data=validation_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [61]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  0.6511495113372803
Accuracy: 68.20%


In [62]:
model.save('coast_classificer_1_1')

INFO:tensorflow:Assets written to: coast_classificer_1_1\assets


### CLASSIFIER 1.2

In [63]:
def create_model(vocab_size):
    model = tf.keras.Sequential([
        layers.Embedding(vocab_size, 64, mask_zero=True),
        layers.Dropout(0.2),
        layers.Conv1D(64, 5, padding="valid", activation="relu", strides=2),
        layers.Dropout(0.2),
        layers.GlobalMaxPooling1D(),
        layers.Dropout(0.2),
        layers.Dense(1)
    ])
    return model

In [64]:
model = create_model(vocab_size=vocab_size)
model.compile(
    optimizer='adam',
    loss=losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'])
history = model.fit(train_data, validation_data=validation_data, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [65]:
loss, accuracy = model.evaluate(validation_data)

print("Loss: ", loss)
print("Accuracy: {:2.2%}".format(accuracy))

Loss:  1.8632304668426514
Accuracy: 66.43%


In [66]:
model.save('coast_classificer_1_2')

INFO:tensorflow:Assets written to: coast_classificer_1_2\assets
