In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd

from string import punctuation
from nltk.corpus import stopwords
from tensorflow.python import keras
from tensorflow.python.keras import layers
from tensorflow.python.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE

In [2]:
def clean_lyric_tokens(tokens):
    """Strip punctuation from lyric tokens and discard the uninformative ones.

    Each token goes through the following steps, in order:
      1. every ``string.punctuation`` character is removed from it;
      2. it is dropped unless what remains is purely alphabetic (this also
         discards tokens left empty by step 1);
      3. it is dropped if it is an NLTK English stop word (exact,
         case-sensitive set membership);
      4. it is dropped if it is only one or two characters long.

    Args:
        tokens: iterable of str tokens.

    Returns:
        list of the cleaned tokens that survive, in their input order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    english_stops = set(stopwords.words('english'))

    kept = []
    for raw in tokens:
        word = raw.translate(strip_punct)
        # not alphabetic once punctuation is gone
        if not word.isalpha():
            continue
        # stop word
        if word in english_stops:
            continue
        # too short to be useful
        if len(word) <= 2:
            continue
        kept.append(word)

    return kept

def tokens_to_line(tokens, vocab):
    """Render raw lyric tokens as one space-separated string of known words.

    Args:
        tokens: raw tokens; they are first cleaned with ``clean_lyric_tokens``.
        vocab: container of allowed words, tested with ``in``.

    Returns:
        str: the cleaned tokens that are in ``vocab``, in order, joined by a
        single space (an empty string when none survive).
    """
    in_vocab = (word for word in clean_lyric_tokens(tokens) if word in vocab)
    return ' '.join(in_vocab)

def baseline_model(audio_dim=6, lyrics_dim=4563, n_classes=5):
    """Build and compile the two-branch (audio + lyrics) classifier.

    The network has two input branches whose 64-unit outputs are summed
    element-wise with ``layers.Add`` and passed to a small classification head:

      * audio branch:  Dense(128) -> BatchNorm -> Dropout(0.5)
                       -> Dense(64) -> BatchNorm -> Dropout(0.5)
      * lyrics branch: Dense(64) -> Dropout(0.5) -> Dense(64) -> Dropout(0.5)
      * head:          Add -> Dense(64, relu) -> Dense(n_classes, softmax)

    Args:
        audio_dim: number of columns in the audio feature matrix.
        lyrics_dim: number of columns in the lyrics feature matrix. This must
            equal the width of the matrix fed to the second input (e.g.
            ``lyrics_features.shape[1]``), otherwise fit/predict fail on a
            shape mismatch.
        n_classes: number of target classes (width of the softmax output).

        The defaults are the values that used to be hard-coded, so calling
        ``baseline_model()`` builds exactly the same network as before.

    Returns:
        A compiled ``keras.Model`` taking ``[audio_input, lyrics_input]``.
        It is compiled with the Adam optimizer, sparse categorical
        cross-entropy loss (integer class labels) and an accuracy metric.
    """
    audio_model = keras.Sequential([
        layers.Dense(128, input_dim=audio_dim, activation=tf.nn.relu),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(64, activation=tf.nn.relu),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
    ])

    lyrics_model = keras.Sequential([
        layers.Dense(64, input_shape=(lyrics_dim,), activation=tf.nn.relu),
        layers.Dropout(0.5),
        layers.Dense(64, activation=tf.nn.relu),
        layers.Dropout(0.5),
    ])

    # Add() sums the branches element-wise, so both must end with the same
    # width (64 units each).
    merged = layers.Add()([audio_model.output, lyrics_model.output])
    merged = layers.Dense(64, activation=tf.nn.relu)(merged)
    merged = layers.Dense(n_classes, activation=tf.nn.softmax)(merged)

    model = keras.Model([audio_model.input, lyrics_model.input], merged)

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

In [3]:
# --- Load the audio-feature table --------------------------------------------
dataset_path = 'dataset.csv'
column_names = [
    'ID',
    'Danceability', 'Acousticness', 'Energy', 'Loudness', 'Tempo', 'Valence',
    'Category',
]

# Column names are supplied explicitly, "?" is read as a missing value and
# anything after a tab character on a line is treated as a comment.
raw_dataset = pd.read_csv(
    dataset_path,
    names=column_names,
    na_values="?",
    comment='\t',
    skipinitialspace=True,
)
dataset = raw_dataset.copy()  # keep the raw frame untouched

# --- Encode the target ---------------------------------------------------------
# Category becomes a pandas categorical; its integer codes are stored as Label.
dataset['Category'] = pd.Categorical(dataset['Category'])
dataset['Label'] = dataset['Category'].cat.codes

# The six audio features (everything between ID and Category) plus the label.
model_variables = column_names[1:-1] + ['Label']

dataset_relevant = dataset[model_variables]
# NOTE(review): get_dummies only expands object/category columns; the selected
# columns look numeric, so this is presumably a no-op -- confirm.
dataset_relevant_encoded = pd.get_dummies(dataset_relevant)

# --- Split features / target and standardise the features ----------------------
training_features = dataset_relevant_encoded.drop('Label', axis='columns')
training_target = dataset_relevant_encoded['Label']

std = StandardScaler()
audio_dataset_bal = std.fit_transform(training_features)
audio_dataset_bal

array([[-0.18437484, -0.25081067, -1.41576069, -1.37910795,  0.02406681,
        -0.85059275],
       [ 0.9760505 , -0.03265142, -0.25196577,  0.03337197,  0.03269329,
        -0.11590503],
       [ 1.63194309, -0.54471966, -0.10363897, -0.59157166,  0.42964577,
         0.42098216],
       ...,
       [-0.39880127, -0.85494817, -0.69314293,  0.16120665, -1.35996345,
        -1.38182849],
       [ 1.17155695, -0.6771302 , -0.99740304, -1.07281792, -0.98294922,
         0.72373808],
       [ 1.51211656, -0.73954799, -0.63989741, -1.52373842,  0.76577652,
         0.36446771]])

In [4]:
# --- Load the vocabulary -------------------------------------------------------
# vocab.txt is read as whitespace-separated words; a set is used because
# tokens_to_line tests membership once per token.
with open("vocab.txt") as f:
    vocab = set(f.read().split())

# --- Load the lyrics records ---------------------------------------------------
# Each line is "<id> <record>", split on the first run of whitespace.
# Blank lines are skipped: they would otherwise make dict() raise ValueError.
with open("lyrics_dataset.txt") as f:
    lyrics_dataset = dict(
        entry.rstrip().split(None, 1) for entry in f if entry.strip()
    )

# SECURITY NOTE(review): eval() executes whatever expression the file contains,
# so lyrics_dataset.txt must come from a trusted source. eval is kept so that
# existing data files parse exactly as before; if every record is a plain
# Python literal, ast.literal_eval is the safe replacement -- confirm the file
# format and switch.
# (Comprehensions are used here so that no loop variable -- previously `id`,
# which shadowed the builtin -- leaks into the notebook namespace.)
lyrics_dataset = {
    song_id: eval(record) for song_id, record in lyrics_dataset.items()
}

# record[0] holds the song's tokens, record[1] its category label.
lines = [tokens_to_line(record[0], vocab) for record in lyrics_dataset.values()]
labels = np.array([record[1] for record in lyrics_dataset.values()])

# Integer-encode the labels: target_label[i] indexes into categories.
categories, target_label = np.unique(labels, return_inverse=True)

# --- TF-IDF bag-of-words features ----------------------------------------------
# The number of columns produced here must equal the input width of the lyrics
# branch built by baseline_model.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)

lyrics_features = tokenizer.texts_to_matrix(lines, mode='tfidf')

lyrics_features

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.53212838, 1.39257974, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 2.53212838, 2.35784246, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 3.14847665, 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [5]:
# Five stratified random splits, each holding out 20% of the samples for
# testing; the fixed random_state makes the splits repeatable.
kfold = StratifiedShuffleSplit(test_size=0.2, n_splits=5, random_state=7)

# Build the two-branch network and print its layer table.
estimator = baseline_model()
estimator.summary()

Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
dense_1_input (InputLayer)      (None, 6)            0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 128)          896         dense_1_input[0][0]              
__________________________________________________________________________________________________
batch_normalization_1 (BatchNor (None, 128)          512         dense_1[0][0]                    
__________________________________________________________________________________________________
dense_3_input (InputLayer)      (None, 4563)         0                                            
____________________________________

In [6]:
# --- Evaluate on every stratified shuffle split --------------------------------
# The audio and lyrics matrices are paired purely by row position, so they (and
# the label vector) must have the same number of rows.
# NOTE(review): equal length does not prove equal ordering -- confirm that
# dataset.csv and lyrics_dataset.txt list the songs in the same order.
if not (len(audio_dataset_bal) == len(lyrics_features) == len(target_label)):
    raise ValueError(
        "Row count mismatch: audio=%d, lyrics=%d, labels=%d"
        % (len(audio_dataset_bal), len(lyrics_features), len(target_label))
    )

fold_accuracies = []
for train_index, test_index in kfold.split(audio_dataset_bal, target_label):
    X_tr1, X_tes1 = audio_dataset_bal[train_index], audio_dataset_bal[test_index]
    X_tr2, X_tes2 = lyrics_features[train_index], lyrics_features[test_index]
    y_tr, y_tes = target_label[train_index], target_label[test_index]

    # Build a fresh, untrained model for every split. Calling fit() again on
    # the same model continues from the weights learned on the previous split,
    # and since shuffle-split test sets are drawn independently, later splits
    # would be scored on samples the network had already been trained on.
    estimator = baseline_model()
    estimator.fit([X_tr1, X_tr2], y_tr, epochs=100, batch_size=128, verbose=0)

    # predict() returns one row of class probabilities per sample; argmax
    # turns each row into the predicted class index.
    y_pred = estimator.predict([X_tes1, X_tes2])
    y_pred_labels = np.argmax(y_pred, axis=1)

    acc = accuracy_score(y_tes, y_pred_labels)
    cnf_matrix = confusion_matrix(y_tes, y_pred_labels)
    fold_accuracies.append(acc)
    print("Accuracy:  %f" % acc)
    print(cnf_matrix)

print("Mean accuracy over %d splits: %f"
      % (len(fold_accuracies), np.mean(fold_accuracies)))

Accuracy:  0.506250
[[  1  11  30   1  63]
 [  1  79  31   1  62]
 [  8  31 231   7 183]
 [  2   4  18   2  51]
 [ 15  32 146  14 416]]
