In [1]:
%pip install spotipy --upgrade
%pip install lyricsgenius
%pip install textblob

Collecting spotipy
  Using cached spotipy-2.18.0-py3-none-any.whl (26 kB)
Collecting urllib3>=1.26.0
  Using cached urllib3-1.26.4-py2.py3-none-any.whl (153 kB)
Collecting requests>=2.25.0
  Using cached requests-2.25.1-py2.py3-none-any.whl (61 kB)
Installing collected packages: urllib3, requests, spotipy
  Attempting uninstall: urllib3
    Found existing installation: urllib3 1.25.11
    Uninstalling urllib3-1.25.11:
      Successfully uninstalled urllib3-1.25.11
  Attempting uninstall: requests
    Found existing installation: requests 2.24.0
    Uninstalling requests-2.24.0:
      Successfully uninstalled requests-2.24.0
Successfully installed requests-2.25.1 spotipy-2.18.0 urllib3-1.26.4
You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
Collecting lyricsgenius
  Using cached lyricsgenius-3.0.1-py3-none-any.whl (59 kB)
Collecting beautifulsoup4>=4.6.0
  Using cache

In [3]:
import os
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.cache_handler import CacheFileHandler

# Build the Spotify client; credentials are read from the environment and the
# token cache is handled by a CacheFileHandler for the given username.
credentials_manager = SpotifyClientCredentials(
    client_id=os.environ['SPOTIFY_CLIENT_ID'],
    client_secret=os.environ['SPOTIFY_CLIENT_SECRET'],
    cache_handler=CacheFileHandler(username='keatonconrad'),
)
spotify = spotipy.Spotify(client_credentials_manager=credentials_manager)

# Importing Data

## Million Song Dataset

To start, we import a subset of the [Million Song Dataset](http://millionsongdataset.com/) and read it into a DataFrame with pandas for easy processing. The dataset contains random songs of various genres from 1922 to 2011. We'll use this to collect non-hit songs.

The DataFrame is shuffled to mitigate any bias in the order.

In [None]:
# Read the '<SEP>'-delimited song list and shuffle all rows in one chained step.
song_data = (
    pd.read_csv('./song-list.txt', sep='<SEP>', engine='python')
    .sample(frac=1)
)
song_data.describe()

## Spotify and Billboard Data

We import a dataset that contains historical Billboard Hot 100 data mapped to each song's Spotify ID. This is incredibly useful as it helps us collect the audio features later. This dataset will be used to collect features from hit songs.

Like the previous dataset, we shuffle it to avoid any ordering bias.

In [None]:
# Load the Billboard/Spotify mapping, shuffle it, and keep the hit track IDs
# as a plain list for the membership checks done when collecting non-hits.
spotify_billboard_data = (
    pd.read_csv('./spotify-billboard-data.csv')
    .sample(frac=1)
)
print(spotify_billboard_data.columns)
hit_track_ids = spotify_billboard_data['spotify_track_id'].tolist()
spotify_billboard_data.describe()

# Data Enrichment

In [None]:
def get_track_info(track, hit):
    """Flatten a Spotify track object into the record stored for each song.

    Args:
        track: Track dict as returned by the Spotify API; the keys read are
            'id', 'name', 'explicit', 'popularity', 'artists', 'album' and
            (optionally) 'external_ids'.
        hit: Label stored under 'hit' (the notebook passes 1 for hits, 0 otherwise).

    Returns:
        A dict with the track/artist identifiers, names, release year,
        explicit flag (0/1), the hit label and the track's current popularity.
        'isrc' is None when the track carries no ISRC.
    """
    is_explicit = int(bool(track['explicit']))

    # Not every track exposes an ISRC; fall back to None when it is missing.
    try:
        isrc_code = track['external_ids']['isrc']
    except KeyError:
        isrc_code = None

    lead_artist = track['artists'][0]  # only the first listed artist is kept
    album = track['album']

    record = {}
    record['spotify_track_id'] = track['id']
    record['isrc'] = isrc_code
    record['artist'] = lead_artist['name']
    record['artist_id'] = lead_artist['id']
    record['song'] = track['name']
    record['spotify_track_album'] = album['name']
    record['year'] = album['release_date'][:4]  # leading 'YYYY' of the release date
    record['explicit'] = is_explicit
    record['hit'] = hit
    record['current_popularity'] = track['popularity']  # As of 4/20/21
    return record

## Non-Hit Songs

We can't do much with just the song title and artist, as given in the Million Song Dataset. Here, we do a search on Spotify to find the closest matching song and retrieve the track ID, album, and explicit information, among other data.

It should be noted that this step has some inherent inaccuracy due to its reliance on Spotify's search feature. There is no guarantee the correct song will be returned.

In [None]:
from tqdm import tqdm

no_hit_arr = []
# Take as many candidate songs as there are Billboard rows.
song_data_sample = song_data.head(len(spotify_billboard_data))

# Set membership is O(1); testing against the raw list would rescan every hit ID
# for every searched song.
hit_track_id_set = set(hit_track_ids)

for i, song in tqdm(song_data_sample.iterrows(), total=song_data_sample.shape[0]):
    # Strip featuring/ampersand markers so the artist string is friendlier to search.
    artist = song['Performer'].replace('Featuring ', '').replace('feat. ', '').replace('feat ', '').replace('& ', '')
    search_results = spotify.search(q=song['Song'] + ' ' + artist, limit=1, type='track', market='US')

    items = search_results['tracks']['items']
    if not items:  # If search didn't return anything
        continue
    track = items[0]

    # Only keep the match as a non-hit if it is not one of the known Billboard hits.
    if track['id'] not in hit_track_id_set:
        no_hit_arr.append(get_track_info(track, hit=0))

The data is consolidated into a DataFrame for ease of processing.

In [None]:
# One row per collected non-hit record (the dicts appended to no_hit_arr).
no_hit_basic_song_data_df = pd.DataFrame(no_hit_arr)
no_hit_basic_song_data_df.describe()

## Hit Songs

While the Spotify and Billboard dataset provides more data than the Million Song Dataset we are using, we still collect more information from Spotify about the album and release year.

This step isn't necessarily required, as the album name isn't used for much. However, it is quick due to already having the track IDs.

In [None]:
hit_arr = []

# Track IDs are looked up in batches of 50 per spotify.tracks() call.
# Slicing the ID list guarantees the final, shorter batch is also requested;
# the previous `i == shape[0]` check compared an index label against the row
# count, never fired, and so silently dropped the trailing songs.
TRACK_BATCH_SIZE = 50
all_hit_ids = spotify_billboard_data['spotify_track_id'].tolist()

for start in tqdm(range(0, len(all_hit_ids), TRACK_BATCH_SIZE)):
    track_ids = all_hit_ids[start:start + TRACK_BATCH_SIZE]
    results = spotify.tracks(tracks=track_ids)['tracks']
    for track in results:
        if track is None:
            # The API returns a null entry for IDs it cannot resolve.
            continue
        hit_arr.append(get_track_info(track, hit=1))

As before, the data is turned into a DataFrame.

In [None]:
# One row per hit record (the dicts appended to hit_arr).
hit_basic_song_data_df = pd.DataFrame(hit_arr)
hit_basic_song_data_df.describe()

Now that we have basic information (and more importantly, the Spotify track IDs) from our dataset, we concatenate both the non-hits and the hits together into one larger dataset.

In [None]:
# Stack non-hits and hits row-wise, then shuffle so the two classes are interleaved.
# The original row labels are kept here; they are reset later before the column-wise concat.
basic_song_data_df = pd.concat([no_hit_basic_song_data_df, hit_basic_song_data_df])
basic_song_data_df = basic_song_data_df.sample(frac=1)
basic_song_data_df.head()

## Feature Collection

We collect the audio features for each track using the Spotify API and the track IDs from the newly created DataFrame.

Due to how the code makes API calls in batches of 100, it will ignore the last few tracks.

In [None]:
features = []
track_ids = []

# Keys of an audio-features record; used to build an all-None placeholder so
# that a missing result still occupies one row in the resulting DataFrame.
feature_keys = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'type', 'id', 'uri', 'track_href', 'analysis_url', 'duration_ms',
    'time_signature',
]

for track_id in tqdm(basic_song_data_df['spotify_track_id']):
    track_ids.append(track_id)
    if len(track_ids) < 100:
        continue  # keep filling the current batch

    # A full batch of 100 IDs is ready. A trailing partial batch is never
    # requested, so the last few tracks get no features.
    results = spotify.audio_features(tracks=track_ids)
    features.extend(
        # Sometimes the API returns None for a track; substitute the placeholder.
        result if result is not None else dict.fromkeys(feature_keys)
        for result in results
    )
    track_ids.clear()

In [None]:
# One row per audio-features record (or all-None placeholder) gathered above.
features_df = pd.DataFrame(features)
features_df.describe()

Due to the rounding-off of the last few songs as mentioned above, we ensure both the `basic_song_data_df` and the `features_df` are of equal length.

Finally, we concatenate them, forming one DataFrame with basic song information and the audio features of each track.

In [None]:
# Lengths before the (currently disabled) trim.
print(len(basic_song_data_df))
print(len(features_df))

# NOTE(review): the trim below is commented out, so the second pair of prints
# repeats the first and the two frames may still differ in length.
# features_df.drop(features_df.tail(24).index, inplace=True)

print(len(basic_song_data_df))
print(len(features_df))

# Reset both indexes to 0..n-1 so the column-wise concat pairs rows by position.
basic_data = basic_song_data_df.reset_index(drop=True, inplace=False)
feature_data = features_df.reset_index(drop=True, inplace=False)

# pd.concat(axis=1) aligns on the index; rows present in only one frame get NaN
# in the other frame's columns.
full_df = pd.concat([basic_data, feature_data], axis=1)
full_df.head()

In [None]:
from scipy.stats import ttest_ind

# NOTE(review): the block below is a bare string literal, so it never executes.
# It refers to hit_df / no_hit_df, which are not defined in the visible cells —
# confirm they exist before re-enabling.
"""
for column in full_df.columns:
    try:
        t = ttest_ind(hit_df[column].astype(float), no_hit_df[column].astype(float))
        print(column + ' - T: ' + str(t[0]) + ', p: ' + str(t[1]))
    except ValueError:
        continue
"""

For sanity's sake, we save the DataFrame for easy loading later.

In [None]:
import pickle

# From here on the working frame is called `df`; it is the same object as full_df.
df = full_df
# Checkpoint the combined data so the API-heavy cells above need not be re-run.
with open('all_song_data_new.pickle', 'wb') as f:
    pickle.dump(pd.DataFrame(df), f)

In [None]:
# Restore the checkpointed frame written by the previous cell.
# pickle.load executes arbitrary code; only load files this notebook produced.
with open('all_song_data_new.pickle', 'rb') as f:
    df = pickle.load(f)

## Lyric Collection

Using the Genius API, we collect the lyrics of every song in our dataset. The polarity and subjectivity of the lyrics are also saved to an array.

Similar to above, this step has inherent inaccuracies due to its reliance on Genius's search feature.

The polarity, subjectivity, and lyrics are pickled and saved for easy loading later.

In [None]:
import lyricsgenius
# Genius API client; the token is read from the GENIUS_TOKEN environment variable.
# verbose=False and remove_section_headers=True are passed straight to the client.
genius = lyricsgenius.Genius(os.environ['GENIUS_TOKEN'], verbose=False, remove_section_headers=True)

In [None]:
import pickle
from tqdm import tqdm

lyrics = []

# Exactly one entry is appended per row of df (None when no lyrics could be
# fetched) so that `lyrics` stays positionally aligned with the frame.
for i, song in tqdm(df.iterrows(), total=df.shape[0]):
    song_lyrics = None
    try:
        genius_song = genius.search_song(song['song'], song['artist'], get_full_info=False)
        # A missing search result is handled explicitly instead of relying on
        # an AttributeError being swallowed.
        if genius_song is not None:
            song_lyrics = genius_song.lyrics.replace('\n', ' ')
    except Exception:
        # Best effort: any lookup/network failure leaves this song without lyrics.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
        song_lyrics = None
    lyrics.append(song_lyrics)

with open('lyrics.pickle', 'wb') as f:
    pickle.dump(lyrics, f)

In [None]:
#%pip install demoji
import pickle
import utils
from tqdm import tqdm

# Reload the lyrics list checkpointed by the collection cell.
with open('lyrics.pickle', 'rb') as f:
    lyrics = pickle.load(f)

# Stop-word stripping is currently disabled; `utils` is only needed if it is re-enabled.
# lyrics = [utils.strip_stop_words(lyric) for lyric in tqdm(lyrics)]

In [None]:
from textblob import TextBlob

polarity = []
subjectivity = []

# Each lyric yields exactly one polarity and one subjectivity entry (None when
# unavailable), so both lists stay aligned with `lyrics`.
for lyric in tqdm(lyrics):
    lyric_polarity = None
    lyric_subjectivity = None
    if lyric is not None:
        try:
            # Compute the sentiment once; it is a (polarity, subjectivity) pair.
            sentiment = TextBlob(lyric).sentiment
            lyric_polarity, lyric_subjectivity = sentiment[0], sentiment[1]
        except Exception:
            # Best effort: leave both values as None if analysis fails.
            # `Exception` (not a bare except) keeps kernel interrupts working.
            lyric_polarity = None
            lyric_subjectivity = None
    polarity.append(lyric_polarity)
    subjectivity.append(lyric_subjectivity)

with open('polarity.pickle', 'wb') as f:
    pickle.dump(polarity, f)
with open('subjectivity.pickle', 'wb') as f:
    pickle.dump(subjectivity, f)

In [None]:
import pickle

# Reload the sentiment lists checkpointed by the previous cell.
with open('polarity.pickle', 'rb') as f:
    polarity = pickle.load(f)
with open('subjectivity.pickle', 'rb') as f:
    subjectivity = pickle.load(f)

## Artist Popularity

This gets the popularity of every artist

In [None]:
popularities = []
artist_ids = []

# Look up artist popularity in batches of 50 IDs per spotify.artists() call.
for i, artist_id in tqdm(enumerate(df['artist_id']), total=len(df['artist_id'])):
    artist_ids.append(artist_id)
    # NOTE(review): enumerate() yields i in 0..len(df)-1, so the
    # `i == len(df[['artist_id']])` branch can never be true. The final partial
    # batch is therefore never requested and `popularities` ends up shorter
    # than df. The next cell compensates by dropping rows from the tail of df;
    # fix both together if the trailing rows should be kept.
    if len(artist_ids) == 50 or i == len(df[['artist_id']]):
        results = spotify.artists(artist_ids)
        for result in results['artists']:
            popularities.append(result['popularity'])
        artist_ids.clear()

In [None]:
df_backup = df.copy()

# Drop the trailing rows for which no popularity was fetched. The count is
# derived from the data instead of a hard-coded 26, so the cell stays correct
# when the dataset size changes and is a no-op if it is run a second time.
rows_without_popularity = len(df) - len(popularities)
if rows_without_popularity > 0:
    df.drop(df.tail(rows_without_popularity).index, inplace=True)
df['artist_popularity'] = popularities

## Previous Artist Hits

This gets an artist's previous hits. It adds up all the artist's hit songs in the dataset that are in the same year or previous years, not counting that song, if it's a hit.

In [None]:
# For every row: number of hits by the same artist released in the same or an
# earlier year, not counting the row itself when it is a hit.
#
# Vectorized replacement for the per-row filter + groupby:
#   1. sum hits per (artist, year);
#   2. cumulate those sums over years within each artist (groupby sorts the
#      keys, so years are visited in ascending order, matching the `<=` filter);
#   3. look the cumulative value up for each row and subtract the row's own hit.
hits_per_artist_year = df.groupby(['artist_id', 'year'])['hit'].sum()
cumulative_hits = hits_per_artist_year.groupby(level='artist_id').cumsum()

row_keys = pd.MultiIndex.from_frame(df[['artist_id', 'year']])
hits_up_to_year = cumulative_hits.reindex(row_keys).to_numpy()

# Don't count this song if it's a hit
num_hits = hits_up_to_year - (df['hit'] == 1).to_numpy().astype(int)

df['artist_num_hits'] = num_hits

The polarity, subjectivity, and lyric arrays are added to our dataset as columns. Rows with incomplete data are dropped.

In [None]:
# The sentiment/lyric lists were built from the untrimmed frame, so they can be
# longer than df once tail rows have been dropped. Slice them to the current
# length rather than using a hard-coded `[:-26]` (which would also yield an
# empty list if nothing had been trimmed).
n_rows = len(df)
df['polarity'] = polarity[:n_rows]
df['subjectivity'] = subjectivity[:n_rows]
df['lyrics'] = lyrics[:n_rows]
df['lyric_length'] = df['lyrics'].str.len()

In [None]:
# Rows whose 'id' repeats an earlier row (duplicated() marks all but the first occurrence).
dupl_df = df[df.duplicated(subset='id')].sort_values(by='id', axis=0)
# Number of distinct ids vs. total number of rows.
print(len(set(df['id'].tolist())))
print(len(df['id'].tolist()))
# Keep only the first row for each 'id'.
df.drop_duplicates(subset='id', inplace=True)

In [None]:
# Checkpoint the enriched frame (popularity, hit counts, lyrics, sentiment).
with open('all_data.pickle', 'wb') as f:
    pickle.dump(df, f)

In [92]:
import pickle

# Restore the enriched frame from the checkpoint written above.
# pickle.load executes arbitrary code; only load files this notebook produced.
with open('all_data.pickle', 'rb') as f:
    df = pickle.load(f)

# Audio Features

In [None]:
from tqdm import tqdm
import requests


# Download each track's preview clip to ./songs/<id>.mp3. Tracks whose preview
# cannot be obtained get an empty file, so every processed id has a file on disk.
for i, song in tqdm(df.iterrows(), total=df.shape[0]):
    track_id = song['id']
    # pd.isna covers both None and NaN; rows that never received audio
    # features may carry NaN here, which an `is None` test lets through.
    if pd.isna(track_id):
        continue

    audio_bytes = b''
    try:
        track = spotify.tracks(tracks=[track_id])['tracks'][0]
        preview_url = track['preview_url']
        if preview_url:
            # Time out instead of hanging the loop on a stalled connection, and
            # treat HTTP errors as "no preview" rather than saving the error body.
            response = requests.get(preview_url, timeout=30)
            response.raise_for_status()
            audio_bytes = response.content
    except Exception:
        # Best effort: fall back to an empty placeholder file.
        # `Exception` (not a bare except) keeps kernel interrupts working.
        audio_bytes = b''

    with open('./songs/' + track_id + '.mp3', 'wb') as f:
        f.write(audio_bytes)

In [None]:
# Probe cell: read one preview clip and print its sampling rate and samples.
# NOTE(review): `audioBasicIO` is not imported in any visible cell — confirm
# where it comes from before running.
# NOTE(review): this rebinds `song_data`, which earlier held the song-list
# DataFrame; re-running the non-hit collection afterwards would use the audio data.
with open('./00s8dO3RWrFkBqC9JIy6ag.mp3', 'rb') as f:
    print(f)
    fs, song_data = audioBasicIO.read_audio_generic(f)

print(fs)
print(song_data)

In [93]:
# Songs without lyrics get length 0 and an empty lyric string instead of NaN.
df['lyric_length'] = df['lyric_length'].fillna(0)
df['lyrics'] = df['lyrics'].fillna('')
df.dropna(subset=['danceability'], inplace=True) # Drops all rows that don't have basic features for whatever reason

In [94]:
# Row count after dropping songs without basic audio features.
print(len(df))

41909


In [95]:
# Load the precomputed zero-crossing values and attach them as a column.
# assumes the pickled sequence has one entry per row of df, in df's row order — TODO confirm
with open('./audio_features/zero_crossings.pickle', 'rb') as f:
    zero_crossings = pickle.load(f)

df['zero_crossings'] = zero_crossings

In [96]:
# Load the precomputed mel features and attach them as a column.
# assumes the pickled sequence has one entry per row of df, in df's row order — TODO confirm
with open('./audio_features/mels.pickle', 'rb') as f:
    mels = pickle.load(f)

df['mels'] = mels

In [97]:
# Load the precomputed 'mfs' features and attach them as a column.
# NOTE(review): in the recorded run this cell failed with
# "UnpicklingError: pickle data was truncated", so the 'mfs' column was never
# added; the pickle file needs to be regenerated before this cell can succeed.
with open('./audio_features/mfs.pickle', 'rb') as f:
    mfs = pickle.load(f)

df['mfs'] = mfs

UnpicklingError: pickle data was truncated

In [98]:
# Load the precomputed chroma features and attach them as a column.
# assumes the pickled sequence has one entry per row of df, in df's row order — TODO confirm
with open('./audio_features/chromas.pickle', 'rb') as f:
    chromas = pickle.load(f)

df['chromas'] = chromas

In [99]:
# Number of rows with no zero-crossing value (i.e. no extracted audio features).
df['zero_crossings'].isna().sum()

13994

In [12]:
# This drops rows without every value (drops rows that don't have lyrics)
# This is necessary for doing embeddings

# df.dropna(subset=[n for n in df if n != 'isrc'], inplace=True)

In [100]:
# Keep songs released strictly after 1957 and strictly before 2011.
# 'year' is stored as text, so it is converted once and reused for the range test.
release_years = df.year.astype(int)
df = df[release_years.between(1958, 2010)]

In [101]:
# Only rows with both a zero-crossing value and mel features are kept for modelling.
df.dropna(subset=['zero_crossings', 'mels'], inplace=True) # Drops all rows that don't have audio features

We now have our completed dataset.

In [102]:
# Summary statistics of the numeric columns in the final dataset.
df.describe()

Unnamed: 0,explicit,hit,current_popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,...,valence,tempo,duration_ms,time_signature,artist_popularity,artist_num_hits,polarity,subjectivity,lyric_length,zero_crossings
count,22740.0,22740.0,22740.0,22740.0,22740.0,22740.0,22740.0,22740.0,22740.0,22740.0,...,22740.0,22740.0,22740.0,22740.0,22740.0,22740.0,18728.0,18728.0,22740.0,22740.0
mean,0.050836,0.515611,25.745471,0.560486,0.632077,5.30708,-8.998418,0.692436,0.07646,0.28652,...,0.546271,121.359167,236206.5,3.909719,48.319613,3.879991,0.113119,0.489335,18371.3646,72429.07555
std,0.219667,0.499767,21.227784,0.172875,0.230512,3.577791,4.143219,0.461495,0.093986,0.308756,...,0.258915,28.512912,97487.79,0.389834,21.490694,7.862817,0.185765,0.161196,79332.841236,23596.246983
min,0.0,0.0,0.0,0.0,2.6e-05,0.0,-52.243,0.0,0.0,0.0,...,0.0,0.0,11520.0,0.0,0.0,0.0,-1.0,0.0,0.0,1.0
25%,0.0,0.0,7.0,0.444,0.469,2.0,-11.3115,0.0,0.0332,0.0187,...,0.34,99.7235,179627.0,4.0,34.0,0.0,0.0,0.428488,609.0,56113.75
50%,0.0,1.0,21.0,0.572,0.6565,5.0,-8.3445,1.0,0.0439,0.152,...,0.557,120.029,223193.5,4.0,51.0,1.0,0.096875,0.500384,1137.0,71679.5
75%,0.0,1.0,42.0,0.687,0.824,9.0,-5.975,1.0,0.0734,0.518,...,0.764,138.06875,270203.2,4.0,65.0,4.0,0.221442,0.57573,2060.0,87627.0
max,1.0,1.0,85.0,0.986,1.0,11.0,4.072,1.0,0.962,0.996,...,0.99,233.429,3079158.0,5.0,100.0,93.0,1.0,1.0,824082.0,305996.0


In [None]:
#%pip install seaborn
import seaborn as sns

# Mean lyric polarity per release year. Only the 'polarity' column is
# aggregated: averaging the whole frame would also hit text/array columns,
# which newer pandas versions refuse to average.
data_df = df.groupby('year')['polarity'].mean().reset_index()
data_df = data_df[data_df['year'].astype(int) > 1960]

# x/y are passed by keyword; recent seaborn releases no longer accept them positionally.
sns_plot = sns.regplot(
    x=data_df['year'].astype(int),
    y=data_df['polarity'],
    order=1,
    ci=None,
)
# sns_plot.figure.savefig('polarity.pdf')

# Data Preparation

We extract our target variable (hit or not) from the dataset. Then, we drop irrelevant and non-numerical columns from the dataset to form our x inputs.

The dataset is then split into training and validation sets. The new datasets are not shuffled to ensure that the lyrics correspond to the correct auxiliary variables.

In [200]:
import sklearn as sk
from sklearn.model_selection import train_test_split

# Target: the hit flag. Inputs: everything except identifiers, free text,
# URLs, the popularity snapshot and the raw audio-feature columns.
excluded_columns = [
    'hit', 'artist', 'artist_id', 'isrc', 'lyrics', 'song', 'id',
    'spotify_track_id', 'spotify_track_album', 'analysis_url', 'uri',
    'track_href', 'type', 'current_popularity', 'mels', 'chromas',
    'zero_crossings',
]
y_data = df['hit']
x_data = df.drop(columns=excluded_columns)
print(x_data.columns)

# One unshuffled 80/20 split over inputs, target and lyrics together, so the
# three stay row-aligned.
(
    x_data_train, x_data_test,
    y_data_train, y_data_test,
    lyrics_train, lyrics_eval,
) = train_test_split(x_data, y_data, df['lyrics'], test_size=0.2, shuffle=False)

for split_part in (y_data_train, lyrics_train, x_data_train):
    print(len(split_part))
print(y_data_train.describe())

Index(['year', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'duration_ms', 'time_signature',
       'artist_popularity', 'artist_num_hits', 'polarity', 'subjectivity',
       'lyric_length'],
      dtype='object')
18192
18192
18192
count    18192.000000
mean         0.518470
std          0.499672
min          0.000000
25%          0.000000
50%          1.000000
75%          1.000000
max          1.000000
Name: hit, dtype: float64


In [201]:
# Impute missing sentiment scores with the *training* mean in both splits, so
# no statistic from the test split is used.
polarity_mean = x_data_train['polarity'].mean()
subjectivity_mean = x_data_train['subjectivity'].mean()

pd.options.mode.chained_assignment = None  # default='warn'

for sentiment_column, train_mean in (
    ('polarity', polarity_mean),
    ('subjectivity', subjectivity_mean),
):
    for split_frame in (x_data_train, x_data_test):
        split_frame[sentiment_column] = split_frame[sentiment_column].fillna(train_mean)

In [202]:
import numpy as np

# Flatten each song's stored feature array and stack them into one float32
# matrix per feature type (one row per song).
mels = np.asarray([song_mels.flatten() for song_mels in df['mels']]).astype('float32')

chromas = np.asarray([song_chromas.flatten() for song_chromas in df['chromas']]).astype('float32')

In [203]:
# Same unshuffled 80/20 split as for the auxiliary inputs, applied to the
# flattened mel and chroma matrices.
mels_train, mels_test, chromas_train, chromas_test = train_test_split(
    mels, chromas, test_size=0.2, shuffle=False
)

We then scale our auxiliary input variables using the StandardScaler.

In [204]:
from sklearn.preprocessing import StandardScaler

# Every scaler is fitted on the training split only and then applied to both
# splits, so no test-set statistics reach the transform.

# Auxiliary inputs: keep column names and row labels after scaling.
scaler_model = StandardScaler().fit(x_data_train)
x_data_train, x_data_test = (
    pd.DataFrame(
        scaler_model.transform(split_frame),
        columns=split_frame.columns,
        index=split_frame.index,
    )
    for split_frame in (x_data_train, x_data_test)
)

# Mel features.
mel_scaler_model = StandardScaler().fit(mels_train)
mels_train, mels_test = (
    pd.DataFrame(mel_scaler_model.transform(split_array))
    for split_array in (mels_train, mels_test)
)

# Chroma features.
chroma_scaler_model = StandardScaler().fit(chromas_train)
chromas_train, chromas_test = (
    pd.DataFrame(chroma_scaler_model.transform(split_array))
    for split_array in (chromas_train, chromas_test)
)

# Simple Classification Models

The below code blocks show the results of the classification task using the auxiliary variables alone as inputs into various classifier models.

In [192]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline on the scaled auxiliary inputs.
LR = LogisticRegression(random_state=0)
LR.fit(x_data_train, y_data_train)

# Report accuracy, then (precision, recall, F1, support) and the confusion matrix.
lr_accuracy = round(LR.score(x_data_test, y_data_test), 4)
print(lr_accuracy)
lr_preds = LR.predict(x_data_test)
for lr_report in (
    sk.metrics.precision_recall_fscore_support(y_data_test, lr_preds, average='binary'),
    sk.metrics.confusion_matrix(y_data_test, lr_preds),
):
    print(lr_report)

0.8162
(0.8408984557791296, 0.7836894897514174, 0.8112866817155756, None)
[[1915  340]
 [ 496 1797]]


In [193]:
# Import SVC explicitly: `import sklearn as sk` does not load the `svm`
# submodule by itself, so `sk.svm` only resolves if some other import happened
# to pull it in first.
from sklearn.svm import SVC

# Support-vector classifier (library defaults) on the scaled auxiliary inputs.
SVM = SVC()
SVM.fit(x_data_train, y_data_train)
print(round(SVM.score(x_data_test, y_data_test), 4))
svm_preds = SVM.predict(x_data_test)
# (precision, recall, F1, support) for the positive class, then the confusion matrix.
print(sk.metrics.precision_recall_fscore_support(y_data_test, svm_preds, average='binary'))
print(sk.metrics.confusion_matrix(y_data_test, svm_preds))

0.8245
(0.8417924096936442, 0.8028783253379852, 0.821875, None)
[[1909  346]
 [ 452 1841]]


In [194]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest baseline (500 trees, depth capped at 100) on the scaled auxiliary inputs.
RF = RandomForestClassifier(n_estimators=500, max_depth=100, random_state=0)
RF.fit(x_data_train, y_data_train)

# Report accuracy, then (precision, recall, F1, support) and the confusion matrix.
rf_accuracy = round(RF.score(x_data_test, y_data_test), 4)
print(rf_accuracy)
rf_preds = RF.predict(x_data_test)
for rf_report in (
    sk.metrics.precision_recall_fscore_support(y_data_test, rf_preds, average='binary'),
    sk.metrics.confusion_matrix(y_data_test, rf_preds),
):
    print(rf_report)

0.8533
(0.8421717171717171, 0.8726559092891408, 0.8571428571428571, None)
[[1880  375]
 [ 292 2001]]


In [195]:
from sklearn.neural_network import MLPClassifier

# Small multilayer perceptron (three hidden layers of 10 units) on the scaled auxiliary inputs.
NN = MLPClassifier(
    solver='adam',
    alpha=0.01,
    hidden_layer_sizes=(10, 10, 10),
    random_state=1,
    activation='relu',
)
NN.fit(x_data_train, y_data_train)

# Report accuracy, then (precision, recall, F1, support) and the confusion matrix.
nn_accuracy = round(NN.score(x_data_test, y_data_test), 4)
print(nn_accuracy)
nn_preds = NN.predict(x_data_test)
for nn_report in (
    sk.metrics.precision_recall_fscore_support(y_data_test, nn_preds, average='binary'),
    sk.metrics.confusion_matrix(y_data_test, nn_preds),
):
    print(nn_report)

0.8404
(0.8332624415142492, 0.8543392935019625, 0.8436692506459947, None)
[[1863  392]
 [ 334 1959]]


In [196]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes (library defaults) on the scaled auxiliary inputs.
NB = BernoulliNB()
NB.fit(x_data_train, y_data_train)

# Report accuracy, then (precision, recall, F1, support) and the confusion matrix.
nb_accuracy = round(NB.score(x_data_test, y_data_test), 4)
print(nb_accuracy)
nb_preds = NB.predict(x_data_test)
for nb_report in (
    sk.metrics.precision_recall_fscore_support(y_data_test, nb_preds, average='binary'),
    sk.metrics.confusion_matrix(y_data_test, nb_preds),
):
    print(nb_report)

0.7887
(0.8024523160762943, 0.7706061927605756, 0.786206896551724, None)
[[1820  435]
 [ 526 1767]]


In [None]:
from embeddings import EmbeddingGenerator

# EmbeddingGenerator is a project-local helper (embeddings.py); the notes below
# describe only how it is used in this cell.
# presumably 'glove.840B.300d.txt' is a pretrained GloVe vector file — confirm
embedding = EmbeddingGenerator(
    max_sequence_len=100,
    filename='glove.840B.300d.txt'
)

print('Pretrained embedding loaded')
print('Embedding dimension:', embedding.embedding_dimension)

# The word index is built from the training lyrics only, so the evaluation
# lyrics do not influence the vocabulary.
embedding.generate_word_index(texts=lyrics_train)  # Fits Tokenizer on words in X training data
print('Word index:', len(embedding.word_index))

# The below generate_sequences lines converts the words into integers,
# using the word indexes from the training data
seqs_train = embedding.generate_sequences(lyrics_train)
seqs_eval = embedding.generate_sequences(lyrics_eval)

print('Sequences created')

embedding.generate_embedding_matrix()

print('Vocab size:', embedding.vocab_size)
print(embedding.embedding_matrix.shape)

# Checkpoint the fitted generator so the steps above can be skipped next time.
with open('embedding-audio.pickle', 'wb') as f:
    pickle.dump(embedding, f)

In [21]:
# Restore the fitted embedding generator from its checkpoint and rebuild the
# integer sequences for the current train/eval lyric splits.
with open('embedding-audio.pickle', 'rb') as f:
    embedding = pickle.load(f)

seqs_train = embedding.generate_sequences(lyrics_train)
seqs_eval = embedding.generate_sequences(lyrics_eval)

In [205]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, Embedding, \
    Input, Flatten, MaxPooling1D, Conv1D, concatenate, LSTM
from tensorflow.keras.models import Model

In [206]:
print('Building model...')

# With alpha=0.00 this is a plain ReLU despite the "leaky" name.
lrelu = lambda x: tf.keras.activations.relu(x, alpha=0.00)

# Branch 1: auxiliary (tabular) features -> one wide dense layer.
aux_input = Input(shape=(len(x_data_train.columns),))
aux = Dense(400, activation=lrelu)(aux_input)
aux = BatchNormalization()(aux)
aux = Model(inputs=aux_input, outputs=aux)

# Branch 2: lyric token sequences -> pretrained embedding -> Conv1D -> LSTM,
# followed by a stack of dense blocks.
emb_input = Input(shape=(embedding.max_sequence_len,))
emb = Embedding(
    embedding.vocab_size,
    embedding.embedding_dimension,
    input_length=embedding.max_sequence_len,
    weights=[embedding.embedding_matrix],
    trainable=True,
)(emb_input)
emb = Conv1D(filters=16, kernel_size=4, activation=lrelu)(emb)
emb = LSTM(50)(emb)
emb = Flatten()(emb)
emb = BatchNormalization()(emb)
# Six identical Dropout -> Dense(200) -> BatchNorm blocks.
for emb_block in range(6):
    emb = Dropout(.2)(emb)
    emb = Dense(200, activation=lrelu)(emb)
    emb = BatchNormalization()(emb)
emb = Model(inputs=emb_input, outputs=emb)

# Combined: concatenate both branches, then five Dense(100) -> BatchNorm ->
# Dropout blocks, a linear Dense(100) and the sigmoid output unit.
combined = concatenate([aux.output, emb.output])
z = combined
for head_block in range(5):
    z = Dense(100, activation=lrelu)(z)
    z = BatchNormalization()(z)
    z = Dropout(.2)(z)
z = Dense(100)(z)
z = Dense(1, activation="sigmoid")(z)

model = Model(inputs=[aux.input, emb.input], outputs=z)

optimizer = keras.optimizers.SGD(0.01) # Try RMSprop too?

model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['acc', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

# Stop after 20 epochs without val_loss improvement; cut the learning rate
# to 20% after 10 such epochs (not below 1e-4).
es = EarlyStopping(monitor='val_loss', mode='min', patience=20, verbose=1)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.0001, verbose=1)

print(x_data_train.shape)
print(seqs_train.shape)
# print(model.summary())

Building model...
(18192, 20)
(18192, 100)


In [207]:
# Train on [auxiliary features, lyric sequences]; the held-out split is used
# as validation data, which drives both the early-stopping and the
# learning-rate-reduction callbacks.
history = model.fit(
    [x_data_train, seqs_train],
    y_data_train,
    validation_data=([x_data_test, seqs_eval], y_data_test),
    batch_size=64,
    epochs=200,
    shuffle=True,
    verbose=1,
    callbacks=[es, rlr]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 00041: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 00065: ReduceLROnPlateau reducing learning rate to 0.0003999999724328518.
Epoc

In [175]:
print('Building model...')

# With alpha=0.00 this is a plain ReLU despite the "leaky" name.
lrelu = lambda x: tf.keras.activations.relu(x, alpha=0.00)

# The branch models get their own names (`mels_branch`, `chromas_branch`) so
# they no longer overwrite the `mels` / `chromas` feature arrays; rebinding
# those names made the earlier split/scale cells fail when re-run.

# Mel branch: input of shape (128, 1) fed to a small LSTM.
mels_input = Input(shape=(128, 1))
# mels = Conv1D(filters=64, kernel_size=4, activation=lrelu)(mels_input)
mels_features = LSTM(10)(mels_input)
mels_branch = Model(inputs=mels_input, outputs=mels_features)

# Chroma branch: input of shape (12, 1) fed to a small LSTM.
chromas_input = Input(shape=(12, 1))
# chromas = Conv1D(filters=64, kernel_size=2, activation=lrelu)(chromas_input)
chromas_features = LSTM(10)(chromas_input)
chromas_branch = Model(inputs=chromas_input, outputs=chromas_features)

# Combined: three Dense(200) -> BatchNorm -> Dropout blocks, a linear
# Dense(200) and the sigmoid output unit.
z = concatenate([mels_branch.output, chromas_branch.output])
for head_block in range(3):
    z = Dense(200, activation=lrelu)(z)
    z = BatchNormalization()(z)
    z = Dropout(.2)(z)
z = Dense(200)(z)
z = Dense(1, activation="sigmoid")(z)

# NOTE: this rebinds `model`; later cells operate on whichever model was built last.
model = Model(inputs=[mels_branch.input, chromas_branch.input], outputs=z)

optimizer = keras.optimizers.SGD(0.01) # Try RMSprop too?

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['acc', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

# Stop after 20 epochs without val_loss improvement; cut the learning rate
# to 20% after 10 such epochs (not below 1e-4).
es = EarlyStopping(monitor='val_loss', mode='min', patience=20, verbose=1)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.0001, verbose=1)

# print(model.summary())

Building model...


In [176]:
# Train the audio-only model on [mel, chroma] inputs with the held-out split
# as validation data. This rebinds `history`, so the plotting cells show
# whichever model was trained last.
history = model.fit(
    [mels_train, chromas_train],
    y_data_train,
    validation_data=([mels_test, chromas_test], y_data_test),
    batch_size=64,
    epochs=200,
    shuffle=True,
    verbose=1,
    callbacks=[es, rlr]
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 00015: ReduceLROnPlateau reducing learning rate to 0.0019999999552965165.
Epoch 16/200
Epoch 17/200

KeyboardInterrupt: 

In [None]:
# Loss and compiled metrics on both splits.
pred_train = model.evaluate([x_data_train, seqs_train], y_data_train)
pred_test = model.evaluate([x_data_test, seqs_eval], y_data_test)

# The model outputs sigmoid probabilities in (0, 1). Casting them straight to
# int truncates every prediction to 0, so threshold at 0.5 first to obtain
# 0/1 class labels.
y_pred_train = (model.predict([x_data_train, seqs_train]) > 0.5).astype('int').flatten()
y_pred = (model.predict([x_data_test, seqs_eval]) > 0.5).astype('int').flatten()

In [None]:
#%pip install matplotlib
import matplotlib.pyplot as plt

# Training vs. validation loss per epoch for the most recent fit.
fig, ax = plt.subplots()
for loss_key in ('loss', 'val_loss'):
    ax.plot(history.history[loss_key])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch for the most recent fit.
fig, ax = plt.subplots()
for accuracy_key in ('acc', 'val_acc'):
    ax.plot(history.history[accuracy_key])
ax.set_title('Model Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# %pip install seaborn
import seaborn as sns
from scipy.stats import pearsonr
from matplotlib import pyplot as plt

# Pearson correlation (and its p-value) between each input column and the hit label.
# x_data still contains NaNs (e.g. sentiment scores for songs without lyrics);
# rows that are NaN in a column are excluded for that column, otherwise its
# correlation would come out as NaN.
corr_values = []
p_values = []
for x in x_data.columns:
    column_values = x_data[x].astype(float)
    has_value = column_values.notna()
    r_value, p_value = pearsonr(column_values[has_value], y_data[has_value])
    corr_values.append(r_value)
    p_values.append(p_value)

# Single-column frames (column label 0), one row per input feature, built in
# one step rather than grown cell by cell.
df_corr = pd.DataFrame({0: corr_values}, index=x_data.columns) # Correlation matrix
df_p = pd.DataFrame({0: p_values}, index=x_data.columns)  # Matrix of p-values

# mask = np.triu(np.ones_like(df_corr, dtype=np.bool))
cmap = sns.diverging_palette(240, 10, as_cmap=True)


def _plot_feature_heatmap(values_df, title):
    """Draw a one-column heatmap of `values_df` with the given title and show it."""
    plt.figure(figsize=(8, 7))
    # plt.rc('xtick', labelsize=10)
    # plt.rc('ytick', labelsize=10)
    heatmap_ax = sns.heatmap(values_df, cmap=cmap, center=0, square=True, linewidths=.5, cbar_kws={"shrink": .6})
    heatmap_ax.set_title(title)
    # NOTE(review): presumably a workaround for heatmap rows being clipped in
    # some matplotlib releases — confirm it is still needed.
    bottom, top = heatmap_ax.get_ylim()
    heatmap_ax.set_ylim(bottom + 0.5, top - 0.5)
    plt.tight_layout()
    plt.show()


_plot_feature_heatmap(df_corr, 'Pearson Correlation Matrix')
_plot_feature_heatmap(df_p, 'P-Value Matrix from Pearson Correlation')