# Train a model

Train a model to predict genre based on lyrics

In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime
today = str(datetime.date.today())
print(today)

2019-04-07


# Load the data

In [2]:
%%time
raw = pd.read_json("./data/lyrics/all_songs.json")
songs = raw[['title', 'artist', 'genre', 'lyrics']].copy()
songs.head(2)

CPU times: user 27.7 s, sys: 7.87 s, total: 35.6 s
Wall time: 45.8 s


# Prep the vectorizer

TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [99]:
vectorizer = TfidfVectorizer(lowercase=True,
                             strip_accents='unicode',
                             ngram_range=(1, 2),
                             max_features=1000,
                             stop_words=None)
                             #stop_words=stopwords.words('english'))

# Genre classification model

## Data augmentation

  - Split songs into multiple training examples by splitting up lyrics

In [87]:
def split_up_lyrics(lyrics, num_chars=280):
    pass

# Split into train/test data

In [101]:
# Split data into train/test sets
RS = 0
songs_train = songs.sample(frac=.85, random_state=RS)
songs_test = songs.loc[list(set(songs.index).difference(songs_train.index))]
print(f"Training shape: {songs_train.shape}")
print(f"Testing shape: {songs_test.shape}")

Training shape: (46761, 4)
Testing shape: (8252, 4)


In [102]:
%%time
# Train the TF-IDF vectorizer on the training data
X_train = vectorizer.fit_transform(songs_train.lyrics)
y_train = songs_train.genre

CPU times: user 1min 12s, sys: 3.16 s, total: 1min 15s
Wall time: 1min 17s


In [103]:
%%time
# Project the test lyrics into the TF-IDF space
X_test = vectorizer.transform(songs_test.lyrics)
y_test = songs_test.genre

CPU times: user 8.33 s, sys: 288 ms, total: 8.62 s
Wall time: 8.79 s


In [104]:
vectorizer.stop_words_ = None

In [105]:
# Save the trained vectorizer
pickle.dump(vectorizer, open(f"tfidf_vectorizer_{today}.pkl", "wb"))

## Train the model

In [93]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [106]:
clf = RandomForestClassifier(n_estimators=50, n_jobs=-1)

In [107]:
%%time
clf.fit(X_train, y_train)

CPU times: user 4min 25s, sys: 2.49 s, total: 4min 28s
Wall time: 1min 27s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [110]:
pred = clf.predict(X_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)

accuracy:   0.801


In [112]:
# Add the vectorizer to the trained model
clf.vectorizer = vectorizer
pickle.dump(clf, open(f"model_rfc_with_vectorizer_{today}.pkl", "wb"))