# Train a model

Train a model to predict a song's genre from its lyrics.

In [1]:
import pandas as pd
import numpy as np
import pickle
import datetime
# ISO-formatted (YYYY-MM-DD) date stamp; embedded in the file names of the
# artifacts pickled later in this notebook.
today = datetime.date.today().isoformat()
print(today)

2019-04-07


# Load the data

In [2]:
# Read the full song dump, then keep an independent copy of just the
# columns this notebook works with.
raw = pd.read_json("./data/lyrics/all_songs.json")
columns_of_interest = ['title', 'artist', 'genre', 'lyrics']
songs = raw.loc[:, columns_of_interest].copy()
songs.head(2)

Unnamed: 0,title,artist,genre,lyrics
0,16 on Death Row,2Pac,rap,Death Row\nThat's where mothafuckas is endin' ...
1,1995 Police Station Testimony,2Pac,rap,"Woman – Sir, will you raise your right hand, p..."


# Prep the vectorizer

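Lyrics are converted to TF-IDF features: lowercased, accent-stripped unigrams and bigrams, with English stop words removed.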

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [5]:
# TF-IDF over lowercased, accent-stripped unigrams and bigrams, skipping
# NLTK's English stop-word list.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words=english_stop_words,
    lowercase=True,
    strip_accents='unicode',
)

# Genre classification model

## Data augmentation

  - Split each song's lyrics into several shorter passages so that one song yields multiple training examples

In [87]:
def split_up_lyrics(lyrics, num_chars=280):
    """Split one song's lyrics into consecutive chunks of at most ``num_chars`` characters.

    Each chunk can then be used as a separate training example that carries
    the genre label of the song it came from.

    Chunks are plain fixed-width slices: they are taken in order, do not
    overlap, and concatenating them reproduces ``lyrics`` exactly. A chunk
    boundary may therefore fall in the middle of a word.

    Args:
        lyrics: The full lyrics of a single song as a string. ``None`` or an
            empty string yields no chunks.
        num_chars: Maximum length of each chunk, in characters. Must be a
            positive integer.

    Returns:
        A list of strings. Every chunk has exactly ``num_chars`` characters
        except possibly the last, which holds the remainder.

    Raises:
        ValueError: If ``num_chars`` is not positive.
    """
    if num_chars <= 0:
        raise ValueError("num_chars must be a positive integer")
    # Missing or empty lyrics contribute no training examples.
    if not lyrics:
        return []
    return [
        lyrics[start:start + num_chars]
        for start in range(0, len(lyrics), num_chars)
    ]

# Split into train/test data

In [6]:
# Split data into train/test sets (70% train / 30% test).
# A fixed seed keeps the split -- and every metric computed from it --
# reproducible across kernel restarts.
songs_train = songs.sample(frac=.7, random_state=42)
# Every row not sampled for training goes to the test set. drop() keeps the
# original row order rather than the arbitrary iteration order of a set.
songs_test = songs.drop(songs_train.index)
# Sanity check: the two sets must partition the data exactly.
assert len(songs_train) + len(songs_test) == len(songs), \
    "train/test split lost or duplicated rows"
print(f"Training shape: {songs_train.shape}")
print(f"Testing shape: {songs_test.shape}")

Training shape: (38509, 4)
Testing shape: (16504, 4)


In [7]:
# Learn the TF-IDF vocabulary and weights from the training lyrics and
# encode those same lyrics as the training feature matrix.
y_train = songs_train["genre"]
X_train = vectorizer.fit_transform(songs_train["lyrics"])

In [8]:
# Encode the test lyrics with the already-fitted vectorizer (transform only,
# no refit).
y_test = songs_test["genre"]
X_test = vectorizer.transform(songs_test["lyrics"])

In [13]:
# Save the trained vectorizer
# The context manager ensures the file is flushed and closed, even if
# pickling raises part-way through.
with open(f"tfidf_vectorizer_{today}.pkl", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

## Train the model

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [15]:
# Random forest of 25 trees. The fixed seed makes the fitted model (and the
# scores reported below) reproducible from run to run.
clf = RandomForestClassifier(n_estimators=25, random_state=42)

In [None]:
%%time
clf.fit(X_train, y_train,)

In [None]:
%%time
pred = clf.predict(X_test)

In [None]:
# Fraction of test songs whose predicted genre equals the true genre.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print("accuracy:   %0.3f" % (score,))

In [None]:
# Per-genre breakdown of the test predictions (rows: true genre,
# columns: predicted genre).
print("confusion matrix:")
confusion = metrics.confusion_matrix(y_true=y_test, y_pred=pred)
print(confusion)

In [None]:
# Save the trained model
# The context manager ensures the file is flushed and closed, even if
# pickling raises part-way through.
with open(f"model_rfc_{today}.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Add the vectorizer to the trained model
# Attaching the fitted vectorizer as an attribute means this one pickle holds
# both the classifier and the TF-IDF transform it was trained with.
# NOTE: this mutates `clf` in place, so any pickle of `clf` written after this
# cell runs will also contain the vectorizer.
clf.vectorizer = vectorizer
# The context manager ensures the file is flushed and closed, even if
# pickling raises part-way through.
with open(f"model_rfc_with_vectorizer_{today}.pkl", "wb") as bundle_file:
    pickle.dump(clf, bundle_file)

# IDEA:

 - Get extra training data by training the model on subsections (140 chars?) of each song