# Notebook 4 - Modeling

This notebook splits the data into training and test sets, trains and tests the selected models, and identifies the final model.

In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from ast import literal_eval
from scipy import sparse
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.pipeline import Pipeline
import pickle

In [12]:
# Load the preprocessed lyrics/genre table.
# The "tokens" and "genre" columns are parsed with literal_eval so each cell is
# read back as a Python object rather than as its string representation.
# The first two columns (index and unnamed index columns) are not needed and
# are dropped by position right after reading.
preproc_df = pd.read_csv(
    "data/genre_prepped.csv.gz",
    compression="gzip",
    converters={"tokens": literal_eval, "genre": literal_eval},
).pipe(lambda frame: frame.drop(columns=frame.columns[:2]))

# Preview the first five rows
preproc_df.head(5)

Unnamed: 0,artist,title,lyrics,genre,tokens,lyrics_clean
0,Taylor Swift,​betty,"Betty, I won't make assumptions\nAbout why you...",[country],"[betty, make, assumptions, switched, homeroom,...",betty make assumptions switched homeroom think...
1,John Denver,"Take Me Home, Country Roads","Almost Heaven, West Virginia\nBlue Ridge Mount...",[country],"[almost, heaven, west, virginia, blue, ridge, ...",almost heaven west virginia blue ridge mountai...
2,Post Malone,Feeling Whitney,"I've been looking for someone...\nOoh, ooh, oo...",[country],"[looking, someone, ooh, ooh, ooh, ooh, ooh, oo...",looking someone ooh ooh ooh ooh ooh oohooh ooh...
3,Cam,Burning House,\n[Verse 1]\nI had a dream about a burning hou...,[country],"[dream, burning, house, stuck, inside, get, la...",dream burning house stuck inside get laid besi...
4,Johnny Cash,Folsom Prison Blues,"I hear the train a-comin', it's rolling 'round...",[country],"[hear, train, acomin, rolling, round, bend, ai...",hear train acomin rolling round bend aint seen...


# Format the labels

In [13]:
# MultiLabelBinarizer is fed one collection of labels per row, so store each
# song's parsed genre value as a set in a new "genre_set" column.
preproc_df["genre_set"] = preproc_df["genre"].apply(set)

In [14]:
# Learn the genre classes from the label sets, then encode every row as a
# binary indicator vector with one column per class.
mlb = MultiLabelBinarizer()
mlb.fit(preproc_df["genre_set"])
genre_array = mlb.transform(preproc_df["genre_set"])

# Show the encoded label matrix
genre_array

array([[1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

In [15]:
# Convert label array to sparse matrix
# (a CSR copy of the indicator array produced by MultiLabelBinarizer above).
# NOTE(review): in the cells visible here the train/test split is given
# `genre_array`, not `genre_sparse` - confirm whether this copy is still needed.
genre_sparse = sparse.csr_matrix(genre_array)

# Vectorize text input
We'll use TF-IDF vectors for now

In [16]:
# Build TF-IDF features from the cleaned lyrics using the vectorizer's default
# settings; the result is a sparse document-term matrix.
# NOTE(review): the vectorizer is fitted on every row of preproc_df here,
# i.e. before any train/test split has taken place.
tfidf_vectorizer = TfidfVectorizer()
text_sparse = tfidf_vectorizer.fit_transform(
    raw_documents=preproc_df['lyrics_clean']
)

# Splitting the data into training and testing
Using 70/30 train/test split.


In [17]:
# Split the cleaned lyrics and the label matrix first (70/30, fixed seed), so
# that the TF-IDF vocabulary and IDF weights can be learned from the training
# rows only. Fitting the vectorizer on the full corpus before splitting lets
# test-set statistics leak into the training features.
lyrics_train, lyrics_test, y_train, y_test = train_test_split(
    preproc_df['lyrics_clean'], genre_array,
    test_size=0.3, random_state=2023)

# Refit the vectorizer on the training lyrics only; the test lyrics are merely
# transformed with the vocabulary/IDF learned from the training rows.
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(lyrics_train)
X_test = tfidf_vectorizer.transform(lyrics_test)

# Models 

For classification, the selected models are ones commonly used for text classification: K-Nearest Neighbors.

## K-Nearest Neighbors
Train the model

In [18]:
# Set up hyperparameter options
n_neighbors = range(1, 21, 2)  # odd values of k from 1 to 19
weights = ['uniform', 'distance']
# 'minkowski' with the default p=2 is the Euclidean distance, so listing both
# 'euclidean' and 'minkowski' doubled the number of fits without adding a
# distinct candidate. Manhattan was failing to fit, so it is left out.
metric = ['euclidean']

# Setup model pipeline so we can reference it in param_grid.
# MultiOutputClassifier fits one KNeighborsClassifier per genre column;
# n_jobs=-1 lets those per-genre fits run in parallel.
knn_pipe = Pipeline(
    steps=[
        ('knn', MultiOutputClassifier(KNeighborsClassifier(), n_jobs=-1))
    ]
)

# Exhaustive search over the grid, using GridSearchCV's default
# cross-validation and scoring settings (none are overridden here).
knn_tuned = GridSearchCV(
    estimator=knn_pipe,
    param_grid={
        'knn__estimator__n_neighbors': n_neighbors,
        'knn__estimator__weights': weights,
        'knn__estimator__metric': metric,
    },
)
knn_tuned.fit(X_train, y_train)

# Return the best model parameters
knn_tuned.best_params_

{'knn__estimator__metric': 'euclidean',
 'knn__estimator__n_neighbors': 1,
 'knn__estimator__weights': 'uniform'}

Save so we don't need to retrain in the future

In [19]:
# Persist the fitted grid-search object. The context manager guarantees the
# file is flushed and closed even if pickling raises part-way through.
with open('models/knn_tuned.sav', 'wb') as model_file:
    pickle.dump(knn_tuned, model_file)

# Note Delete Later -
For more complex models it will probably be quicker, without losing too much performance, to run RandomizedSearchCV() instead. Everything is the same as for GridSearchCV(), except that param_grid becomes param_distributions and you can set n_iter to control how many randomly sampled hyperparameter combinations are searched.

# Model Results

In [20]:
# Example Load
# Read the saved grid-search object back from disk; the context manager closes
# the file handle as soon as unpickling is done.
# NOTE: pickle.load can execute arbitrary code - only load model files that
# were produced by this project.
with open('models/knn_tuned.sav', 'rb') as model_file:
    knn_load = pickle.load(model_file)

# Predict the genre indicator rows for the held-out test features
knn_load.predict(X_test)

array([[0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 1, 1, 0, 0],
       [0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1]])