In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.util import ngrams

In [3]:
def read_lyrics_data(file_name):
    return pd.read_csv(file_name)

# Model Builder

In [4]:
def get_vocab_words(genre_df, max_features, ngram_range):
    vocab_set = set()
    vectorizer = TfidfVectorizer(max_features = max_features, ngram_range = ngram_range, max_df = .999, min_df = .00001,\
                                stop_words = 'english')
    
    vectorizer.fit_transform(genre_df.Lyrics)
    genre_vocab = vectorizer.vocabulary_
    vocab_set.update(set(genre_vocab.keys()))
    return vocab_set

In [5]:
from sklearn.linear_model import LogisticRegression
def get_logistic_metrics(features, labels):
    penalty = ['l1', 'l2']
    regs = [2, 1, .1, .01]
    
    hyperparameters = dict(C=regs, penalty=penalty)
    model = LogisticRegression(random_state=0, solver = 'saga',class_weight='balanced', multi_class='multinomial', \
                               max_iter=3000)
    
    clf = GridSearchCV(model, hyperparameters, cv=2, verbose=0, n_jobs = 5)
    grid_model = clf.fit(features, labels)
    return grid_model

In [6]:
from sklearn.ensemble import RandomForestClassifier
def get_forest_metrics(features, labels):
    num_trees = [30, 40, 50]
    max_features = ['sqrt', 'log2']
    depth = [11, 13, 15]
    
    hyperparameters = dict(n_estimators=num_trees, max_depth=depth, max_features = max_features)
    model = RandomForestClassifier(min_samples_leaf = int(.05 * len(labels)), random_state=0, class_weight='balanced')
    
    clf = GridSearchCV(model, hyperparameters, cv=2, verbose=0, n_jobs = 5)
    grid_model = clf.fit(features, labels)
    return grid_model

In [7]:
def run_cross_validation(lyrics_df, ngram):
    unique_genres = lyrics_df.Genre.unique()
    
    features = lyrics_df.Lyrics
    labels = lyrics_df.Genre
    vocab_set = set()
    
    for genre in unique_genres: # Get top phrases for each genre
        genre_df = lyrics_df[lyrics_df.Genre == genre]
        vocab = get_vocab_words(genre_df, 300, (ngram, ngram))
        vocab_set.update(vocab)
    
    
    vectorizer = TfidfVectorizer(ngram_range = (ngram, ngram), vocabulary = vocab_set, stop_words = 'english')
    train_features = vectorizer.fit_transform(features)
    
    logistic_grid = get_logistic_metrics(train_features, labels)
   
    forest_grid = get_forest_metrics(train_features, labels)
   
    return logistic_grid, forest_grid

In [10]:
lyrics = read_lyrics_data("LyricsData.csv")
logistic_df = []
forest_df = []

for g in range(1, 7):
    run_cross_validation(lyrics, g)
    logistic_grid, forest_grid = run_cross_validation(lyrics, g)
    
    best_logistic = logistic_grid.best_params_
    best_logistic['gram'] = g
    best_logistic['score'] = logistic_grid.best_score_
    logistic_df.append(best_logistic)
    
    best_forest = forest_grid.best_params_
    best_forest['gram'] = g
    best_forest['score'] = forest_grid.best_score_
    forest_df.append(best_forest)

AttributeError: 'DataFrame' object has no attribute 'Genre'

In [None]:
logistic_df = pd.DataFrame(logistic_df)
logistic_df.to_csv("logistic_metrics.csv", index = False)

forest_df = pd.DataFrame(forest_df)
forest_df.to_csv("forest_metrics.csv", index = False)

In [11]:
lyrics

Unnamed: 0.1,Unnamed: 0,artist,genre,song,lyrics,editedlyrics
0,0,Drake,Rap,God’s Plan,"['[Intro]', ""And they wishin' and wishin' and ...","['[Intro]', ""And they wishin' and wishin' and ..."
1,1,The Kings of Leon,Rock,Circle of Life (Broadway Version),"['[RAFIKI & ENSEMBLE]', 'Nants ingonyama bagit...","['[RAFIKI & ENSEMBLE]', 'Nants ingonyama bagit..."
2,2,Adele,Pop,Hello,"['[Verse 1]', ""Hello, it's me"", ""I was wonderi...","['[Verse 1]', ""Hello, it's me"", ""I was wonderi..."
3,3,Outkast,Hip-Hop/R&B,Hey Ya!,"['[Intro: André 3000]', '1, 2, 3, uh!', '', '[...","['[Intro: André 3000]', '1, 2, 3, uh!', '[Vers..."
4,4,Luke Bryan,Country,Most People Are Good,"['[Verse 1]', 'I believe kids oughta stay kids...","['[Verse 1]', 'I believe kids oughta stay kids..."
