In [1]:
# importing modules
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, VotingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction import text
from sklearn.compose import make_column_transformer, make_column_selector
import re
import nltk
import eli5

# raise pandas' row-display limit to 500 so long outputs are not truncated in JupyterLab
pd.set_option('display.max_rows', 500)

In [2]:
# load the cleaned song-lyrics dataset into a DataFrame
# NOTE(review): head() shows 'Unnamed: 0' and 'Unnamed: 0.1' columns — these look like
# saved row indexes from earlier to_csv calls; confirm and consider dropping them
lyric_df = pd.read_csv('../data/lyrics_final.csv')

In [3]:
# encode each decade as an integer class label for the classifiers.
# sort=True assigns ids in ascending decade order (1940 -> 0, ..., 2010 -> 7) no matter
# how the rows of the CSV are ordered; without it, ids follow order of first appearance,
# and the eli5 target_names mapping used later (0: '1940s' ... 7: '2010s') could mislabel classes.
lyric_df['category_id'] = lyric_df['decade'].factorize(sort=True)[0]
lyric_df.head(2)

Unnamed: 0.1,Unnamed: 0,song_rank,artist_name,song_title,year,lyrics,decade,word_count,characters,word_length,clean_lyrics,profanity,suggestive,lyric_valence,is_positive,is_negative,is_neutral,category_id
0,0,1,Glenn Miller,Chattanooga Choo Choo,1941,"pardon me, boy is that the chattanooga choo ch...",1940,151,806,5.3,pardon boy chattanooga choo choo track twenty ...,False,0,0.8979,1,0,0,0
1,1,2,Sammy Kaye,Daddy,1941,ladada ladada da dada ladada ladada da dada la...,1940,201,1021,5.1,ladada ladada da dada ladada ladada da dada la...,True,1,0.9957,1,0,0,0


In [13]:
# feature matrix: the cleaned lyric text plus three numeric/flag columns
X = lyric_df[['clean_lyrics', 'word_count', 'is_negative','suggestive']]

In [14]:
# sanity check: (rows, feature columns)
X.shape

(5369, 4)

In [15]:
# target: the integer-encoded decade of each song
y = lyric_df['category_id']

In [16]:
# sanity check: should have the same number of rows as X
y.shape

(5369,)

In [17]:
# reproducible split (random_state is fixed); stratify=y keeps the decade proportions
# the same in the train and test sets. No test_size is passed, so scikit-learn's default applies.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=48, stratify = y)

## Baseline Model

Baseline accuracy is 17.79%, which is what a model would score by always predicting the most common decade (the 1980s).

In [18]:
# share of songs per decade; the largest share is the majority-class baseline accuracy
lyric_df['decade'].value_counts(normalize = True)

1980    0.177873
1970    0.172099
1960    0.167443
1990    0.156081
2000    0.145278
1950    0.085305
2010    0.061836
1940    0.034085
Name: decade, dtype: float64

## Attempt 1 - Naive Bayes

In [19]:
# Naive Bayes pipeline: term counts from the lyrics (vocabulary capped at 1000 terms,
# English stop words removed); the other feature columns pass through unchanged
ct1 = make_column_transformer(
    (CountVectorizer(max_features=1000, stop_words='english'), 'clean_lyrics'),
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# with_mean=False rescales each feature without centering it
bayes_pipe = make_pipeline(
    ct1,
    StandardScaler(with_mean=False),
    MultinomialNB(),
)

# the only hyperparameter searched is the Naive Bayes smoothing parameter alpha
params = dict(multinomialnb__alpha=[.25, .5, .75, 1, 1.25])

# cross-validated grid search on the training split, using all available cores
bayes_grid = GridSearchCV(bayes_pipe, param_grid=params, n_jobs=-1)
bayes_grid.fit(X_train, y_train)

In [22]:
# predict decade ids for the held-out test set with the best Naive Bayes pipeline
# NOTE(review): `preds` is reassigned by each later model's prediction cell
preds = bayes_grid.predict(X_test)

In [23]:
# accuracy of the best Naive Bayes pipeline on the held-out test set
bayes_grid.score(X_test, y_test)

0.3425167535368578

In [24]:
# accuracy on the training set; compare with the test score to gauge overfitting
bayes_grid.score(X_train, y_train)

0.583705911574764

In [25]:
# hyperparameters selected by the cross-validated search
# NOTE(review): the selected alpha (0.25) is the smallest value in the grid — consider
# extending the grid downward to check the optimum is not outside it
bayes_grid.best_params_

{'multinomialnb__alpha': 0.25}

## Attempt 2 - Logistic Regression

In [26]:
# Logistic regression pipeline: term counts from the lyrics (vocabulary capped at 1000 terms,
# English stop words removed); the other feature columns pass through unchanged
ct1 = make_column_transformer(
    (CountVectorizer(max_features=1000, stop_words='english'), 'clean_lyrics'),
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# with_mean=False rescales each feature without centering it
logreg_pipe = make_pipeline(
    ct1,
    StandardScaler(with_mean=False),
    LogisticRegression(random_state=48),
)

# search over the inverse regularisation strength C; max_iter is pinned to one large value
# (presumably to give the solver room to converge)
params = {
    'logisticregression__C': [.25, .5, .75, 1],
    'logisticregression__max_iter': [20_000],
}

# cross-validated grid search on the training split, using all available cores
logreg_grid = GridSearchCV(logreg_pipe, param_grid=params, n_jobs=-1)
logreg_grid.fit(X_train, y_train)

In [29]:
# creating predictions for the held-out test data (not the training data)
preds = logreg_grid.predict(X_test)

In [30]:
# accuracy of the best logistic regression pipeline on the held-out test set
logreg_grid.score(X_test, y_test)

0.31347728965003724

In [31]:
# accuracy on the training set; a large gap to the test score indicates overfitting
logreg_grid.score(X_train, y_train)

0.9083457526080477

In [32]:
# hyperparameters selected by the cross-validated search
# NOTE(review): the selected C (0.25) is the smallest value in the grid — consider
# trying stronger regularisation (smaller C) as well
logreg_grid.best_params_

{'logisticregression__C': 0.25, 'logisticregression__max_iter': 20000}

### Logistic Regression Coefficients

In [33]:
# show the 15 largest-magnitude coefficients per class of the fitted logistic regression.
# feature names come from the fitted column transformer; target_names maps category_id
# to a decade label and assumes ids 0-7 run chronologically from the 1940s — confirm
weights = eli5.show_weights(
    estimator=logreg_grid.best_estimator_.named_steps['logisticregression'],
    feature_names=logreg_grid.best_estimator_.named_steps['columntransformer'].get_feature_names_out(),
    target_names={
        0: '1940s',
        1: '1950s',
        2: '1960s',
        3: '1970s',
        4: '1980s',
        5: '1990s',
        6: '2000s',
        7: '2010s',
    },
    top=15,
)

# put an explicit black text colour in front of every background-color rule in the rendered HTML
weights.data = weights.data.replace('background-color:', 'color:black; background-color:')
weights

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7
+3.491,<BIAS>,,,,,,
+0.501,dead,,,,,,
+0.462,soon,,,,,,
+0.443,trip,,,,,,
+0.428,park,,,,,,
+0.425,fear,,,,,,
… 342 more positive …,… 342 more positive …,,,,,,
… 647 more negative …,… 647 more negative …,,,,,,
-0.425,wind,,,,,,
-0.428,john,,,,,,

Weight?,Feature
+3.491,<BIAS>
+0.501,dead
+0.462,soon
+0.443,trip
+0.428,park
+0.425,fear
… 342 more positive …,… 342 more positive …
… 647 more negative …,… 647 more negative …
-0.425,wind
-0.428,john

Weight?,Feature
+3.861,<BIAS>
+0.561,tryna
+0.487,honey
+0.486,doll
… 454 more positive …,… 454 more positive …
… 535 more negative …,… 535 more negative …
-0.490,voice
-0.491,loved
-0.495,is_negative
-0.506,key

Weight?,Feature
+3.798,<BIAS>
+0.549,burn
+0.510,lil
… 501 more positive …,… 501 more positive …
… 488 more negative …,… 488 more negative …
-0.517,bar
-0.539,roll
-0.540,deny
-0.549,state
-0.583,chill

Weight?,Feature
+1.871,<BIAS>
+0.496,god
+0.469,love
… 566 more positive …,… 566 more positive …
… 423 more negative …,… 423 more negative …
-0.462,heart
-0.466,raise
-0.477,giving
-0.500,tryna
-0.509,red

Weight?,Feature
+0.544,smell
+0.496,tonight
+0.487,control
+0.472,ooh
+0.458,word_count
+0.453,hee
… 555 more positive …,… 555 more positive …
… 434 more negative …,… 434 more negative …
-0.452,lord
-0.461,hey

Weight?,Feature
+1.112,word_count
+0.879,yo
+0.575,thang
+0.553,pump
+0.511,ow
+0.467,twist
+0.449,passion
… 561 more positive …,… 561 more positive …
… 428 more negative …,… 428 more negative …
-0.429,chick

Weight?,Feature
+1.510,word_count
+0.621,club
+0.593,bag
+0.544,lil
+0.544,crazy
… 542 more positive …,… 542 more positive …
… 447 more negative …,… 447 more negative …
-0.536,dead
-0.539,dawn
-0.554,tender

Weight?,Feature
+0.868,fuckin
+0.724,word_count
+0.537,tryna
+0.532,til
+0.501,oh
+0.491,ma
+0.482,close
+0.478,drunk
+0.466,leaving
+0.460,sexy


## Attempt 3 - Random Forests

In [108]:
# Random forest pipeline: term counts from the lyrics (vocabulary capped at 750 terms,
# English stop words removed); the other feature columns pass through unchanged
ct1 = make_column_transformer(
    (CountVectorizer(max_features=750, stop_words='english'), 'clean_lyrics'),
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# with_mean=False rescales each feature without centering it
forest_pipe = make_pipeline(
    ct1,
    StandardScaler(with_mean=False),
    RandomForestClassifier(random_state=48),
)

# empty grid: no hyperparameters are tuned, so the search only cross-validates
# the forest with its default settings
params = dict()

# fit on the training split, using all available cores
forest_grid = GridSearchCV(forest_pipe, param_grid=params, n_jobs=-1)
forest_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('countvectorizer',
                                                                         CountVectorizer(max_features=750,
                                                                                         stop_words='english'),
                                                                         'clean_lyrics')],
                                                          verbose_feature_names_out=False)),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(random_state=48))]),
             n_jobs=-1, param_grid={})

In [119]:
# list every parameter exposed by the search object (handy for finding valid param_grid keys)
# NOTE(review): exploratory cell whose execution count is out of sequence with its
# neighbours — consider removing it before sharing the notebook
forest_grid.get_params()

{'cv': None,
 'error_score': nan,
 'estimator__memory': None,
 'estimator__steps': [('columntransformer',
   ColumnTransformer(remainder='passthrough',
                     transformers=[('countvectorizer',
                                    CountVectorizer(max_features=750,
                                                    stop_words='english'),
                                    'clean_lyrics')],
                     verbose_feature_names_out=False)),
  ('standardscaler', StandardScaler(with_mean=False)),
  ('randomforestclassifier', RandomForestClassifier(random_state=48))],
 'estimator__verbose': False,
 'estimator__columntransformer': ColumnTransformer(remainder='passthrough',
                   transformers=[('countvectorizer',
                                  CountVectorizer(max_features=750,
                                                  stop_words='english'),
                                  'clean_lyrics')],
                   verbose_feature_names_out=False),
 'esti

In [109]:
# creating predictions for the held-out test data (not the training data)
preds = forest_grid.predict(X_test)

# accuracy of the random forest pipeline on the held-out test set
forest_grid.score(X_test, y_test)

In [111]:
# accuracy on the training set; a score near 1.0 here means the forest has
# essentially memorised the training data
forest_grid.score(X_train, y_train)

0.9995032290114257

In [112]:
# pulling best parameters from gridsearch (empty here because the grid itself was empty)
forest_grid.best_params_

{}

In [113]:
# the refit pipeline that the search uses for predict/score
forest_grid.best_estimator_

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('countvectorizer',
                                                  CountVectorizer(max_features=750,
                                                                  stop_words='english'),
                                                  'clean_lyrics')],
                                   verbose_feature_names_out=False)),
                ('standardscaler', StandardScaler(with_mean=False)),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=48))])

### Random Forest Feature Importances

In [114]:
# show the top 25 features of the fitted random forest, named via the fitted column transformer.
# NOTE(review): the displayed table is a single feature/weight list rather than one per
# decade, so target_names may have no visible effect here — confirm
weights = eli5.show_weights(
    estimator=forest_grid.best_estimator_.named_steps['randomforestclassifier'],
    feature_names=forest_grid.best_estimator_.named_steps['columntransformer'].get_feature_names_out(),
    target_names={
        0: '1940s',
        1: '1950s',
        2: '1960s',
        3: '1970s',
        4: '1980s',
        5: '1990s',
        6: '2000s',
        7: '2010s',
    },
    top=25,
)

# put an explicit black text colour in front of every background-color rule in the rendered HTML
weights.data = weights.data.replace('background-color:', 'color:black; background-color:')
weights

Weight,Feature
0.0422  ± 0.0268,word_count
0.0120  ± 0.0069,know
0.0115  ± 0.0056,love
0.0102  ± 0.0078,like
0.0089  ± 0.0051,oh
0.0086  ± 0.0055,yeah
0.0083  ± 0.0049,time
0.0080  ± 0.0064,cause
0.0078  ± 0.0053,baby
0.0078  ± 0.0098,wanna


## Attempt 4 - Gradient Boosting Classifier

In [120]:
# Gradient boosting pipeline: term counts from the lyrics (vocabulary capped at 750 terms,
# English stop words removed); the other feature columns pass through unchanged
ct1 = make_column_transformer(
    (CountVectorizer(max_features=750, stop_words='english'), 'clean_lyrics'),
    verbose_feature_names_out=False,
    remainder='passthrough',
)

# with_mean=False rescales each feature without centering it
boost_pipe = make_pipeline(
    ct1,
    StandardScaler(with_mean=False),
    GradientBoostingClassifier(random_state=48),
)

# empty grid: no hyperparameters are tuned, so the search only cross-validates
# the booster with its default settings
params = dict()

# fit on the training split, using all available cores
boost_grid = GridSearchCV(boost_pipe, param_grid=params, n_jobs=-1)
boost_grid.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('columntransformer',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('countvectorizer',
                                                                         CountVectorizer(max_features=750,
                                                                                         stop_words='english'),
                                                                         'clean_lyrics')],
                                                          verbose_feature_names_out=False)),
                                       ('standardscaler',
                                        StandardScaler(with_mean=False)),
                                       ('gradientboostingclassifier',
                                        GradientBoostingClassifier(random_state=48))]),
             n_jobs=-1, param_grid={})

In [122]:
# creating predictions for the held-out test data (not the training data)
preds = boost_grid.predict(X_test)

# accuracy of the gradient boosting pipeline on the held-out test set
boost_grid.score(X_test, y_test)

0.37453462397617276

In [123]:
# accuracy on the training set; compare with the test score to gauge overfitting
boost_grid.score(X_train, y_train)

0.7560854446100348