In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
from ipython_config import RDS_pwd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.naive_bayes import ComplementNB,GaussianNB
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LogisticRegression, LinearRegression 
from sklearn.feature_selection import SelectFromModel 
from sklearn.ensemble import RandomForestClassifier 
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from collections import Counter
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
import pickle
import os
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
import joblib
import h5py
import statsmodels.formula.api as sm

In [2]:
# Connect to database (Note: The package psychopg2 is required for Postgres to work with SQLAlchemy)
db_string = f"postgres://postgres:{RDS_pwd}@platinum-rds.cbu3an3ywyth.us-east-2.rds.amazonaws.com/Platinum_Lyrics"
engine = create_engine(db_string)

In [3]:
#printing names of the tables present in the database
print(engine.table_names())

['pivot', 'spotify', 'bb_id', 'bb_no_id', 'million_songs', 'pivot_songs', 'pivot_songs_bb', 'platinum_lyrics', 'platinum_features', 'platinum_lyrics_features']


In [4]:
# lyrics_features = engine.table_names()[0]
lyrics_only = engine.table_names()[7]
lyrics_only

'platinum_lyrics'

In [5]:
# Read SQL database table into a DataFrame.
lyrics_only_df = pd.read_sql_table(lyrics_only,engine)
lyrics_only_df.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,target_success,target_weeks,target_peak,word_abov,word_accept,word_ach,...,word_yeah,word_year,word_yellow,word_yes,word_yesterday,word_yet,word_york,word_young,word_yourself,word_youth
0,TRSYKBX128F427F0F1,wilson phillips,get together,2004,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TRSYKCU128F4277A59,stryper,always there for you,1988,1,71,8,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,TRSYKHG12903CEDBD6,sam cooke,crazy she calls me,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TRSYKNK128F933B674,up up down down left right left right b a start,i know you'll find out that i'm a geek,2003,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,TRSYLQO128F931B4A3,alison krauss,crazy faith,2001,0,0,0,1,0,0,...,0,0,0,0,0,3,0,0,0,0


In [6]:
# Remove the prefix "word_":
column_names = lyrics_only_df.columns.values
new_column_names = {}
for column in column_names:
                new_column_names[column] = column.replace('word_', '')
lyrics_only_df = lyrics_only_df.rename(columns=new_column_names)

# PREPROCESSING

In [7]:
# Get the shape of lyrics_only_df 
lyrics_only_df.shape

(39094, 1554)

In [8]:
# Get list of columns
list(lyrics_only_df.columns)

['track_id',
 'artist_name',
 'song_title',
 'song_year',
 'target_success',
 'target_weeks',
 'target_peak',
 'abov',
 'accept',
 'ach',
 'across',
 'act',
 'action',
 'addict',
 'admit',
 'ador',
 'afraid',
 'against',
 'age',
 'ago',
 'ahead',
 'aim',
 'air',
 'album',
 'algo',
 'aliv',
 'alla',
 'alma',
 'almost',
 'alon',
 'along',
 'alreadi',
 'alright',
 'although',
 'alway',
 'amaz',
 'america',
 'american',
 'amigo',
 'amo',
 'amor',
 'amour',
 'angel',
 'anger',
 'ani',
 'anim',
 'anoth',
 'answer',
 'ant',
 'anybodi',
 'anymor',
 'anyon',
 'anyth',
 'anyway',
 'anywher',
 'apart',
 'appear',
 'arm',
 'around',
 'arriv',
 'art',
 'ash',
 'ask',
 'asleep',
 'ass',
 'att',
 'attack',
 'aus',
 'aux',
 'awak',
 'away',
 'babe',
 'babi',
 'back',
 'bad',
 'bag',
 'ball',
 'band',
 'bang',
 'bank',
 'bar',
 'bare',
 'bass',
 'battl',
 'beach',
 'bear',
 'beast',
 'beat',
 'beauti',
 'becam',
 'becaus',
 'becom',
 'bed',
 'beer',
 'befor',
 'beg',
 'began',
 'begin',
 'begun',
 'beh

In [9]:
# Get columns with null values
null_columns=lyrics_only_df.columns[lyrics_only_df.isnull().any()]
lyrics_only_df[null_columns].isnull().sum()

Series([], dtype: float64)

In [10]:
# Drop columns we don't need 
lyrics_only_df.drop(columns=["track_id","artist_name",
                                              "song_title","song_year","target_weeks",
                                              "target_peak"],axis=1,inplace=True)
lyrics_only_df.head()


Unnamed: 0,target_success,abov,accept,ach,across,act,action,addict,admit,ador,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0


In [11]:
# Remove some noisy features found through previous runs of the model
lyrics_only_df.drop(columns=["que","the","ooh","con","tri","una",
                            "por","noth","mai","whi","como","qui",
                            "qui", "das", "doe", "der", "des", "dan",
                             "che", "mani", "vida", "mit", "pas", "per",
                             "cos", "dri", "mir", "nos", "dir", "poi", "voi",
                             "och", "ver", "mari", "har", "doo", "ima"],
                    axis=1,inplace=True)
lyrics_only_df.head()

Unnamed: 0,target_success,abov,accept,ach,across,act,action,addict,admit,ador,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,0,0,0,0


In [12]:
# Get data types 
for dtype in lyrics_only_df.dtypes: 
    print(dtype)

int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int6

# FEATURE ENGINEERING

In [13]:
# Split data to test and train
X = lyrics_only_df.drop(columns=['target_success'],axis=1)
y = lyrics_only_df['target_success']

METHOD 1 - Sequential Backward Selection

In [14]:
#Sequential backward selection(sbs)
# sbs = SFS(LinearRegression(), 
#           k_features=100, 
#           forward=False, 
#           floating=False,
#           cv=0)
# sbs.fit(X, y)
# sbs.k_feature_names_


In [15]:
# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

In [16]:
# Resample 
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 24225, 1: 24225})

In [17]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler= scaler.fit(X_resampled)

# Scale the data
X_train_scaled = X_scaler.transform(X_resampled)
X_test_scaled = X_scaler.transform(X_test)

METHOD 2 - PCA

In [18]:
# Initialize PCA model
# pca = PCA(n_components=3)
# X_pca = pca.fit_transform(X_train_scaled)
# X_pca_df = pd.DataFrame(data=X_pca,columns=["principal component 1", "principal component 2", "principal component 3"])
# X_pca_df.head()

In [19]:
#Fetch the explained variance
# pca.explained_variance_ratio_

METHOD 3 - Lasso Regularization 

In [20]:
# Selecting features using Lasso regularization using SelectFromModel
sel_ = SelectFromModel(LogisticRegression(C=1,fit_intercept=False,penalty='l2'))
sel_.fit(scaler.transform(X_resampled), y_resampled)

SelectFromModel(estimator=LogisticRegression(C=1, fit_intercept=False))

In [21]:
# Visualising features that were kept by the lasso regularisation
sel_.get_support()

array([ True, False,  True, ..., False, False, False])

In [22]:
# Make a list of with the selected features
selected_feat = X_resampled.columns[(sel_.get_support())]
print('total features: {}'.format((X_resampled.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 1511
selected features: 575
features with coefficients shrank to zero: 0


In [23]:
# Number of features which coefficient was shrank to zero 
np.sum(sel_.estimator_.coef_ == 0)

0

In [24]:
# Identifying the removed features
removed_feats = X_resampled.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index([], dtype='object')

In [25]:
# create new X test and train with 579 features
X_train_selected = sel_.transform(X_resampled.fillna(0))
X_test_selected = sel_.transform(X_test.fillna(0))
X_train_selected.shape, X_test_selected.shape

((48450, 575), (9774, 575))

# Model 1 -  ComplementNB Naive Bayes (with Lasso)

In [26]:
#Create classifier  
compNB_model = ComplementNB()

In [27]:
# Train the classifier 
compNB_model.fit(X_train_selected, y_resampled)

ComplementNB()

In [28]:
# Get predictions
y_pred = compNB_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,0
2,0,1
3,1,1
4,0,0
5,0,0
6,1,0
7,0,1
8,0,0
9,1,0


In [29]:
# Get model's accuracy
print(accuracy_score(y_test, y_pred))

0.7230407202782894


# Model 1 -  ComplementNB Naive Bayes (with Lasso)

In [31]:
# Create classifier
GNB_model = GaussianNB()

In [32]:
# Train the classifier 
GNB_model.fit(X_train_selected, y_resampled)

GaussianNB()

In [33]:
# Get predictions
y_pred = GNB_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,1,0
2,1,1
3,1,1
4,0,0
5,1,0
6,1,0
7,1,1
8,1,0
9,1,0


In [34]:
# Get model's accuracy
print(accuracy_score(y_test, y_pred))

0.4827092285655822


# Model 3 - Deep Neural Net model (with Lasso)

In [35]:
# Define the model - deep neural net
number_input_features = len(X_train_selected[0])
hidden_nodes_layer1 = 10
# hidden_nodes_layer2 = 5


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,input_dim= number_input_features, activation="relu"))

# # Second hidden layer 
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))

# Restore the model weights
#nn.load_weights("checkpoints/weights.100.hdf5")

# Check the structure of the model 
nn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 10)                5760      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 5,771
Trainable params: 5,771
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [37]:
# Change y_train and y_test type for tf
y_resampled_np = np.array(y_resampled)
y_test_np = np.array(y_test)

In [38]:
# Train the model
fit_model = nn.fit(X_train_selected,y_resampled_np,epochs=20)

Train on 48450 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [39]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_selected,y_test_np,verbose=2)
print(f"Test Loss: {model_loss}, Test Accuracy: {model_accuracy}")

9774/1 - 0s - loss: 0.8204 - accuracy: 0.7174
Test Loss: 0.7594000687012771, Test Accuracy: 0.7174135446548462


In [40]:
model_loss, model_accuracy = nn.evaluate(X_train_selected,y_resampled_np,verbose=2)
print(f"Train Loss: {model_loss}, Train Accuracy: {model_accuracy}")

48450/1 - 1s - loss: 0.2035 - accuracy: 0.8773
Train Loss: 0.2945248732687396, Train Accuracy: 0.8773374557495117


# Model 4 - Decision Tree (With Lasso)

In [76]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=50,random_state=78)

In [77]:
# Fitting the model
rf_model.fit(X_train_selected, y_resampled)

RandomForestClassifier(min_samples_split=6, n_estimators=50, random_state=78)

In [78]:
# Get predictions
y_pred = rf_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,1,1
4,0,0
5,0,0
6,0,0
7,0,1
8,0,0
9,0,0


In [79]:
# Get model's accuracy
print(accuracy_score(y_test, y_pred))

0.7487210967873952


In [80]:
# Calculate feature importance in the Random Forest model
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.03815423782969685, 'death'),
 (0.01789634712949616, 'although'),
 (0.011953217418634305, 'grown'),
 (0.01149828186070921, 'cruel'),
 (0.010037310983029086, 'blue'),
 (0.00889814516689406, 'darl'),
 (0.008111547316714075, 'dancin'),
 (0.007980363504007616, 'chanc'),
 (0.007881061315661157, 'free'),
 (0.00783962932346706, 'behind'),
 (0.007346195219062903, 'bis'),
 (0.007296954141739077, 'chest'),
 (0.007295279259238205, 'forgotten'),
 (0.007245452309002576, 'crew'),
 (0.007164002438133824, 'charm'),
 (0.006922760694677213, 'fan'),
 (0.0068574599969180635, 'demand'),
 (0.0068404348435010184, 'clean'),
 (0.006839670083235926, 'grew'),
 (0.006825963407915477, 'danc'),
 (0.006683592455520301, 'alway'),
 (0.006597950580477874, 'dove'),
 (0.006155822810144347, 'foi'),
 (0.0060117446022470815, 'break'),
 (0.0058897078463818866, 'away'),
 (0.0058759299722737236, 'birth'),
 (0.005860139349346768, 'cheap'),
 (0.005790210672582576, 'bend'),
 (0.00568764316958391, 'climb'),
 (0.0056467045357059

# Save the model

In [None]:
# Save the NB model as a pickle in a file 
joblib.dump(compNB_model, 'NB_model_v2.pkl') 

In [None]:
# Save the nn model as an h5 file
nn.save("nn_model_v2.h5")

In [None]:
# Save the NB model as a pickle in a file 
joblib.dump(rf_model, 'rf_model_joblib_v1.pkl') 

In [None]:
# Save the NB model as a pickle in a file 
pickle.dump(rf_model, open('rf_model_pickle_v1.pkl', 'wb'))