In [104]:
# Import dependencies
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
from ipython_config import RDS_pwd 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import ComplementNB
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from sklearn.naive_bayes import GaussianNB
import tensorflow as tf
import pickle
import os
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.linear_model import LinearRegression 
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
import matplotlib.pyplot as plt
import joblib
import h5py
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
import statsmodels.formula.api as sm

In [2]:
# Connect to database (Note: The package psychopg2 is required for Postgres to work with SQLAlchemy)
# db_string = f"postgres://postgres:{RDS_pwd}@platinum-rds.cbu3an3ywyth.us-east-2.rds.amazonaws.com/Platinum_Lyrics"
# engine = create_engine(db_string)

In [3]:
#printing names of the tables present in the database
# print(engine.table_names())

In [4]:
# lyrics_features = engine.table_names()[0]
# lyrics_only = engine.table_names()[7]
# lyrics_only

In [5]:
# Read SQL database table into a DataFrame.
# lyrics_only_df = pd.read_sql_table(lyrics_only,engine)
# lyrics_only_df.head()

In [6]:
# Create dataframe
lyrics_only_df = pd.read_csv('../Resources/platinum_lyrics.csv', index_col='Unnamed: 0')
lyrics_only_df.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,target_success,target_weeks,target_peak,abov,accept,ach,...,yeah,year,yellow,yes,yesterday,yet,york,you,young,youth
0,TRAAAAV128F421A322,western addiction,a poor recipe for civic cohesion,2005,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
1,TRAAABD128F429CF47,the box tops,soul deep,1969,1,18,14,0,0,0,...,4,0,0,0,0,0,0,17,0,0
2,TRAAAGF12903CEC202,halvdan sivertsen,smã¥ ord,2005,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1984,1,29,95,0,0,0,...,0,0,0,0,0,0,0,14,0,0
4,TRAABEV12903CC53A4,suicide commando,blood in face,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Remove the prefix "word_":
# column_names = lyrics_only_df.columns.values
# new_column_names = {}
# for column in column_names:
#                 new_column_names[column] = column.replace('word_', '')
# lyrics_only_df = lyrics_only_df.rename(columns=new_column_names)

# PREPROCESSING

In [7]:
# Get the shape of lyrics_only_df 
lyrics_only_df.shape

(41784, 1562)

In [8]:
# Get list of columns
list(lyrics_only_df.columns)

['track_id',
 'artist_name',
 'song_title',
 'song_year',
 'target_success',
 'target_weeks',
 'target_peak',
 'abov',
 'accept',
 'ach',
 'across',
 'act',
 'action',
 'addict',
 'admit',
 'ador',
 'afraid',
 'age',
 'ago',
 'ahead',
 'aim',
 'air',
 'album',
 'algo',
 'aliv',
 'alla',
 'alma',
 'almost',
 'alon',
 'along',
 'alreadi',
 'alright',
 'although',
 'alway',
 'amaz',
 'america',
 'american',
 'amigo',
 'amo',
 'amor',
 'amour',
 'angel',
 'anger',
 'ani',
 'anim',
 'anoth',
 'answer',
 'ant',
 'anybodi',
 'anymor',
 'anyon',
 'anyth',
 'anyway',
 'anywher',
 'apart',
 'appear',
 'arm',
 'around',
 'arriv',
 'art',
 'ash',
 'ask',
 'asleep',
 'ass',
 'att',
 'attack',
 'aus',
 'aux',
 'awak',
 'away',
 'babe',
 'babi',
 'back',
 'bad',
 'bag',
 'ball',
 'band',
 'bang',
 'bank',
 'bar',
 'bare',
 'bass',
 'battl',
 'beach',
 'bear',
 'beast',
 'beat',
 'beauti',
 'becam',
 'becaus',
 'becom',
 'bed',
 'beer',
 'befor',
 'beg',
 'began',
 'begin',
 'begun',
 'behind',
 'bei'

In [9]:
# Get columns with null values
null_columns=lyrics_only_df.columns[lyrics_only_df.isnull().any()]
lyrics_only_df[null_columns].isnull().sum()

artist_name        1
song_title         7
cost           41784
oder           41784
dtype: int64

In [10]:
# Drop columns we don't need or have on ly null values
lyrics_only_df.drop(columns=["track_id","artist_name",
                                              "song_title","song_year","target_weeks",
                                              "target_peak","cost","oder"],axis=1,inplace=True)
lyrics_only_df.head()

Unnamed: 0,target_success,abov,accept,ach,across,act,action,addict,admit,ador,...,yeah,year,yellow,yes,yesterday,yet,york,you,young,youth
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
1,1,0,0,0,0,0,0,0,0,0,...,4,0,0,0,0,0,0,17,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,14,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Get data types 
for dtype in lyrics_only_df.dtypes: 
    print(dtype)

int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int64
int6

# FEATURE ENGINEERING

In [12]:
# Split data to test and train
X = lyrics_only_df.drop(columns=['target_success'],axis=1)
y = lyrics_only_df['target_success']

In [None]:
#Sequential backward selection(sbs)
# sbs = SFS(LinearRegression(), 
#           k_features=100, 
#           forward=False, 
#           floating=False,
#           cv=0)
# sbs.fit(X, y)
# sbs.k_feature_names_


In [13]:
# Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

In [14]:
# Resample 
X_resampled, y_resampled = SMOTE(random_state=1).fit_resample(
    X_train, y_train
)
Counter(y_resampled)

Counter({0: 24226, 1: 24226})

In [25]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler= scaler.fit(X_resampled)

# Scale the data
X_train_scaled = X_scaler.transform(X_resampled)
X_test_scaled = X_scaler.transform(X_test)

In [16]:
# Initialize PCA model
# pca = PCA(n_components=3)
# X_pca = pca.fit_transform(X_train_scaled)
# X_pca_df = pd.DataFrame(data=X_pca,columns=["principal component 1", "principal component 2", "principal component 3"])
# X_pca_df.head()

Unnamed: 0,principal component 1,principal component 2,principal component 3
0,-0.087278,-0.002632,-0.057129
1,-0.001409,0.034123,-0.001244
2,-0.001788,-0.049974,-0.135883
3,-0.065582,-0.034209,0.015271
4,-0.124586,0.005507,-0.000431


In [37]:
#Fetch the explained variance
# pca.explained_variance_ratio_

In [73]:
# Selecting features using Lasso regularisation using SelectFromModel
sel_ = SelectFromModel(LogisticRegression(C=1,fit_intercept=False,penalty='l2'))
sel_.fit(scaler.transform(X_resampled), y_resampled)

SelectFromModel(estimator=LogisticRegression(C=1, fit_intercept=False))

In [74]:
# Visualising features that were kept by the lasso regularisation
sel_.get_support()

array([False, False,  True, ...,  True, False, False])

In [75]:
# Make a list of with the selected features
selected_feat = X_resampled.columns[(sel_.get_support())]
print('total features: {}'.format((X_resampled.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

total features: 1553
selected features: 608
features with coefficients shrank to zero: 0


In [76]:
# Number of features which coefficient was shrank to zero 
np.sum(sel_.estimator_.coef_ == 0)

0

In [77]:
# Identifying the removed features
removed_feats = X_resampled.columns[(sel_.estimator_.coef_ == 0).ravel().tolist()]
removed_feats

Index([], dtype='object')

In [78]:
X_train_selected = sel_.transform(X_resampled.fillna(0))
X_test_selected = sel_.transform(X_test.fillna(0))
X_train_selected.shape, X_test_selected.shape

((48452, 608), (10446, 608))

In [100]:
model = LinearRegression()
model.fit(X_resampled,y_resampled)
print(model.score(X_test,y_test))

-0.32523440950456917


In [103]:
X_resampled_new = np.append (arr=np.ones([X_resampled.shape[0],1]).astype(int), values = X_resampled, axis = 1)

In [105]:
X_opt = [0,1,2,3,4,5,6]
regressor = sm.OLS(y_resampeld, X_resampled_new[:,X_opt]).fit()
print(regressor.summary())

AttributeError: module 'statsmodels.formula.api' has no attribute 'OLS'

# Model 1 -  ComplementNB Naive Bayes

In [86]:
#Create classifier  
compNB_model = ComplementNB()

In [88]:
# Train the classifier 
compNB_model.fit(X_train_selected, y_resampled)

ComplementNB()

In [90]:
# Get predictions
y_pred = compNB_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,0,0
2,1,1
3,0,0
4,0,0
5,0,0
6,1,1
7,0,0
8,0,1
9,0,0


In [91]:
# Get model's accuracy
print(accuracy_score(y_test, y_pred))

0.6701129618992916


# Model 2 -  GaussianNB Naive Bayes

In [92]:
# Create classifier
GNB_model = GaussianNB()

In [93]:
# Train the classifier 
GNB_model.fit(X_train_selected, y_resampled)

GaussianNB()

In [94]:
# Get predictions
y_pred = GNB_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,0,0
2,1,1
3,0,0
4,0,0
5,1,0
6,1,1
7,1,0
8,1,1
9,1,0


In [95]:
# Get model's accuracy
print(accuracy_score(y_test, y_pred))

0.46678154317442083


# Model 3 - Deep Neural Net model

In [82]:
# Define the model - deep neural net
number_input_features = len(X_train_selected[0])
hidden_nodes_layer1 = 10
# hidden_nodes_layer2 = 5


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1,input_dim= number_input_features, activation="relu"))

# # Second hidden layer 
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1,activation="sigmoid"))

# Restore the model weights
#nn.load_weights("checkpoints/weights.100.hdf5")

# Check the structure of the model 
nn.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 10)                6090      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 11        
Total params: 6,101
Trainable params: 6,101
Non-trainable params: 0
_________________________________________________________________


In [83]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [84]:
# Change y_train and y_test type for tf
y_train = np.array(y_resampled)
y_test = np.array(y_test)

In [85]:
# Train the model
fit_model = nn.fit(X_train_selected,y_train,epochs=50)

Train on 48452 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [22]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

10446/1 - 1s - loss: 0.8529 - accuracy: 0.7733
Loss: 0.8998321496205565, Accuracy: 0.7733103632926941


In [23]:
model_loss, model_accuracy = nn.evaluate(X_train_scaled,y_train,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

48452/1 - 2s - loss: 0.1149 - accuracy: 0.9637
Loss: 0.12314075665374835, Accuracy: 0.9636753797531128


# Save the model

In [None]:
# Save the trained model as a pickle string. 
saved_NB_model = pickle.dumps(compNB_model) 
#saved_ANN_model = pickle.dumps(nn) 

In [None]:
# Save the NB model as a pickle in a file 
joblib.dump(saved_NB_model, 'NB_model_v1.1.pkl') 

In [None]:
# Save the nn model as an h5 file
nn.save("nn_model_v1.1.h5")