In [None]:
# Import dependencies
import os
import pickle
from collections import Counter

import h5py
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import statsmodels.formula.api as sm
import tensorflow as tf
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTE
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso, LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB,GaussianNB
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.svm import SVC
from sqlalchemy import create_engine
from sqlalchemy import inspect
from tensorflow.keras.callbacks import ModelCheckpoint

from ipython_config import RDS_pwd

In [None]:
# Connect to the database (note: the psycopg2 driver package is required for
# SQLAlchemy to talk to Postgres).
# The URL scheme must be "postgresql://": SQLAlchemy 1.4+ no longer accepts the
# legacy "postgres://" alias, while "postgresql://" also works on older releases.
db_string = f"postgresql://postgres:{RDS_pwd}@platinum-rds.cbu3an3ywyth.us-east-2.rds.amazonaws.com/Platinum_Lyrics"
engine = create_engine(db_string)

In [None]:
# Print the names of the tables present in the database.
# Engine.table_names() is deprecated since SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported way to list tables.
print(inspect(engine).get_table_names())

In [None]:
# Pick the table to load by its position in the table listing.
# Uses the Inspector API because Engine.table_names() is deprecated since
# SQLAlchemy 1.4 and removed in 2.0.
# NOTE(review): index 7 relies on the order in which the database reports its
# tables -- consider replacing it with the literal table name once confirmed.
# lyrics_features = inspect(engine).get_table_names()[0]
lyrics_only = inspect(engine).get_table_names()[7]
lyrics_only

In [None]:
# Load the chosen database table into a DataFrame and preview its first rows.
lyrics_only_df = pd.read_sql_table(table_name=lyrics_only, con=engine)
lyrics_only_df.head()

In [None]:
# Remove the leading "word_" prefix from column names.
# Only a true prefix is stripped: str.replace('word_', '') would also delete
# "word_" wherever it appears inside a name.
prefix = 'word_'
column_names = lyrics_only_df.columns.values
new_column_names = {
    column: column[len(prefix):] if column.startswith(prefix) else column
    for column in column_names
}
lyrics_only_df = lyrics_only_df.rename(columns=new_column_names)

# PREPROCESSING

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple
lyrics_only_df.shape

In [None]:
# List every column name currently in the DataFrame
lyrics_only_df.columns.tolist()

In [None]:
# Count missing values, restricted to the columns that contain at least one null
null_mask = lyrics_only_df.isnull()
null_columns = lyrics_only_df.columns[null_mask.any()]
null_mask[null_columns].sum()

In [None]:
# Drop columns we don't need.
# The result is re-assigned (rather than dropped in place) and missing labels
# are ignored, so re-running this cell does not raise KeyError on columns that
# were already removed. `axis=1` is omitted because `columns=` already implies it.
lyrics_only_df = lyrics_only_df.drop(
    columns=["track_id", "artist_name",
             "song_title", "song_year", "target_weeks",
             "target_peak"],
    errors="ignore",
)
lyrics_only_df.head()


In [None]:
# Remove some noisy features found through previous runs of the model.
# Each label is listed once ("qui" used to appear twice), and the drop is
# re-assigned with errors="ignore" so the cell can be re-run without a KeyError
# once the columns are gone.
lyrics_only_df = lyrics_only_df.drop(
    columns=["que", "the", "ooh", "con", "tri", "una",
             "por", "noth", "mai", "whi", "como", "qui",
             "das", "doe", "der", "des", "dan",
             "che", "mani", "vida", "mit", "pas", "per",
             "cos", "dri", "mir", "nos", "dir", "poi", "voi",
             "och", "ver", "mari", "har", "doo", "ima"],
    errors="ignore",
)
lyrics_only_df.head()

In [None]:
# Print the dtype of every column, one per line
for _, column_dtype in lyrics_only_df.dtypes.items():
    print(column_dtype)

# FEATURE ENGINEERING

In [None]:
# Separate the label column (y) from the remaining feature columns (X)
y = lyrics_only_df['target_success']
X = lyrics_only_df.drop(columns='target_success')

METHOD 1 - Sequential Backward Selection

In [None]:
#Sequential backward selection(sbs)
# sbs = SFS(LinearRegression(), 
#           k_features=100, 
#           forward=False, 
#           floating=False,
#           cv=0)
# sbs.fit(X, y)
# sbs.k_feature_names_


In [None]:
# Hold out a test set, stratified on the label so both splits keep the same
# class proportions; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [None]:
# Resample the training split with SMOTE, then show the resulting class counts
smote = SMOTE(random_state=1)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
Counter(y_resampled)

In [None]:
# Fit a StandardScaler on the resampled training features. fit() returns the
# scaler itself, so `scaler` and `X_scaler` refer to the same fitted object.
scaler = StandardScaler()
X_scaler = scaler.fit(X_resampled)

# Apply the training-set scaling to the training and test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_resampled, X_test)
)

METHOD 2 - PCA

In [None]:
# Initialize PCA model
# pca = PCA(n_components=3)
# X_pca = pca.fit_transform(X_train_scaled)
# X_pca_df = pd.DataFrame(data=X_pca,columns=["principal component 1", "principal component 2", "principal component 3"])
# X_pca_df.head()

In [None]:
#Fetch the explained variance
# pca.explained_variance_ratio_

METHOD 3 - Lasso Regularization 

In [None]:
# Select features with an L1-penalised (lasso) logistic regression wrapped in
# SelectFromModel.
# penalty='l1' is what drives uninformative coefficients to exactly zero; an L2
# penalty only shrinks them, so the "coefficients shrunk to zero" checks that
# follow would always report none. The liblinear solver is specified because
# the default lbfgs solver does not support the L1 penalty, and random_state
# keeps the fit repeatable.
sel_ = SelectFromModel(
    LogisticRegression(C=1, fit_intercept=False, penalty='l1',
                       solver='liblinear', random_state=1)
)
sel_.fit(scaler.transform(X_resampled), y_resampled)

In [None]:
# Show the boolean mask of features kept by the selector (True = kept)
sel_.get_support()

In [None]:
# Collect the names of the selected features and report how many were kept and
# how many estimator coefficients are exactly zero
support_mask = sel_.get_support()
selected_feat = X_resampled.columns[support_mask]
zero_coef_count = np.sum(sel_.estimator_.coef_ == 0)
print(f'total features: {X_resampled.shape[1]}')
print(f'selected features: {len(selected_feat)}')
print(f'features with coefficients shrank to zero: {zero_coef_count}')

In [None]:
# Count the estimator coefficients that are exactly zero
(sel_.estimator_.coef_ == 0).sum()

In [None]:
# Identify the features the selector removed.
# The inverse of get_support() reflects the threshold SelectFromModel actually
# applied; testing coef_ == 0 only matches coefficients that are exactly zero
# and can disagree with the columns dropped by sel_.transform().
removed_feats = X_resampled.columns[~sel_.get_support()]
removed_feats

In [None]:
# Build new train and test matrices that keep only the selected features
# (missing values are replaced with 0 before the selection is applied)
X_train_selected, X_test_selected = (
    sel_.transform(frame.fillna(0)) for frame in (X_resampled, X_test)
)
X_train_selected.shape, X_test_selected.shape

# Model 1 -  ComplementNB Naive Bayes (with Lasso)

In [None]:
# Create the Complement Naive Bayes classifier with default settings
compNB_model = ComplementNB()

In [None]:
# Fit the Complement NB classifier on the selected, resampled training data
compNB_model.fit(X=X_train_selected, y=y_resampled)

In [None]:
# Predict on the test set and show predictions side by side with true labels
y_pred = compNB_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

In [None]:
# Print the Complement NB accuracy on the test set
print(accuracy_score(y_true=y_test, y_pred=y_pred))

# Model 2 -  GaussianNB Naive Bayes (with Lasso)

In [None]:
# Create the Gaussian Naive Bayes classifier with default settings
GNB_model = GaussianNB()

In [None]:
# Fit the Gaussian NB classifier on the selected, resampled training data
GNB_model.fit(X=X_train_selected, y=y_resampled)

In [None]:
# Predict on the test set and show predictions side by side with true labels
y_pred = GNB_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

In [None]:
# Print the Gaussian NB accuracy on the test set
print(accuracy_score(y_true=y_test, y_pred=y_pred))

# Model 3 - Deep Neural Net model (with Lasso)

In [None]:
# Define the model - a neural net with one hidden layer and a sigmoid output

# Seed TensorFlow so the random weight initialisation is repeatable between runs
tf.random.set_seed(1)

# shape[1] gives the number of feature columns directly; len(X[0]) only works
# when X supports positional row indexing.
number_input_features = X_train_selected.shape[1]
hidden_nodes_layer1 = 10
# hidden_nodes_layer2 = 5


nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))

# # Second hidden layer 
# nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer: a single sigmoid unit for the binary target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Restore the model weights
#nn.load_weights("checkpoints/weights.100.hdf5")

# Check the structure of the model 
nn.summary()

In [None]:
# Compile the network: Adam optimizer, binary cross-entropy loss, accuracy metric
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Convert the training and test labels to NumPy arrays for TensorFlow
y_resampled_np, y_test_np = (
    np.array(labels) for labels in (y_resampled, y_test)
)

In [None]:
# Train the network for 20 epochs on the selected, resampled training data
fit_model = nn.fit(
    x=X_train_selected,
    y=y_resampled_np,
    epochs=20,
)

In [None]:
# Score the trained network on the held-out test data
test_metrics = nn.evaluate(X_test_selected, y_test_np, verbose=2)
model_loss, model_accuracy = test_metrics
print(f"Test Loss: {model_loss}, Test Accuracy: {model_accuracy}")

In [None]:
# Score the network on its own training data, for comparison with the test score
train_metrics = nn.evaluate(X_train_selected, y_resampled_np, verbose=2)
model_loss, model_accuracy = train_metrics
print(f"Train Loss: {model_loss}, Train Accuracy: {model_accuracy}")

# Model 4 - Random Forest Classifier (With Lasso)

In [None]:
# Create a random forest classifier with 50 trees and a fixed seed
rf_model = RandomForestClassifier(
    random_state=78,
    n_estimators=50,
)

In [None]:
# Fit the random forest on the selected, resampled training data
rf_model.fit(X=X_train_selected, y=y_resampled)

In [None]:
# Predict on the test set and show predictions side by side with true labels
y_pred = rf_model.predict(X_test_selected)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

In [None]:
# Print the random forest accuracy on the test set
print(accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Rank features by their importance in the Random Forest model.
# The forest was trained on the output of sel_.transform(), so its importances
# line up with the selected columns only. Pairing them with every column of X
# would attach importances to the wrong names and silently truncate the list.
selected_columns = X.columns[sel_.get_support()]
sorted(zip(rf_model.feature_importances_, selected_columns), reverse=True)

# Save the models

In [None]:
# Persist the Complement NB model to disk with joblib
joblib.dump(value=compNB_model, filename='NB_model_v3.pkl')

In [None]:
# Save the neural network model to an HDF5 (.h5) file
nn.save("nn_model_v3.h5")

In [None]:
# Persist the random forest model to disk with joblib
joblib.dump(value=rf_model, filename='rf_model_joblib_v2.pkl')

In [None]:
# Save the random forest model with the standard-library pickle module.
# The context manager closes the file handle once the dump completes; passing a
# bare open() call to pickle.dump leaves the handle open.
with open('rf_model_pickle_v2.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)