# Feature Engineering 

In [10]:
# Importing required libraries.
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt 
from imblearn.over_sampling import SMOTE

In [18]:
# Importing required libraries
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics 
from sklearn.metrics import f1_score
# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 
from sklearn.tree import DecisionTreeClassifier

In [19]:
# Train / test split (80 % / 20 %).
# stratify keeps the favorite / non-favorite ratio the same in both partitions,
# and the fixed random_state makes the split repeatable across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    features.drop(columns="favorite"),
    features["favorite"],
    test_size=0.20,
    stratify=features["favorite"],
    random_state=42,
)

In [20]:
# Remove the columns that are not fed to the models from the training features.
# Only X_train is changed in this cell; X_test is left untouched here.
X_train = X_train.drop(
    ["genre", "artist", "key", "mode", "time_signature"],
    axis="columns",
)

In [21]:
# Applying SMOTE to the training split only; the test split is not resampled.
# random_state is fixed so the synthetic rows are the same on every run.
# NOTE(review): X_train / y_train are overwritten with the resampled data, and the
# cross-validation cells that follow score on that resampled set, so their validation
# folds also contain SMOTE-generated rows -- treat those CV scores as optimistic.
oversample = SMOTE(random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

For each model, I will use cross-validation to see how well it generalizes and to obtain an F1 score.

In [28]:
%%time
# Logistic Regression
lr = LogisticRegression()
lr_scores = cross_val_score(lr, X_train, y_train, cv=10, scoring="f1")
np.mean(lr_scores)

CPU times: user 26.4 s, sys: 803 ms, total: 27.2 s
Wall time: 14 s


0.7904689733827925

In [29]:
# Hyperparameter optimization for Decision Tree Classifier.
# Candidates are ranked by F1 (scoring="f1") so the search optimizes the same metric
# that the cross-validation cells report; without it GridSearchCV falls back to the
# estimator's default score, which is accuracy for classifiers.
parameters = {
    'max_depth': [15, 20, 30],
}
# The single-step Pipeline wrapper is kept so `dtc.named_steps['CV']` keeps working.
dtc = Pipeline([
    ('CV', GridSearchCV(
        DecisionTreeClassifier(random_state=42),  # fixed seed so the search is repeatable
        parameters,
        scoring="f1",
        cv=5,
    )),
])
dtc.fit(X_train, y_train)
# Displayed as the cell output. If the winner is the largest candidate depth,
# consider extending the grid upwards.
dtc.named_steps['CV'].best_params_

{'max_depth': 30}

In [31]:
%%time
# Decision Tree
dt = DecisionTreeClassifier(max_depth=30)
dt_scores = cross_val_score(dt, X_train, y_train, cv=10, scoring="f1")
np.mean(dt_scores)

CPU times: user 59.4 s, sys: 495 ms, total: 59.9 s
Wall time: 1min


0.9785300408107732

In [22]:
# Hyperparameter optimization of RandomForestClassifier.
# Candidates are ranked by F1 (scoring="f1") so the search optimizes the same metric
# that the cross-validation cells report; without it GridSearchCV falls back to the
# estimator's default score, which is accuracy for classifiers.
parameters = {
    'max_depth': [6, 12, 15, 20],
    'n_estimators': [20, 30],
}
# The single-step Pipeline wrapper is kept so `clf.named_steps['CV']` keeps working.
clf = Pipeline([
    ('CV', GridSearchCV(
        RandomForestClassifier(random_state=42),  # fixed seed so the search is repeatable
        parameters,
        scoring="f1",
        cv=5,
    )),
])
clf.fit(X_train, y_train)
# Displayed as the cell output: the best combination found by the search.
clf.named_steps['CV'].best_params_

{'max_depth': 20, 'n_estimators': 30}

In [23]:
%%time
# Cross-validation for RandomForestClassifier
rf = Pipeline([('rf', RandomForestClassifier(n_estimators = 20, max_depth = 30))])
rf_scores = cross_val_score(rf, X_train, y_train, cv=10, scoring="f1")
np.mean(rf_scores)

CPU times: user 4min 53s, sys: 6.08 s, total: 4min 59s
Wall time: 5min 6s


0.9917851082368905

### Predicting Songs and Saving Dataset for Personal Use

In [33]:
# Fit the decision tree on the whole (SMOTE-resampled) training set.
# NOTE(review): the recorded cross-validation F1 is higher for the random forest
# than for this tree -- confirm the tree is the intended final model.
dt.fit(X=X_train, y=y_train)

DecisionTreeClassifier(max_depth=30)

In [34]:
# Predict on the held-out split after dropping the same columns that were removed
# from X_train, then report F1 on this test data (which was never resampled).
# Previously y_pred was computed but never compared against y_test.
y_pred = dt.predict(
    X_test.drop(columns=['genre', 'artist', 'key', 'mode', 'time_signature'])
)
f1_score(y_test, y_pred)

In [35]:
# Classify every row of `future` with the fitted tree. The listed columns are
# dropped before the frame is handed to predict(); `future` itself is not modified.
prediction = dt.predict(
    future.drop(
        ["track", "track_id", "favorite", "genre", "artist", "key", "time_signature", "mode"],
        axis=1,
    )
)

In [36]:
# Store the model output on `future` as a 'prediction' column (one value per row).
future.loc[:, "prediction"] = prediction

In [37]:
# Keep only the tracks that are not already favorites (favorite == 0)
# but that the model predicts to be favorites (prediction == 1).
future = future.loc[future["favorite"].eq(0) & future["prediction"].eq(1)]

In [38]:
# Trim the identifier and audio-feature columns listed below from the result.
future = future.drop(
    [
        "track_id",
        "energy",
        "duration_ms",
        "acousticness",
        "key",
        "liveness",
        "loudness",
        "mode",
        "speechiness",
        "tempo",
        "time_signature",
        "valence",
    ],
    axis="columns",
)

In [39]:
# Display the filtered recommendations: rows with favorite == 0 and prediction == 1,
# after the identifier and audio-feature columns were dropped.
future

Unnamed: 0,genre,artist,track,popularity,danceability,instrumentalness,favorite,prediction
138,R&B,Surfaces,Heaven Falls / Fall on Me,61,0.563,0.002430,0,1
139,R&B,Olivia O'Brien,Love Myself,68,0.653,0.000000,0,1
146,R&B,Beyoncé,Flawless Remix,69,0.639,0.000008,0,1
149,R&B,Big Sean,I Know,66,0.741,0.000000,0,1
152,R&B,Jessie Reyez,Body Count (feat. Normani & Kehlani) - Remix,62,0.703,0.000000,0,1
...,...,...,...,...,...,...,...,...
230844,Soul,Seinabo Sey,Younger - Bonus Track / Acoustic Version,53,0.479,0.000000,0,1
230914,Soul,Paolo Nutini,These Streets,52,0.721,0.000000,0,1
231246,Soul,LION BABE,Honey Dew,45,0.674,0.007340,0,1
231895,Soul,Sports,Shiggy,41,0.554,0.000224,0,1


In [218]:
# Write the recommendations to recommendations.csv in the current working directory.
# `index` is left at its default, so the DataFrame index is written as the first column.
future.to_csv(path_or_buf="recommendations.csv")

### Conclusions
After converting recommendations.csv into a Spotify playlist, I was able to browse and listen to the recommendations. Overall, I quite enjoyed them, apart from a few songs here and there that prompted a "what?" reaction.
You can test the model for yourself at [whatspopping.xyz](http://www.whatspopping.xyz/)! Let me know what you think. :)