# Feature Engineering 

In [2]:
# Importing required libraries.
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Feature Engineering 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
# Testing 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import f1_score

In [3]:
# import data
df = pd.read_csv("data/features.csv")
df.columns

Index(['Unnamed: 0', 'genre', 'artist', 'track', 'popularity', 'acousticness',
       'danceability', 'duration_ms', 'energy', 'instrumentalness', 'key',
       'liveness', 'loudness', 'mode', 'speechiness', 'tempo',
       'time_signature', 'valence', 'favorite'],
      dtype='object')

In [4]:
df[df.favorite == 1]['time_signature'].value_counts()

4/4    3015
3/4     170
5/4      33
1/4      11
Name: time_signature, dtype: int64

In [12]:
# Just genre, popularity, danceability
test = df[['artist','time_signature','popularity','danceability','favorite']]
test

Unnamed: 0,artist,time_signature,popularity,danceability,favorite
0,Henri Salvador,4/4,0,0.389,0
1,Martin & les fées,4/4,1,0.590,0
2,Joseph Williams,5/4,3,0.663,0
3,Henri Salvador,4/4,0,0.240,0
4,Fabien Nataf,4/4,4,0.331,0
...,...,...,...,...,...
223039,Slave,4/4,39,0.687,0
223040,Jr Thomas & The Volcanos,4/4,38,0.785,0
223041,Muddy Waters,4/4,47,0.517,0
223042,R.LUM.R,4/4,44,0.745,0


In [None]:
# Train / Split Data
X_train, X_test, y_train, y_test = train_test_split(test.drop(columns='favorite'), test.favorite,test_size = .20)
# Applying SMOTE-ENC
smote_nc = SMOTENC(categorical_features=[0,1], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)
X_re_test, y_re_test = smote_nc.fit_resample(X_test, y_test)

In [None]:
# pipeline 
cat_feats = ['artist','time_signature']
cat_transformer = Pipeline([
    ('one-hot', OneHotEncoder())
])

In [8]:
# preprocessing pipeline (put them together)
preproc = ColumnTransformer(transformers=[('cat', cat_transformer, cat_feats)],remainder='passthrough')
pl = Pipeline(steps=[('preprocessor', preproc), ('DecisionTree', DecisionTreeClassifier())])

In [9]:
pl.fit(X_resampled,y_resampled)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  Pipeline(steps=[('one-hot',
                                                                   OneHotEncoder())]),
                                                  ['genre',
                                                   'time_signature'])])),
                ('DecisionTree', DecisionTreeClassifier())])

In [10]:
preds = pl.predict(X_re_test)

In [11]:
f1_score(y_re_test,preds)

0.7275231885388925

In [124]:
dt_scores = cross_val_score(pl, X_resampled, y_resampled, cv=5, scoring="f1")
dt_scores

array([0.8683726 , 0.87660893, 0.87951876, 0.87495689, 0.87570102])