FEATURE ENGINEERING

In [2]:
# Importing required libraries.
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
# Feature Engineering 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA

In [4]:
# Model selection, classifier, resampling, and evaluation metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTENC
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import f1_score

In [5]:
# Read the tracks feature CSV into a DataFrame and show which columns it has.
tracks_csv = "data/tracks_features.csv"
df = pd.read_csv(tracks_csv)
df.columns

Index(['id', 'name', 'album', 'album_id', 'artists', 'artist_ids',
       'track_number', 'disc_number', 'explicit', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
       'time_signature', 'year', 'release_date'],
      dtype='object')

In [6]:
# Count how often each time signature occurs among rows with favorite == 1.
# NOTE(review): the recorded run of this cell raised AttributeError -- the
# column listing printed for data/tracks_features.csv has no `favorite`
# column. Confirm which dataset or merge step is supposed to supply it.
favorite_mask = df.favorite == 1
df[favorite_mask]['time_signature'].value_counts()

AttributeError: 'DataFrame' object has no attribute 'favorite'

In [None]:
# Keep only the columns used for modelling: artist, time signature,
# popularity and danceability, plus the `favorite` label.
# NOTE(review): `artist`, `popularity` and `favorite` are not among the
# columns printed for the loaded CSV (it lists `artists`) -- confirm the
# intended input data before relying on this cell.
model_cols = ['artist', 'time_signature', 'popularity', 'danceability', 'favorite']
test = df[model_cols]
test

In [None]:
# Train / test split, followed by SMOTE-NC oversampling of the TRAINING data only.
features = test.drop(columns='favorite')
target = test.favorite

# Seeded so the split is reproducible on Restart & Run All; stratified so the
# train and test portions keep the same favorite / non-favorite ratio.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=.20, stratify=target, random_state=0
)

# Applying SMOTE-NC: positions 0 and 1 of the feature frame ('artist',
# 'time_signature') are the categorical columns.
smote_nc = SMOTENC(categorical_features=[0, 1], random_state=0)
X_resampled, y_resampled = smote_nc.fit_resample(X_train, y_train)

# The hold-out set must NOT be oversampled: scoring on synthetic rows does not
# measure performance on real data. The `X_re_test` / `y_re_test` names are
# kept so the later cells still run, but they now refer to the untouched split.
X_re_test, y_re_test = X_test, y_test

In [None]:
# Categorical preprocessing: one-hot encode the categorical feature columns.
cat_feats = ['artist','time_signature']
cat_transformer = Pipeline([
    # handle_unknown='ignore': a category that was not present when the encoder
    # was fitted (e.g. an artist that only occurs in held-out rows) is encoded
    # as all zeros instead of raising an error at transform / predict time.
    ('one-hot', OneHotEncoder(handle_unknown='ignore'))
])

In [None]:
# Assemble the full model: one-hot encode the categorical columns, pass every
# other column through unchanged, then feed the result to a decision tree.
# NOTE(review): the tree is created without a random_state, so repeated runs
# may not produce identical models -- consider seeding it.
column_steps = [('cat', cat_transformer, cat_feats)]
preproc = ColumnTransformer(transformers=column_steps, remainder='passthrough')
model_steps = [('preprocessor', preproc), ('DecisionTree', DecisionTreeClassifier())]
pl = Pipeline(steps=model_steps)

In [None]:
# Fit the preprocessing + decision-tree pipeline on the resampled training data.
pl.fit(X=X_resampled, y=y_resampled)

In [None]:
# Predict labels for the evaluation features with the fitted pipeline.
preds = pl.predict(X=X_re_test)

In [None]:
# F1 score of the predictions against the evaluation labels.
f1_score(y_true=y_re_test, y_pred=preds)

In [None]:
# 5-fold cross-validated F1 for the decision-tree model.
# Oversampling happens INSIDE each fold: the imblearn pipeline applies SMOTE-NC
# only to the fold's training part, so no synthetic neighbour of a validation
# row leaks into training (which is what cross-validating on the already
# resampled data would do). Each fold is therefore scored on real rows only.
cv_pl = ImbPipeline(steps=[
    # Positions 0 and 1 ('artist', 'time_signature') are the categorical columns.
    ('smote', SMOTENC(categorical_features=[0, 1], random_state=0)),
    # Reuse the preprocessing and classifier steps of `pl`; cross_val_score
    # clones the estimator, so the fitted `pl` itself is left untouched.
    *pl.steps,
])
dt_scores = cross_val_score(cv_pl, X_train, y_train, cv=5, scoring="f1")
dt_scores