# First, import the necessary libraries and read the data

In [None]:
# Third-party imports for this notebook.
# pandas must be imported (it was previously commented out): `pd` is used
# below to read both datasets, so without it this cell fails with a NameError
# on a fresh kernel.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,GridSearchCV

# Read in track metadata with genre labels
tracks = pd.read_csv('datasets/fma-rock-vs-hiphop.csv')

# Read in track metrics with the features
echonest_metrics = pd.read_json('datasets/echonest-metrics.json', precise_float=True)

# Merge the relevant columns of tracks and echonest_metrics.
# Only the genre label and the join key are taken from `tracks`; rows are
# matched on 'track_id' using merge's default join type.
echo_tracks = echonest_metrics.merge(tracks[['genre_top', 'track_id']], on='track_id')

# Create features: every merged column except the label and the identifier,
# as a NumPy array
features = echo_tracks.drop(["genre_top", "track_id"], axis=1).values

# Create labels: the genre of each track, row-aligned with `features`
labels = echo_tracks["genre_top"].values

# Partition the data into training and testing sets.
# Apply normalization to scale the data.
# Apply PCA to reduce the dimensionality of the data. (The optimal number of PCA components was obtained during EDA; see the EDA and Data Preprocessing notebook for details.)

In [26]:
# Hold out part of the data for testing; the fixed random_state makes the
# partition reproducible.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(features, labels, random_state=10)

# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both partitions.
scaler = StandardScaler().fit(train_features)
scaled_train_features = scaler.transform(train_features)
scaled_test_features = scaler.transform(test_features)

# PCA with the chosen number of components (6)
pca = PCA(n_components=6, random_state=10)

# Fit the components on the scaled training features and project them
train_pca = pca.fit_transform(scaled_train_features)

# Project the scaled test features onto the components learned from the
# training data (transform only -- PCA is not refit on the test set)
test_pca = pca.transform(scaled_test_features)

# Several variants of the SVM classifier can be found in train_SVM_Variants.ipynb. To use the functions in that file, first import it.
# Here we call the CV_Tuned_SVM classifier, in which the hyperparameters are tuned using cross-validation and the grid search method.

In [27]:
# Pull the SVM training helpers in from the train_SVM_Variants notebook
# (other SVM training options live in that notebook as well).
from ipynb.fs.full.train_SVM_Variants import *

# NOTE: this rebinds the name CV_Tuned_SVM from the imported function to the
# object it returns; later cells read the result under this name.
# Presumably the helper runs a cross-validated grid search -- see
# train_SVM_Variants for the details.
CV_Tuned_SVM = CV_Tuned_SVM(train_pca, train_labels)

# Report the selected hyperparameters, their score, and the refit estimator
print(f"Tuned SVM Parameters: {CV_Tuned_SVM.best_params_}")
print(f"Best score is {CV_Tuned_SVM.best_score_}")
print(f"Best Estimator is {CV_Tuned_SVM.best_estimator_}")

Tuned SVM Parameters: {'C': 1, 'gamma': 0.1}
Best score is 0.9141917090460779
Best Estimator is SVC(C=1, gamma=0.1)


# Several variants of the Decision Tree classifier can be found in train_DecisionTree_Variants.ipynb. To use the functions in that file, first import it.
# Here we call the CV_Tuned_DT classifier, in which the hyperparameters are tuned using cross-validation and the grid search method.

In [28]:
# Pull the decision tree training helpers in from the
# train_DecisionTree_Variants notebook.
from ipynb.fs.full.train_DecisionTree_Variants import *

# NOTE: this rebinds the name CV_Tuned_DT from the imported function to the
# object it returns; later cells read the result under this name.
CV_Tuned_DT = CV_Tuned_DT(train_pca, train_labels)

# Report the selected hyperparameters, their score, and the refit estimator
print(f"Tuned Decision Tree Parameters: {CV_Tuned_DT.best_params_}")
print(f"Best score is {CV_Tuned_DT.best_score_}")
print(f"Best Estimator is {CV_Tuned_DT.best_estimator_}")

Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': 4}
Best score is 0.8847565110186469
Best Estimator is DecisionTreeClassifier(criterion='entropy', max_depth=4)


In [None]:
# Here is the fine-tuned Logistic Regression classifier

In [29]:
# Pull the logistic regression training helpers in from the
# train_LogisticRegression_Variants notebook.
from ipynb.fs.full.train_LogisticRegression_Variants import *

# NOTE: this rebinds the name CV_Tune_LogR from the imported function to the
# object it returns; later cells read the result under this name.
CV_Tune_LogR = CV_Tune_LogR(train_pca, train_labels)

# Report the selected hyperparameters, their score, and the refit estimator
print(f"Tuned LogR Parameters: {CV_Tune_LogR.best_params_}")
print(f"Best score is {CV_Tune_LogR.best_score_}")
print(f"Best Estimator is {CV_Tune_LogR.best_estimator_}")



Tuned LogR Parameters: {'C': 0.4393970560760795}
Best score is 0.8769814301124981
Best Estimator is LogisticRegression(C=0.4393970560760795)


In [None]:
# KNN classifier

In [30]:
# Pull the k-nearest-neighbours training helpers in from the
# train_KNN_Variants notebook.
from ipynb.fs.full.train_KNN_Variants import *

# NOTE: this rebinds the name CV_KNN from the imported function to the object
# it returns; later cells read the result under this name.
CV_KNN = CV_KNN(train_pca, train_labels)

# Report the selected hyperparameters, their score, and the refit estimator
print(f"Tuned KNN Parameters: {CV_KNN.best_params_}")
print(f"Best score is {CV_KNN.best_score_}")
print(f"Best Estimator is {CV_KNN.best_estimator_}")


Tuned KNN Parameters: {'n_neighbors': 9}
Best score is 0.9114147018030513
Best Estimator is KNeighborsClassifier(n_neighbors=9)


In [None]:
# Random Forest Classifier

In [31]:
# Pull the ensemble training helpers in from the train_ensemble_Variants
# notebook.
from ipynb.fs.full.train_ensemble_Variants import *

# Tune a random forest on the PCA-projected training data; the result is kept
# in CV_RF for the comparison cells further down.
CV_RF = CV_RandomForest(train_pca, train_labels)

# Report the selected hyperparameters, their score, and the refit estimator
print(f"Tuned Random Forest Parameters: {CV_RF.best_params_}")
print(f"Best score is {CV_RF.best_score_}")
print(f"Best Estimator is {CV_RF.best_estimator_}")

Tuned Random Forest Parameters: {'max_features': 'sqrt', 'min_samples_leaf': 2, 'n_estimators': 350}
Best score is 0.9075277392510401
Best Estimator is RandomForestClassifier(max_features='sqrt', min_samples_leaf=2,
                       n_estimators=350)


In [None]:
Compare the cross validation results of each classifier

In [32]:
# Side-by-side view of the best cross-validation score reached by each tuned
# classifier on the training data.

# Display name -> tuned search object, in the order the rows are shown
cv_searches = {
    'LogR': CV_Tune_LogR,
    'Decision Tree': CV_Tuned_DT,
    'SVM': CV_Tuned_SVM,
    'KNN': CV_KNN,
    'Random Forest': CV_RF,
}
classifiers = list(cv_searches)
accuracy = [search.best_score_ for search in cv_searches.values()]

# One row per classifier, a single 'Accuracy' column
accuracy_table = pd.DataFrame({'Accuracy': accuracy}, index=classifiers)

print('Cross Validation Accuracy on Traning Data')
print(accuracy_table)

Cross Validation Accuracy on Traning Data
               Accuracy
LogR           0.876981
Decision Tree  0.884757
SVM            0.914192
KNN            0.911415
Random Forest  0.907528


Using the best parameters found during hyperparameter tuning with cross-validation, we will test the trained classifiers on new (held-out) data and measure their performance.

In [37]:
# Imported explicitly so this cell does not depend on `classification_report`
# leaking into the namespace through one of the earlier `import *` statements
# (or through leftover kernel state); no other import of it is visible in
# this notebook.
from sklearn.metrics import classification_report

# Compare the performance of the tuned models on the held-out test set.
# Each report scores the model's predictions on the PCA-projected test
# features against the true test labels.
print("Logistic Regression: \n", classification_report(test_labels, CV_Tune_LogR.predict(test_pca)))
print("\n Decision Tree: \n", classification_report(test_labels, CV_Tuned_DT.predict(test_pca)))
print("\n Support Vector Machine: \n", classification_report(test_labels, CV_Tuned_SVM.predict(test_pca)))
print("\n K Nearest Neighbors: \n", classification_report(test_labels, CV_KNN.predict(test_pca)))
print("\n Random Forest: \n", classification_report(test_labels, CV_RF.predict(test_pca)))

Logistic Regression: 
               precision    recall  f1-score   support

     Hip-Hop       0.77      0.54      0.64       235
        Rock       0.90      0.96      0.93       966

    accuracy                           0.88      1201
   macro avg       0.83      0.75      0.78      1201
weighted avg       0.87      0.88      0.87      1201


 Decision Tree: 
               precision    recall  f1-score   support

     Hip-Hop       0.75      0.61      0.67       235
        Rock       0.91      0.95      0.93       966

    accuracy                           0.88      1201
   macro avg       0.83      0.78      0.80      1201
weighted avg       0.88      0.88      0.88      1201


 Support Vector Machine: 
               precision    recall  f1-score   support

     Hip-Hop       0.85      0.60      0.71       235
        Rock       0.91      0.97      0.94       966

    accuracy                           0.90      1201
   macro avg       0.88      0.79      0.82      1201
weig