# Modeling with Numeric Features

#### Important Note!
This notebook was temporarily put to the side because I decided to use a CNN with the mel specgrograms as features instead. I would like to return to this notebook on a later date and do some more research on numeric features to see how well they do.

In [1]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Reading in the data
genre = pd.read_csv('../data/genre_clean.csv')

In [3]:
# First five rows of the data for reference
genre.head()

Unnamed: 0,files,zero_crossing_rate,spectral_centroid,spectral_rolloff,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,labels,y
0,reggae.00080.wav,0.094298,2539.121009,5260.77136,-103.136673,67.205032,2.212647,28.504496,4.172197,11.874951,14.244764,14.292829,3.261752,15.854269,8.042967,4.469097,8.679431,reggae,9
1,jazz.00016.wav,0.069845,1465.857446,2822.406728,-259.87674,123.187164,-6.390842,37.570335,-2.977656,13.057896,-14.083035,5.310007,-11.961549,3.524627,-9.633516,5.333287,-7.843499,jazz,6
2,disco.00052.wav,0.169775,2590.650686,5060.527559,-70.502701,90.517845,-48.066078,36.687813,-17.691069,21.595446,-30.198866,24.046898,-20.459778,14.487847,-18.499725,9.128921,-12.056172,disco,4
3,jazz.00002.wav,0.057857,1064.668667,1895.729578,-256.959015,175.358765,-44.822285,25.65062,-4.255735,-0.222764,-11.312749,-9.189112,-4.09536,-8.30826,-22.548216,-7.608586,-7.651291,jazz,6
4,disco.00046.wav,0.114198,2259.565542,4889.552594,-125.681534,101.784462,-17.4781,33.672756,-15.236323,23.70314,-8.659072,18.544029,-14.054308,19.970242,-15.139117,12.969249,-14.306309,disco,4


In [10]:
# Defining our features and predictor variable
X = genre.drop(['files', 'labels', 'y'], axis=1)
y = genre['y']

In [11]:
# Getting our baseline accuracy
y.value_counts(normalize=True)

10    0.1
9     0.1
8     0.1
7     0.1
6     0.1
5     0.1
4     0.1
3     0.1
2     0.1
1     0.1
Name: y, dtype: float64

In [12]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [7]:
# Scaling the features
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

## Support Vector Machine

In [8]:
# Instantiating and fitting a basic svc model
svc = SVC()
svc.fit(X_train_sc, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [9]:
# Checking the accuracy
print(f'Score on the training set: {svc.score(X_train_sc, y_train)}')
print(f'Score on the test set: {svc.score(X_test_sc, y_test)}')

Score on the training set: 0.7146666666666667
Score on the test set: 0.604


## Random Forest

In [25]:
# Instantiating and fitting a basic random forest model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
# Checking the accuracy
print(f'Score on the training set: {rf.score(X_train, y_train)}')
print(f'Score on the test set: {rf.score(X_test, y_test)}')

Score on the training set: 0.9986666666666667
Score on the test set: 0.608
