# Modeling

In [24]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Reading in the data
genre = pd.read_csv('../data/genre_clean.csv')

In [18]:
# Defining our features and predictor variable
X = genre.drop(['files', 'labels', 'y'], axis=1)
y = genre['y']

In [19]:
# Getting our baseline accuracy
y.value_counts(normalize=True)

10    0.1
9     0.1
8     0.1
7     0.1
6     0.1
5     0.1
4     0.1
3     0.1
2     0.1
1     0.1
Name: y, dtype: float64

In [20]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [21]:
# Scaling the features
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

## Support Vector Machine

In [22]:
# Instantiating and fitting a basic svc model
svc = SVC()
svc.fit(X_train_sc, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [23]:
# Checking the accuracy
print(f'Score on the training set: {svc.score(X_train_sc, y_train)}')
print(f'Score on the test set: {svc.score(X_test_sc, y_test)}')

Score on the training set: 0.692
Score on the test set: 0.604


## Random Forest

In [25]:
# Instantiating and fitting a basic random forest model
rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [27]:
# Checking the accuracy
print(f'Score on the training set: {rf.score(X_train, y_train)}')
print(f'Score on the test set: {rf.score(X_test, y_test)}')

Score on the training set: 0.9986666666666667
Score on the test set: 0.608
