In [2]:
import requests
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split

# Read the song dataset from the CSV file referenced relative to the notebook.
df = pd.read_csv('Nov27_Final_Dataset.csv')

# Predictors: ten per-track columns taken straight from the CSV.
feature_df = df[[
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'liveness', 'valence', 'tempo', 'key', 'duration',
]]
# Target: the 'popular' column coerced to boolean.
popular_df = df['popular'].astype('bool')

# Independent variables (X) and dependent variable (y) as NumPy arrays.
X = feature_df.to_numpy()
y = popular_df.to_numpy()

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=4,
)

In [5]:
from sklearn import svm

"""
    Define SVM using different kernels: Linear('linear'), polynomial('poly'), RBF('rbf'), and Sigmoid('sigmoid')
    **Perform dimensionality reduction
"""

# Support vector classifier with a polynomial kernel, gamma='auto' and C=2.
classifier = svm.SVC(C=2, gamma='auto', kernel='poly')

# Train on the training split, then predict labels for the held-out rows.
# fit() returns the estimator itself, so the two calls can be chained.
y_predict = classifier.fit(X_train, y_train).predict(X_test)

In [6]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 table for the held-out predictions.
print(classification_report(y_true=y_test, y_pred=y_predict))
# score() predicts X_test again and reports the mean accuracy against y_test.
print(f"Mean accuracy: {classifier.score(X_test, y_test)}")

              precision    recall  f1-score   support

       False       0.66      0.68      0.67        63
        True       0.56      0.53      0.54        47

    accuracy                           0.62       110
   macro avg       0.61      0.61      0.61       110
weighted avg       0.62      0.62      0.62       110

Mean accuracy: 0.6181818181818182


In [52]:
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

df = pd.read_csv('Nov27_Final_Dataset.csv')
# Experiment notes, kept as a bare string (it has no effect at runtime).
"""
Best accuracy with this setup: 63.768% 
Features that don't improve accuracy: instrumentalness

dance, energy, valence --> 66.66%, dance, tempo, valence -->66.66%
dance, energy, tempo -- > 62%

"""

# Predictors narrowed to three columns; target is 'popular' coerced to boolean.
feature_df = df[['danceability', 'energy', 'valence']]
popular_df = df['popular'].astype('bool')

# Independent variables (X) and dependent variable (y) as NumPy arrays.
X = feature_df.to_numpy()
y = popular_df.to_numpy()

# 75/25 train/test split with a pinned shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=4,
)

# Support vector classifier with a polynomial kernel, gamma='auto' and C=2.
classifier = svm.SVC(C=2, gamma='auto', kernel='poly')

# 10-fold cross-validated accuracy, computed on the training split only.
svm_cv_scores = cross_val_score(
    classifier,
    X_train,
    y_train,
    scoring='accuracy',
    cv=10,
)
print(f"SVM 10-Fold Cross-Validation Scores: {svm_cv_scores}")
print("Mean cross-validated score:", svm_cv_scores.mean())

# Fit on the full training split and predict the held-out rows.
# fit() returns the estimator itself, so the two calls can be chained.
y_predict = classifier.fit(X_train, y_train).predict(X_test)

# Per-class report followed by the overall accuracy on the test split.
print(classification_report(y_true=y_test, y_pred=y_predict))
print(f"Mean accuracy: {classifier.score(X_test, y_test)}")

SVM 10-Fold Cross-Validation Scores: [0.64285714 0.80487805 0.65853659 0.75609756 0.56097561 0.70731707
 0.63414634 0.63414634 0.65853659 0.70731707]
Mean cross-validated score: 0.6764808362369338
              precision    recall  f1-score   support

       False       0.68      0.79      0.73        80
        True       0.63      0.50      0.56        58

    accuracy                           0.67       138
   macro avg       0.66      0.64      0.65       138
weighted avg       0.66      0.67      0.66       138

Mean accuracy: 0.6666666666666666


In [8]:
"""
from sklearn.metrics import confusion_matrix
import seaborn as sns; sns.set()
import matplotlib.pyplot as ptl

mat = confusion_matrix(y_test, y_predict, normalize="true")      # unsure about the results of this confusion matrix
class_labels = ['True', 'False']
sns.heatmap(mat, annot=True, cmap='viridis', xticklabels=class_labels, yticklabels=class_labels)

ptl.title('SVM Classfier Confusion Matrix')
ptl.xlabel('Actual Popular Song')
ptl.ylabel('Predicted Popular Song')
"""

'\nfrom sklearn.metrics import confusion_matrix\nimport seaborn as sns; sns.set()\nimport matplotlib.pyplot as ptl\n\nmat = confusion_matrix(y_test, y_predict, normalize="true")      # unsure about the results of this confusion matrix\nclass_labels = [\'True\', \'False\']\nsns.heatmap(mat, annot=True, cmap=\'viridis\', xticklabels=class_labels, yticklabels=class_labels)\n\nptl.title(\'SVM Classfier Confusion Matrix\')\nptl.xlabel(\'Actual Popular Song\')\nptl.ylabel(\'Predicted Popular Song\')\n'

In [50]:

from sklearn import datasets
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

df = pd.read_csv('Nov27_Final_Dataset.csv')

# Predictors: three columns from the CSV; target: 'popular' coerced to boolean.
feature_df = df[['danceability', 'energy', 'valence']]
popular_df = df['popular'].astype('bool')

# Independent variables (X) and dependent variable (y) as NumPy arrays.
X = feature_df.to_numpy()
y = popular_df.to_numpy()

# 80/20 train/test split with a pinned shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Two-step estimator: standardise the features, then classify with an SVM.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svm', SVC()),
])

# Search space; the 'svm__' prefix routes each entry to the pipeline's 'svm' step.
param_grid = {
    'svm__C': [0.1, 1, 10, 100],       # regularisation parameter
    'svm__kernel': ['poly'],           # polynomial kernel only
    'svm__degree': [2, 3, 4],          # polynomial degree
    'svm__gamma': ['scale', 'auto'],   # kernel coefficient
}

# Exhaustive search over the grid, scored by 5-fold cross-validated accuracy.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)

# Run the search on the training split only.
grid_search.fit(X_train, y_train)

# Report the winning parameter combination.
print("Best hyperparameters:", grid_search.best_params_)

# Score the searched model on the held-out split.
accuracy = grid_search.score(X_test, y_test)
print("Test set accuracy:", accuracy)



Best hyperparameters: {'svm__C': 100, 'svm__degree': 3, 'svm__gamma': 'scale', 'svm__kernel': 'poly'}
Test set accuracy: 0.5545454545454546
