In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
import json


#funbigfile.json holds all our data 
with open('funbigfile.json') as f:
    data = json.load(f)

df = pd.json_normalize(data['bruh'])
df = df.drop(['type', 'uri', 'track_href', 'analysis_url','time_signature', 'duration_ms'], axis=1)
#print(df)

#split training vs target - we should have 250 total songs
#WE'RE TRYING TO PREDICT GENRE
X_train, X_test, y_train, y_test = train_test_split(df.drop('genre', axis=1), df['genre'], test_size=0.2, random_state=42)

# Encode the target variable
lb = LabelBinarizer()
y_train_encoded = lb.fit_transform(y_train)
y_test_encoded = lb.transform(y_test)

#ATTRIBUTES WE DECIDE TO USE
#features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'speechiness', 'tempo', 'valence']
features = ['acousticness', 'danceability', 'energy', 'instrumentalness', 'key', 'liveness', 'loudness','speechiness', 'tempo', 'valence']
# Create the feature matrix
X_train = X_train[features]
X_test = X_test[features]

# Train a random forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train_encoded)

#create prediction
y_pred = rf.predict(X_test[features])



#run the scikit accuracy test 
accuracy = rf.score(X_test, y_test_encoded)
print("Accuracy:", accuracy)

#these are the 'right values'
print("TESTING DATASET: ")
print(y_test.values)

#these are what our model predicted
print("GUESSES: ")
print(lb.inverse_transform(y_pred))

rightvals = y_test.values
guessedvals = lb.inverse_transform(y_pred)

incorrect_guesses = rightvals != guessedvals
right_unguessed = rightvals[incorrect_guesses] #these are the things that were NOT guessed for
incorrect_preds = guessedvals[incorrect_guesses] #these are the things we guessed instead

comparearr = []
for i in range(0,len(right_unguessed),1):
    comparearr.append("CORRECT:" + right_unguessed[i] + " OUR GUESS:" + incorrect_preds[i])

print(comparearr)
#^THIS explains which songs were guessed incorrectly 


#enumerates genre list from bigfile.json
#it is dynamic so we can change genres easily through forfun.py and not worry about it here
genreList = list(set(rightvals))

for g in genreList:
    print('\n')
    total = np.count_nonzero(rightvals == g)
    missed = np.count_nonzero(right_unguessed == g)
    prop = str(1-(missed/total))
    print("Proportion of " + g + " guessed correctly: " + prop)
    wrongguess = np.count_nonzero(incorrect_preds == g)
    newprop = str((wrongguess/total))
    print("Proportion of guesses we shouldn't have guessed as " + g + ": " + newprop)

Accuracy: 0.94
TESTING DATASET: 
['Reggaeton' 'Metal' 'Jazz' 'Jazz' 'Reggaeton' 'Salsa' 'Salsa' 'Salsa'
 'Metal' 'Reggaeton' 'Country' 'Country' 'Country' 'Country' 'Jazz'
 'Country' 'Salsa' 'Metal' 'Salsa' 'Metal' 'Country' 'Metal' 'Reggaeton'
 'Jazz' 'Salsa' 'Metal' 'Jazz' 'Metal' 'Metal' 'Reggaeton' 'Country'
 'Metal' 'Reggaeton' 'Country' 'Metal' 'Reggaeton' 'Metal' 'Salsa' 'Salsa'
 'Salsa' 'Reggaeton' 'Jazz' 'Reggaeton' 'Salsa' 'Country' 'Metal'
 'Reggaeton' 'Jazz' 'Salsa' 'Metal']
GUESSES: 
['Reggaeton' 'Metal' 'Jazz' 'Jazz' 'Reggaeton' 'Salsa' 'Reggaeton' 'Salsa'
 'Metal' 'Reggaeton' 'Country' 'Country' 'Country' 'Country' 'Jazz'
 'Country' 'Salsa' 'Metal' 'Salsa' 'Metal' 'Country' 'Metal' 'Reggaeton'
 'Jazz' 'Salsa' 'Metal' 'Jazz' 'Metal' 'Metal' 'Reggaeton' 'Country'
 'Metal' 'Reggaeton' 'Country' 'Metal' 'Reggaeton' 'Metal' 'Salsa'
 'Country' 'Salsa' 'Reggaeton' 'Jazz' 'Reggaeton' 'Salsa' 'Country'
 'Metal' 'Reggaeton' 'Jazz' 'Country' 'Metal']
['CORRECT:Salsa OUR GUESS:Regga