In [2]:
import requests
import random
import string

import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab
from scipy.stats import norm
import numpy as np
import operator
import scipy.stats as ss
from math import pi
import math
from sklearn.model_selection import train_test_split
import pandas as pd

In [5]:
#split data into test and training sets
def split_data(data, test_size = 0.25, random_state = 21):
    """Split one playlist's DataFrame into training and testing partitions.

    Parameters
    ----------
    data : pandas.DataFrame
        Playlist table that contains a 'trackID' column. The input frame is
        not modified; the column is dropped on a copy.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.25, the value that
        was previously hard-coded.
    random_state : int, optional
        Forwarded to ``train_test_split`` so the split is reproducible.
        Defaults to 21, the value that was previously hard-coded.

    Returns
    -------
    (train, test)
        The two partitions produced by ``train_test_split``, without the
        'trackID' column.
    """
    #trackID is not needed by the classifiers, so it is dropped before splitting
    features_only = data.drop(columns = 'trackID')
    train, test = train_test_split(features_only, test_size = test_size, random_state = random_state)
    return train, test
def read_data(data_dir = 'data'):
    """Load the six per-genre CSV files and label their first column 'trackID'.

    For each of classical, country, edm, jazz, rap and rock this reads
    ``<data_dir>/<genre>_csv.csv`` and renames the column called
    'Unnamed: 0' to 'trackID'.

    Parameters
    ----------
    data_dir : str or path-like, optional
        Directory that holds the ``*_csv.csv`` files. Defaults to 'data',
        the location that was previously hard-coded.

    Returns
    -------
    tuple of pandas.DataFrame
        Six frames in the order classical, country, edm, jazz, rap, rock.
    """
    genres = ('classical', 'country', 'edm', 'jazz', 'rap', 'rock')
    frames = []
    for genre in genres:
        frame = pd.read_csv('{}/{}_csv.csv'.format(data_dir, genre))
        #Renaming the first column to trackID
        frames.append(frame.rename(columns={'Unnamed: 0': 'trackID'}))
    return tuple(frames)


# -------- VARIABLES NEEDED ------------
#load the raw per-genre tables
classical, country, edm, jazz, rap, rock = read_data()

#split every genre into a training frame and a testing frame
classical_train, classical_test = split_data(classical)
country_train, country_test = split_data(country)
edm_train, edm_test = split_data(edm)
jazz_train, jazz_test = split_data(jazz)
rap_train, rap_test = split_data(rap)
rock_train, rock_test = split_data(rock)

#per-genre frames collected in one fixed order so they can be looped over
trainingDFs = [classical_train, country_train, edm_train, jazz_train, rap_train, rock_train]
testingDFs = [classical_test, country_test, edm_test, jazz_test, rap_test, rock_test]

#findPlaylistNB compares df.name with the predicted playlist, so every test
#frame is tagged with its true label
classical_test.name = 'classical'
country_test.name = 'country'
edm_test.name = 'edm'
jazz_test.name = 'jazz'
rap_test.name = 'rap'
rock_test.name = 'rock'

#compiling all in one list to make looping easier
allPlaylists = ['classical', 'country', 'edm', 'jazz', 'rap', 'rock']

#feature names: every column of the raw jazz table except the first one
#(presumably trackID -- confirm against the CSV layout)
features = jazz.columns.values
features = np.delete(features, 0)

#central dictionary {playlist: {feature: [values]}} used for graphing.
#Built with comprehensions on purpose: dict.fromkeys(keys, value) stores the
#SAME value object under every key, so the previous nested fromkeys call made
#all playlists share one inner dict and all features share one list.
featDict = {
    playlist: {feat: [] for feat in ["acousticness",
        "danceability", "energy", "instrumentalness", "key", "loudness",
        'speechiness', "tempo", 'valence']}
    for playlist in allPlaylists
}

# NOTE(review): the disabled loop below assigns each training frame's values to
# EVERY playlist (so the last frame would win); pair frames with playlists
# before re-enabling it.
# for feat in features:
#     for df in trainingDFs:
#         for playlist in allPlaylists:
#             allFeats = []
#             for i in range(0, len(df)):
#                 allFeats.append(df.iloc[i][feat])
#             featDict[playlist] = featDict[playlist].copy() #copy is needed to ensure data is different and not carried over
#             featDict[playlist][feat] = allFeats


#same collections with the country playlist left out
trainingNoCountry = [classical_train, edm_train, jazz_train, rap_train, rock_train]
testingNoCountry = [classical_test, edm_test, jazz_test, rap_test, rock_test]
allPlaysNoCountry = ['classical', 'edm', 'jazz', 'rap', 'rock']
# -------- NAIVE BAYES ---------
# https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
def mean(numbers):
    """Return the arithmetic mean of ``numbers``.

    Every entry is coerced with ``float`` first, so numeric strings and
    pandas/numpy scalars are accepted. An empty input raises
    ZeroDivisionError.
    """
    #coerce every entry to float before summing
    values = [float(entry) for entry in numbers]
    total = sum(values)
    return total / float(len(values))
 
def stdev(numbers):
    """Return the sample standard deviation of ``numbers``.

    Entries are coerced with ``float``; the sum of squared deviations from
    the mean is divided by ``len(numbers) - 1`` before taking the square
    root with numpy.
    """
    values = [float(entry) for entry in numbers]
    center = mean(values)
    squared_devs = [pow(value - center, 2) for value in values]
    #n - 1 in the denominator (sample variance)
    sample_variance = sum(squared_devs) / float(len(values) - 1)
    return np.sqrt(sample_variance)

#probability function
def calcProb(x, mean, stdev):
    """Evaluate the Gaussian density with the given ``mean`` and ``stdev`` at ``x``.

    Computes ``exp(-(x - mean)^2 / (2 * stdev^2)) / (sqrt(2 * pi) * stdev)``.
    Note that the ``mean`` and ``stdev`` parameters shadow the module-level
    functions of the same names inside this function.
    """
    squared_dev = np.square(x - mean)
    twice_variance = 2 * np.square(stdev)
    normaliser = np.sqrt(2 * pi) * stdev
    return np.exp(-(squared_dev / twice_variance)) / normaliser

def allFeatsProbs(song, meanSDev):
    """Compute, per playlist, the Gaussian likelihood of each feature of ``song``.

    Parameters
    ----------
    song : pandas.Series or sequence
        Feature values of one song. They are read by POSITION, so they must
        be in the same order as the (mean, stdev) pairs in ``meanSDev``.
    meanSDev : dict
        Maps a playlist name to a list of ``(mean, stdev)`` pairs, one per
        feature.

    Returns
    -------
    dict
        Maps each playlist name to the list of ``calcProb`` results, one per
        feature.
    """
    #A row taken with df.iloc[i] is indexed by column NAME; song[i] with an
    #integer only works through the positional fallback that pandas has
    #deprecated, so use .iloc whenever the object offers it.
    by_position = song.iloc if hasattr(song, 'iloc') else song
    probs = {}
    for key, value in meanSDev.items():
        probs[key] = [calcProb(by_position[i], val[0], val[1])
                      for i, val in enumerate(value)]
    return probs

#returns a probability dictionary with playlists as keys
def probForSong(probs):
    """Collapse per-feature probabilities into one probability per playlist.

    Parameters
    ----------
    probs : dict
        Maps a playlist name to an iterable of per-feature probabilities
        (as produced by ``allFeatsProbs``).

    Returns
    -------
    dict
        Maps every playlist name present in ``probs`` to the product of its
        per-feature probabilities (1 for an empty list), in the same key
        order as ``probs``.

    The result is built from the keys actually present in ``probs`` rather
    than from a hard-coded playlist list, so any label set (e.g. one that
    includes 'country') works and no placeholder entries are left behind.
    """
    songProb = {}
    for key, value in probs.items():
        prob = 1
        for val in value:
            prob = prob * val
        songProb[key] = prob
    return songProb


def createNBDict():
    """Build the Naive Bayes parameter table from the module-level training data.

    For every frame in ``trainingNoCountry`` a list of ``(mean, stdev)``
    tuples is computed, one tuple per name in ``features`` and in that
    order. The list is stored under the playlist name found at the same
    position in ``allPlaysNoCountry``.

    Returns
    -------
    dict
        Maps playlist name to a list of ``(mean, stdev)`` tuples.
    """
    NBDict = {}
    for position, df in enumerate(trainingNoCountry):
        playlist = allPlaysNoCountry[position]
        NBDict[playlist] = [(mean(df[feat]), stdev(df[feat])) for feat in features]
    return NBDict

#per-playlist (mean, stdev) table built once from the training frames; passed to findPlaylistNB
nbDict = createNBDict()


def findPlaylistNB(testLists, nbDict):
    """Classify every song of every test frame and print the accuracy.

    For each row the per-playlist probability is computed with
    ``allFeatsProbs`` and ``probForSong``; the playlist with the highest
    probability is the prediction, which is compared with the frame's
    ``name`` attribute.

    Parameters
    ----------
    testLists : iterable of pandas.DataFrame
        Test frames. Each must carry a ``name`` attribute holding its true
        playlist label. Rows are passed to ``allFeatsProbs``, which reads
        values by position, so the column order must match ``nbDict``.
    nbDict : dict
        Maps playlist name to a list of ``(mean, stdev)`` pairs, as returned
        by ``createNBDict``.

    Returns
    -------
    None
        Results are printed: one line per frame plus an overall line. A
        frame with no rows (or an empty ``testLists``) reports ``nan``
        instead of raising ZeroDivisionError.
    """
    correct = 0
    total = 0
    for df in testLists:
        dfCorrect = 0
        dfTotal = len(df)
        for i in range(dfTotal):
            probs = probForSong(allFeatsProbs(df.iloc[i], nbDict))
            #finding max value for prediction
            playlist = max(probs.items(), key=operator.itemgetter(1))[0]
            #if correct prediction, increase correct
            if df.name == playlist:
                dfCorrect += 1
        correct += dfCorrect
        total += dfTotal
        #outputting
        dfFraction = float(dfCorrect/dfTotal) if dfTotal else float('nan')
        print("NAIVE BAYES:", df.name, "fraction correct:", dfFraction)
    totalFraction = float(correct/total) if total else float('nan')
    print("NAIVE BAYES - Total fraction correct:", totalFraction)




In [7]:
#display the list of test frames (country excluded) that the classifier is evaluated on
#NOTE(review): this renders every frame in full; showing df.head() per frame would keep the output short
testingNoCountry

[      acousticness  danceability   energy  instrumentalness  key  loudness  \
 41           0.995        0.3710  0.02000          0.905000   11   -29.519   
 1133         0.804        0.3370  0.34200          0.466000    1   -15.630   
 1395         0.896        0.0783  0.05230          0.945000    5   -23.247   
 156          0.995        0.2690  0.00888          0.919000    8   -30.658   
 284          0.989        0.2790  0.00461          0.982000    2   -36.886   
 1246         0.906        0.1370  0.08420          0.903000   10   -20.476   
 1082         0.722        0.1860  0.19900          0.583000    2   -17.319   
 466          0.993        0.2100  0.16300          0.841000    4   -16.102   
 1814         0.994        0.2700  0.13800          0.487000    0   -16.773   
 68           0.985        0.2640  0.25300          0.944000    4   -21.191   
 1417         0.977        0.2490  0.15100          0.905000    1   -20.443   
 1925         0.989        0.3090  0.29400          

In [6]:
#single-frame list (rock only); it is not used by the call below
smallTest = [rock_test]
#evaluate the Naive Bayes classifier on every non-country test frame; prints per-playlist and overall accuracy
findPlaylistNB(testingNoCountry, nbDict)

NAIVE BAYES: classical fraction correct: 0.8886792452830189
NAIVE BAYES: edm fraction correct: 0.5831702544031311
NAIVE BAYES: jazz fraction correct: 0.636697247706422
NAIVE BAYES: rap fraction correct: 0.944547134935305
NAIVE BAYES: rock fraction correct: 0.6286836935166994
NAIVE BAYES - Total fraction correct: 0.738619119878604
