In [1]:
import pandas as pd
import numpy

#learning stuff
from sklearn import neighbors
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from math import sqrt

#graphing stuff!
%matplotlib notebook
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

Checkpoint \#1:
So far, we've refined our dataset in Excel (DataRefined.xlsx), pulled it into the notebook, and begun to lay that data out in an accessible fashion for use later. 
Basically, the Excel has several rows for musical metadata, which are pulled into a DataFrame called soundData, and two rows for mood data (created by our team) that are pulled into their own DataFrame emotionData. 
These are each being reduced to a single-column array, through PCA. These will be paired together to begin predicting emotionData. 

Unexpected Challenges:
- Randomizing Excel data - more complicated than it seems!
- Finding and eliminating NaNs in datasets.

Next Up (By Tuesday):
- Complete training set
- Begin training algorithms!

Note: I reduced the dataset to a nice round 10 thousand

In [2]:
data = pd.read_excel('DataRefined.xlsx')
#data is a pandas DataFrame
data_array = data.values
#data_array is a bit easier to think about - each value can be accessed with data_array[x]

In [3]:
#soundData is trimmed DataFrame that only contains song metadata
soundData = data.drop(columns=['artist_name', 'track_url', 'happysad', 'calmenergetic', 'trainortest', 'random', 'PCA output?', 'track_id'])
soundData.fillna(0)

Unnamed: 0,acousticness,danceability,energy,instrumentalness,liveness,speechiness,tempo,valence
0,0.078285,0.314572,0.309468,0.089045,0.354460,0.051375,127.856,0.212025
1,0.907184,0.608202,0.487272,0.952051,0.147523,0.055712,127.945,0.655503
2,0.902820,0.417216,0.192552,0.941335,0.103392,0.034817,110.647,0.114613
3,0.514903,0.297365,0.716660,0.884307,0.164822,0.045856,157.893,0.704008
4,0.420525,0.419414,0.800006,0.917086,0.188013,0.046945,98.782,0.856437
5,0.416963,0.756106,0.286637,0.000584,0.069551,0.338857,98.982,0.658264
6,0.694238,0.408591,0.919719,0.876129,0.131418,0.039127,109.640,0.944002
7,0.048519,0.667429,0.769637,0.852260,0.244618,0.063519,140.091,0.345366
8,0.945563,0.646874,0.295514,0.222247,0.060897,0.054109,133.129,0.502844
9,0.897702,0.796883,0.274274,0.939014,0.102938,0.067910,126.014,0.114578


In [4]:
#print(data.head())

In [5]:
emotionData = data.drop(columns=['artist_name', 'track_url', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness', 'speechiness', 'tempo', 'valence', 'trainortest', 'random', 'PCA output?', 'track_id'])
emotionData.fillna(0)

Unnamed: 0,happysad,calmenergetic
0,5.0,8.0
1,1.0,2.0
2,0.0,0.0
3,0.0,0.0
4,7.0,3.0
5,3.0,4.0
6,4.0,6.0
7,0.0,0.0
8,8.0,6.0
9,4.0,1.0


In [6]:
#print(data_array[0])

In [7]:
#data.shape

In [8]:
#data_array[:5]

Goal here: PCA 

In [9]:
pca = PCA(n_components = 1)
numpy.nan_to_num(soundData)
numpy.isfinite(soundData).all()

#these columsn occasionally have empty values so here we just set them to 0
for index, row in soundData.iterrows():
    if(not numpy.isfinite(row["danceability"])):
       row["danceability"] = 0 
    if(not numpy.isfinite(row["speechiness"])):
       row["speechiness"] = 0 
    if(not numpy.isfinite(row["valence"])):
       row["valence"] = 0 
eigenbasis = pca.fit(soundData)

In [10]:
soundDataPCA = eigenbasis.transform(soundData)

In [11]:
soundDataPCA.shape

(10000, 1)

In [12]:
soundDataPCA[:10]

array([[  4.97565375],
       [  5.06488366],
       [-12.23413748],
       [ 35.01374082],
       [-24.09670909],
       [-23.89830834],
       [-13.23870497],
       [ 17.21177759],
       [ 10.24802095],
       [  3.13282303]])

In [13]:
for index, row in emotionData.iterrows():
    if(not numpy.isfinite(row["happysad"])):
       row["happysad"] = 0 
    if(not numpy.isfinite(row["calmenergetic"])):
       row["calmenergetic"] = 0 

In [39]:
emotionPCA = PCA(n_components = 1)
eigenbasisEmotion = emotionPCA.fit(emotionData)
emotionDataPCA = eigenbasisEmotion.transform(emotionData)
emotionDataPCA.shape

(10000, 1)

In [40]:
emotionDataPCA[:10]

array([[ 9.29015179],
       [ 2.11661289],
       [-0.06235761],
       [-0.06235761],
       [ 6.70754873],
       [ 4.93221078],
       [ 7.11118128],
       [-0.06235761],
       [ 9.6576908 ],
       [ 3.25532346]])

In [41]:
trainData_emotion = emotionDataPCA[:100]
trainData_emotion[:10]

array([[ 9.29015179],
       [ 2.11661289],
       [-0.06235761],
       [-0.06235761],
       [ 6.70754873],
       [ 4.93221078],
       [ 7.11118128],
       [-0.06235761],
       [ 9.6576908 ],
       [ 3.25532346]])

In [44]:
twotrainData_emotion = []
for i in range(100):
    #print(emotionDataPCA[i])
    twotrainData_emotion.append(emotionDataPCA[i][0])
twotrainData_emotion[:10]

[9.290151789482138,
 2.116612892179925,
 -0.06235761408559831,
 -0.06235761408559831,
 6.707548727093725,
 4.9322107769510914,
 7.111181283216615,
 -0.06235761408559831,
 9.657690797239189,
 3.255323463816915]

In [30]:
for i in range(len(trainData_emotion)):
    #print(trainData_emotion[i][0])
    hold = trainData_emotion[i][0]
    trainData_emotion[i] = 0
    print(hold)
    #trainData_emotion[i] = trainData_emotion[i][0]
trainData_emotion[:10]


9.290151789482145
2.116612892179927
-0.06235761408559832
-0.06235761408559832
6.707548727093709
4.932210776951093
7.111181283216618
-0.06235761408559832
9.657690797239178
3.2553234638169033
2.6186960853112637
7.245725468590922
11.971205488879006
-0.06235761408559832
10.563406546493423
8.384436040227898
9.926779167987783
3.3898676491912068
6.474553904710979
2.88778445605987
8.653524410976505
8.886519233359236
-0.06235761408559832
-0.06235761408559832
10.697950731867726
7.111181283216618
9.290151789482145
-0.06235761408559832
9.155607604107841
5.568838155456732
7.478720290973652
5.066754962325397
4.663122406202486
8.92261278172511
2.88778445605987
3.6589560199398132
4.295583398445453
5.201299147699699
4.295583398445453
6.609098090085283
5.066754962325397
9.021063418733538
6.340009719336676
1.2108971429256814
9.021063418733538
9.290151789482145
6.474553904710979
8.788068596350808
8.518980225602201
7.245725468590922
4.295583398445453
7.245725468590922
9.155607604107841
8.384436040227898
-0

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.]])

In [19]:
trainData_sound = soundDataPCA[:100]
trainData_sound[:10]

array([[  4.97565375],
       [  5.06488366],
       [-12.23413748],
       [ 35.01374082],
       [-24.09670909],
       [-23.89830834],
       [-13.23870497],
       [ 17.21177759],
       [ 10.24802095],
       [  3.13282303]])

In [20]:
testData_emotion = emotionDataPCA[100:]
testData_emotion[:10]

array([[-0.06235761],
       [-0.06235761],
       [-0.06235761],
       [-0.06235761],
       [-0.06235761],
       [-0.06235761],
       [-0.06235761],
       [-0.06235761],
       [-0.06235761],
       [-0.06235761]])

In [21]:
testData_sound = soundDataPCA[100:]
testData_sound[:10]

array([[ 22.63481199],
       [ 11.05482994],
       [-39.84525749],
       [-58.10276798],
       [ -1.63721607],
       [ 30.4292374 ],
       [  1.55383814],
       [ 33.06567168],
       [ 32.70214667],
       [-26.81139913]])

In [22]:
type(trainData_sound[0][0])

numpy.float64

In [23]:
classifier = neighbors.KNeighborsClassifier(metric='euclidean', n_neighbors=10)
classifier.fit(trainData_emotion, trainData_sound)
#classifier.score(testData_emotion, testData_sound)

  


ValueError: Unknown label type: 'continuous'