In [55]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Clean Data

In [77]:
df=pd.read_csv("data.csv")
df = df.sample(frac=1).reset_index(drop=True)
print(df.head())
dfX=df[['acousticness','danceability','duration_ms','energy','instrumentalness','liveness','loudness','speechiness','tempo','valence','year']]

dfy=df['popularity']

binarizer = Binarizer(threshold=0.50, copy=True)
df['popularity']=binarizer.fit_transform(df['popularity'].values.reshape(df.shape[0], 1))



X=dfX.to_numpy()
y=dfy.to_numpy()
xTrain,xTest,yTrain,yTest=train_test_split(X,y,test_size=0.50,random_state=0)

   acousticness                                            artists  \
0         0.871  ['Carmela Y Rafael', 'Rondalla Mexicana Del Ch...   
1         0.897                              ['Markos Vamvakaris']   
2         0.292                              ['The J. Geils Band']   
3         0.716                                ['Odetta', 'Larry']   
4         0.990               ['Francis Poulenc', 'Pierre Bernac']   

   danceability  duration_ms  energy  explicit                      id  \
0         0.527       178364   0.251         0  1Mcsy7qnG7gOzqAH7vpbAF   
1         0.750       194120   0.522         0  0dcY3n4KeVIP4A4G5zrt0v   
2         0.560       221840   0.863         0  0Fs1F6OIgVCy39ZLxKXZYI   
3         0.513       125453   0.589         0  4jGtxsvrC6lzjQK8R5xxJ2   
4         0.442        69253   0.345         0  2ig5HZeC6fgkC6A2Zo5zM9   

   instrumentalness  key  liveness  loudness  mode  \
0          0.000000   11     0.138   -11.265     0   
1          0.003850    1  

**BAYES with Scikit-lear**
*first try with a gaussian distribution: fit continuous data*
*NB : Gaussian Distribution, not pertinant
*sur-apprentissage ?*

In [78]:
gnb=GaussianNB()
yPred=gnb.fit(xTrain,yTrain).predict(xTest)
print("Number of mislabeled points out of a total %d points : %d"% (xTest.shape[0], (yTest != yPred).sum()))
metrics.accuracy_score(yTest, yPred)

Number of mislabeled points out of a total 84955 points : 7479


0.9119651580248367

*with scaling*

In [79]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cols=['acousticness','danceability','duration_ms','energy','instrumentalness','liveness','loudness','speechiness','tempo','valence','year']
def scaleColumns(df, cols2Scale):
    for col in cols2Scale:
        df[col] = pd.DataFrame(scaler.fit_transform(pd.DataFrame(df[col])),columns=[col])
    return df

dfScaled=scaleColumns(df,cols)
X=dfScaled[cols].to_numpy()
y=dfy.to_numpy()
xTrain,xTest,yTrain,yTest=train_test_split(X,y,test_size=0.25,random_state=0)
gnb=GaussianNB()
yPred=gnb.fit(xTrain,yTrain).predict(xTest)
print("Number of mislabeled points out of a total %d points : %d"% (xTest.shape[0], (yTest != yPred).sum()))
metrics.accuracy_score(yTest, yPred)*100


Number of mislabeled points out of a total 42478 points : 4877


88.51876265360893

# Preprocessing : discretization & binarization

In [80]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import Binarizer
#first n bins and then one hot encoder
#discrete : mode
#let's do an ordinalEncoder : since variable are ordinal and not nominal 
#(ordered categories but the distance is not known)


disc = KBinsDiscretizer(n_bins=4, encode='uniform', strategy='uniform')
columns=df.columns.values.tolist()
cols=['acousticness','danceability', 'duration_ms', 'energy','instrumentalness',  'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

def Discretize(df,cols):
    long=df.shape[0]
    pass


def Binarize(df,cols):
    long=df.shape[0]
    disc = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
    for c in cols:
        mean=df[c].mean()
        name=c+'Bin'
        binarizer = Binarizer(threshold=mean, copy=True)
        df[name]=binarizer.fit_transform(df[c].values.reshape(long, 1))
    return df


dfB=Binarize(df,cols)
print(dfB.head())

dfBX=dfB[['acousticnessBin','danceabilityBin','energyBin','instrumentalnessBin','livenessBin','loudnessBin','speechinessBin','tempoBin','valenceBin']][:1000]
#dfDX=dfD[['acousticnessDisc','danceabilityDisc','energyDisc','instrumentalnessDisc','livenessDisc','loudnessDisc','speechinessDisc','tempoDisc','valenceDisc']]
dfBy=df['popularity'][:1000]
XB=dfBX.to_numpy()
yB=dfBy.to_numpy()
xBTrain,xBTest,yBTrain,yBTest=train_test_split(XB,yB,test_size=0.50,random_state=0)


   acousticness                                            artists  \
0      1.003080  ['Carmela Y Rafael', 'Rondalla Mexicana Del Ch...   
1      1.072114                              ['Markos Vamvakaris']   
2     -0.534254                              ['The J. Geils Band']   
3      0.591531                                ['Odetta', 'Larry']   
4      1.319044               ['Francis Poulenc', 'Pierre Bernac']   

   danceability  duration_ms    energy  explicit                      id  \
0     -0.063587    -0.437203 -0.888567         0  1Mcsy7qnG7gOzqAH7vpbAF   
1      1.208189    -0.307333  0.124937         0  0dcY3n4KeVIP4A4G5zrt0v   
2      0.124613    -0.078850  1.400232         0  0Fs1F6OIgVCy39ZLxKXZYI   
3     -0.143430    -0.873325  0.375508         0  4jGtxsvrC6lzjQK8R5xxJ2   
4     -0.548345    -1.336557 -0.537019         0  2ig5HZeC6fgkC6A2Zo5zM9   

   instrumentalness  key  liveness  ...  acousticnessBin  danceabilityBin  \
0         -0.523513   11 -0.388529  ...      

**Bayes with SKlearn & binarized values**

In [81]:
from sklearn.naive_bayes import BernoulliNB
clf = BernoulliNB()
yBPred=clf.fit(xBTrain, yBTrain).predict(xBTest)
print("Number of mislabeled points out of a total %d points : %d"% (xBTest.shape[0], (yBTest != yBPred).sum()))
metrics.accuracy_score(yBTest, yBPred)*100

Number of mislabeled points out of a total 500 points : 116


76.8

In [82]:
#apprenticeship issues : too many data not varied 