# TOP 50 Spotify songs 

In [292]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)          
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('top50.csv', index_col=0, encoding='ISO-8859-1')
df.head()


Unnamed: 0,Track.Name,Artist.Name,Genre,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
1,Señorita,Shawn Mendes,canadian pop,117,55,76,-6,8,75,191,4,3,79
2,China,Anuel AA,reggaeton flow,105,81,79,-4,8,61,302,8,9,92
3,boyfriend (with Social House),Ariana Grande,dance pop,190,80,40,-4,16,70,186,12,46,85
4,Beautiful People (feat. Khalid),Ed Sheeran,pop,93,65,64,-8,8,55,198,12,19,86
5,Goodbyes (Feat. Young Thug),Post Malone,dfw rap,150,65,58,-4,11,18,175,45,7,94


## Data Information:

- Track Name - Name of the song including featured artists
- Artist Name - Name of the artist
- Genre - Genre of the song
- Beats per Minute - The tempo of the song.
- Energy - The energy of a song. The higher the value, the more energtic the song
- Danceability - The higher the value, the easier it is to dance to this song.
- Loudness..dB.. - The higher the value, the louder the song.
- Liveness - The higher the value, the more likely the song is a live recording.
- Valence - The higher the value, the more positive mood for the song.
- Length - The duration of the song.
- Acousticness - The higher the value the more acoustic the song is.
- Speechiness - The higher the value the more spoken word the song contains.
- Popularity - The higher the value the more popular the song is.

## Display info dataset

In [293]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 50 entries, 1 to 50
Data columns (total 13 columns):
Track.Name          50 non-null object
Artist.Name         50 non-null object
Genre               50 non-null object
Beats.Per.Minute    50 non-null int64
Energy              50 non-null int64
Danceability        50 non-null int64
Loudness..dB..      50 non-null int64
Liveness            50 non-null int64
Valence.            50 non-null int64
Length.             50 non-null int64
Acousticness..      50 non-null int64
Speechiness.        50 non-null int64
Popularity          50 non-null int64
dtypes: int64(10), object(3)
memory usage: 4.9+ KB


Check if some of datasets are not complete 

In [294]:
df.isnull().sum()

Track.Name          0
Artist.Name         0
Genre               0
Beats.Per.Minute    0
Energy              0
Danceability        0
Loudness..dB..      0
Liveness            0
Valence.            0
Length.             0
Acousticness..      0
Speechiness.        0
Popularity          0
dtype: int64

Dimensions of data set

In [295]:
print('Number of rows in the dataset: ',df.shape[0])
print('Number of columns in the dataset: ',df.shape[1])

Number of rows in the dataset:  50
Number of columns in the dataset:  13


Basic statistical details 

In [296]:
df.describe().round(decimals=3)

Unnamed: 0,Beats.Per.Minute,Energy,Danceability,Loudness..dB..,Liveness,Valence.,Length.,Acousticness..,Speechiness.,Popularity
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,120.06,64.06,71.38,-5.66,14.66,54.6,200.96,22.16,12.48,87.5
std,30.898,14.232,11.93,2.056,11.118,22.336,39.144,18.996,11.162,4.491
min,85.0,32.0,29.0,-11.0,5.0,10.0,115.0,1.0,3.0,70.0
25%,96.0,55.25,67.0,-6.75,8.0,38.25,176.75,8.25,5.0,86.0
50%,104.5,66.5,73.5,-6.0,11.0,55.5,198.0,15.0,7.0,88.0
75%,137.5,74.75,79.75,-4.0,15.75,69.5,217.5,33.75,15.0,90.75
max,190.0,88.0,90.0,-2.0,58.0,95.0,309.0,75.0,46.0,95.0


The features described in the above data set are:

1. **Count** tells us the number of NoN-empty rows in a feature.

2. **Mean** tells us the mean value of that feature.

3. **Std** tells us the Standard Deviation Value of that feature.

4. **Min** tells us the minimum value of that feature.

5. **25%, 50%, and 75%** are the percentile/quartile of each features.

6. **Max** tells us the maximum value of that feature.


### Number of unique values in dataset per feature

In [297]:
print('Number of unique artists', df['Artist.Name'].nunique())
print('Number of unique genre', df['Genre'].nunique())

Number of unique artists 38
Number of unique genre 21


In [298]:
df['GeneralGenre']=[1 if each =='atl hip hop'
                      else 2 if each =='canadian hip hop'
                      else 3 if each == 'trap music'
                      else 4 if each == 'australian pop'
                      else 5 if each == 'boy band'
                      else 6 if each == 'canadian pop'
                      else 7 if each == 'dance pop'
                      else 8 if each == 'panamanian pop'
                      else 9 if each == 'pop'
                      else 10 if each == 'pop house'
                      else 11 if each == 'big room'
                      else 12 if each == 'brostep'
                      else 13 if each == 'edm'
                      else 14 if each == 'electropop'
                      else 15 if each == 'country rap'
                      else 16 if each == 'dfw rap'
                      else 17 if each == 'hip hop'
                      else 18 if each == 'latin'
                      else 19 if each == 'r&n en espanol'
                      else 20 for each in df['Genre']]

In [299]:
# Split the data into a training set and a testing set. Set: test_size=0.3, random_state=1
# 'Energy', 'Danceability', 'Loudness..dB..', 'Length.', 'Acousticness..', 'Speechiness.', 'Popularity'
predictors = ['GeneralGenre', 'Beats.Per.Minute', 'Energy', 'Danceability', 'Loudness..dB..', 'Length.', 'Acousticness..', 'Speechiness.']
x = df[predictors]
y = df['Popularity']

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

print ("train shape", X_train.shape, y_train.shape)
print ("test shape", X_test.shape, y_test.shape)

train shape (35, 8) (35,)
test shape (15, 8) (15,)


In [300]:
# import LogisticRegression from: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# your code here
from sklearn.linear_model import LinearRegression
# .fit for training
clf = LinearRegression().fit(X_train, y_train)
# your code here
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)
print ('train accuracy =', train_score)
print ('test accuracy =', test_score)

y_pred = clf.predict(X_test)
print(y_pred)
print(y_test)

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print(mse)

train accuracy = 0.26558728711050095
test accuracy = -0.3445401881439565
[82.83704296 86.69536143 85.73069199 85.39976075 94.10549507 87.92726876
 85.87657994 93.16613542 88.6149076  88.47994471 94.67029653 84.91675225
 90.45405982 90.13470553 93.44309806]
28    89
36    89
41    88
39    80
3     85
4     86
49    88
30    89
47    88
32    89
33    89
40    78
22    91
37    91
20    91
Name: Popularity, dtype: int64
18.249892153740635


In [301]:
#  Let's print the model's parameters

# coeff = pd.DataFrame()
# coeff['Feature'] = X_train.columns
# coeff['Coefficient Estimate'] = pd.Series(clf.coef_[0])
# coeff.loc[len(coeff)]=['Intercept',clf.intercept_[0]]
# print (coeff)

We now need to predict class labels for the test set. We will also generate the class probabilities

In [302]:
# predict class labels for the test set
# your code here
# to see if it is one or zero
y_pred = clf.predict(X_test)
df_output = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
print (df_output)

    Actual  Predicted
28      89  82.837043
36      89  86.695361
41      88  85.730692
39      80  85.399761
3       85  94.105495
4       86  87.927269
49      88  85.876580
30      89  93.166135
47      88  88.614908
32      89  88.479945
33      89  94.670297
40      78  84.916752
22      91  90.454060
37      91  90.134706
20      91  93.443098


In [303]:
# generate class probabilities : http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
# your code here
# y_probs = clf.predict_proba(X_test)
# 2 columns, first prediction to not survice and the second one is to survive
# print (y_probs)

In [304]:
from sklearn.linear_model import Ridge
clfRidge = Ridge(alpha=1.0)
clfRidge.fit(X_train, y_train)

train_score1 = clfRidge.score(X_train, y_train)
test_score1 = clfRidge.score(X_test, y_test)
print ('train accuracy =', train_score1)
print ('test accuracy =', test_score1)

y_pred1 = clfRidge.predict(X_test)
print(y_pred1)

train accuracy = 0.26557770732526953
test accuracy = -0.3360311321317777
[82.85054024 86.71775696 85.75615155 85.40901386 94.06557839 87.93682183
 85.89443853 93.1341303  88.62372947 88.49475794 94.63788233 84.92188103
 90.45631516 90.11857984 93.41325371]


In [305]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
prediction = knn.predict(X_test)
df_output = pd.DataFrame({'Actual': y_test, 'Predicted': prediction})
print(df_output)

    Actual  Predicted
28      89         79
36      89         79
41      88         79
39      80         79
3       85         90
4       86         82
49      88         82
30      89         88
47      88         87
32      89         83
33      89         88
40      78         82
22      91         87
37      91         87
20      91         87


  return self.partial_fit(X, y)
  after removing the cwd from sys.path.
  """
