In [1]:
import numpy as np
import pickle
import pandas as pd
from pathlib import Path
import scipy.stats as st
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [2]:
# Read the raw track table from the CSV next to the notebook and display it
train_df = pd.read_csv(Path('music.csv'))
train_df

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,genre
0,Bruno Mars,That's What I Like (feat. Gucci Mane),60.0,0.854,0.564,1.0,-4.964,1,0.0485,0.017100,,0.0849,0.8990,134.071,234596.0,4,Hip-Hop
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.230,1,0.0406,0.001100,0.004010,0.1010,0.5690,116.454,251733.0,4,Rock
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486000,0.000196,0.3940,0.7870,147.681,109667.0,4,Indie
3,Deno,Lingo (feat. J.I & Chunkz),66.0,0.853,0.597,10.0,-6.528,0,0.0555,0.021200,,0.1220,0.5690,107.033,173968.0,4,Hip-Hop
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.2160,0.000169,0.016100,0.1720,0.0918,199.060,229960.0,4,Rock
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991,Green-House,Find Home,35.0,0.166,0.109,7.0,-17.100,0,0.0413,0.993000,0.824000,0.0984,0.1770,171.587,193450.0,3,Indie
17992,Micatone,All Gone,27.0,0.638,0.223,11.0,-10.174,0,0.0329,0.858000,0.000016,0.0705,0.3350,73.016,257067.0,4,Blues
17993,Smash Hit Combo,Peine perdue,34.0,0.558,0.981,4.0,-4.683,0,0.0712,0.000030,0.000136,0.6660,0.2620,105.000,216222.0,4,Metal
17994,Beherit,Salomon's Gate,29.0,0.215,0.805,6.0,-12.757,0,0.1340,0.001290,0.916000,0.2560,0.3550,131.363,219693.0,4,Metal


In [3]:
# Discard every row that has at least one missing value
train_df = train_df.dropna(axis=0, how="any")

In [4]:
# Work on an independent copy so the binning column is not added to train_df
bin_df = train_df.copy(deep=True)

In [5]:
# Edges of the two popularity bands (split at 50) and one label per band
bins, group_labels = [0, 50, 100], ["50 or Below", "Above 50"]

In [6]:
# Label each track with its popularity band.
# include_lowest=True closes the first bin on the left, so a Popularity of
# exactly 0 is labelled "50 or Below" instead of silently becoming NaN.
bin_df["Pop"] = pd.cut(
    bin_df["Popularity"], bins, labels=group_labels, include_lowest=True
)
bin_df

Unnamed: 0,Artist Name,Track Name,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,genre,Pop
1,Boston,Hitch a Ride,54.0,0.382,0.814,3.0,-7.230,1,0.0406,0.001100,0.004010,0.1010,0.5690,116.454,251733.0,4,Rock,Above 50
2,The Raincoats,No Side to Fall In,35.0,0.434,0.614,6.0,-8.334,1,0.0525,0.486000,0.000196,0.3940,0.7870,147.681,109667.0,4,Indie,50 or Below
4,Red Hot Chili Peppers,Nobody Weird Like Me - Remastered,53.0,0.167,0.975,2.0,-4.279,1,0.2160,0.000169,0.016100,0.1720,0.0918,199.060,229960.0,4,Rock,Above 50
5,The Stooges,Search and Destroy - Iggy Pop Mix,53.0,0.235,0.977,6.0,0.878,1,0.1070,0.003530,0.006040,0.1720,0.2410,152.952,208133.0,4,Indie,Above 50
6,Solomon Burke,None Of Us Are Free,48.0,0.674,0.658,5.0,-9.647,0,0.1040,0.404000,0.000001,0.0981,0.6770,143.292,329387.0,4,Blues,50 or Below
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991,Green-House,Find Home,35.0,0.166,0.109,7.0,-17.100,0,0.0413,0.993000,0.824000,0.0984,0.1770,171.587,193450.0,3,Indie,50 or Below
17992,Micatone,All Gone,27.0,0.638,0.223,11.0,-10.174,0,0.0329,0.858000,0.000016,0.0705,0.3350,73.016,257067.0,4,Blues,50 or Below
17993,Smash Hit Combo,Peine perdue,34.0,0.558,0.981,4.0,-4.683,0,0.0712,0.000030,0.000136,0.6660,0.2620,105.000,216222.0,4,Metal,50 or Below
17994,Beherit,Salomon's Gate,29.0,0.215,0.805,6.0,-12.757,0,0.1340,0.001290,0.916000,0.2560,0.3550,131.363,219693.0,4,Metal,50 or Below


In [7]:
# Create a GroupBy object based upon the "Pop" popularity band.
# observed=False keeps pandas' current behaviour for categorical keys explicit.
test_group = bin_df.groupby("Pop", observed=False)

# Find how many rows fall into each bin
print(test_group["Pop"].count())

# Get the average of each numeric column within the GroupBy object.
# numeric_only=True skips the text columns (artist, track, genre), which
# pandas >= 2.0 would otherwise raise a TypeError on.
test_group.mean(numeric_only=True)

Pop
50 or Below    8081
Above 50       3732
Name: Pop, dtype: int64


Unnamed: 0_level_0,Popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature
Pop,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
50 or Below,34.349709,0.510606,0.663184,5.961886,-8.476019,0.620097,0.071438,0.260531,0.214869,0.202222,0.455982,122.984011,213209.040024,3.898899
Above 50,62.049303,0.55599,0.685723,5.995713,-7.652363,0.632369,0.07039,0.195443,0.101305,0.179666,0.500718,122.808747,212371.96906,3.946141


# Correlations

# Model Building

In [8]:
def test_model(model, data):
    X_train_scaled, X_test_scaled, y_train, y_test = data
    reg = model.fit(X_train_scaled, y_train)
    print(f'Model: {type(reg).__name__}')
    print(f'Train score: {reg.score(X_train_scaled, y_train)}')
    print(f'Test Score: {reg.score(X_test_scaled, y_test)}\n')
    plt.show()    

In [9]:
# Feature frame: everything except the target ("Pop", and the raw
# "Popularity" it was derived from) and the free-text identifier columns.
# drop() returns a new frame, so bin_df itself is left untouched.
test = bin_df.drop(columns=["Pop", "Artist Name", "Track Name", "Popularity"])
test

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_in min/ms,time_signature,genre
1,0.382,0.814,3.0,-7.230,1,0.0406,0.001100,0.004010,0.1010,0.5690,116.454,251733.0,4,Rock
2,0.434,0.614,6.0,-8.334,1,0.0525,0.486000,0.000196,0.3940,0.7870,147.681,109667.0,4,Indie
4,0.167,0.975,2.0,-4.279,1,0.2160,0.000169,0.016100,0.1720,0.0918,199.060,229960.0,4,Rock
5,0.235,0.977,6.0,0.878,1,0.1070,0.003530,0.006040,0.1720,0.2410,152.952,208133.0,4,Indie
6,0.674,0.658,5.0,-9.647,0,0.1040,0.404000,0.000001,0.0981,0.6770,143.292,329387.0,4,Blues
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17991,0.166,0.109,7.0,-17.100,0,0.0413,0.993000,0.824000,0.0984,0.1770,171.587,193450.0,3,Indie
17992,0.638,0.223,11.0,-10.174,0,0.0329,0.858000,0.000016,0.0705,0.3350,73.016,257067.0,4,Blues
17993,0.558,0.981,4.0,-4.683,0,0.0712,0.000030,0.000136,0.6660,0.2620,105.000,216222.0,4,Metal
17994,0.215,0.805,6.0,-12.757,0,0.1340,0.001290,0.916000,0.2560,0.3550,131.363,219693.0,4,Metal


In [10]:
# Encode the popularity band as integers; codes follow sorted label order,
# so "50 or Below" -> 0 and "Above 50" -> 1
y_label = LabelEncoder().fit_transform(bin_df.loc[:, 'Pop'])
y_label

array([1, 0, 1, ..., 0, 0, 0])

In [11]:
# One-hot encode the categorical genre column; the encoded band is the target
X, y = pd.get_dummies(test), y_label

# Hold out a test split (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

# Bundle in the order expected by test_model()
data = [X_train_scaled, X_test_scaled, y_train, y_test]

In [19]:
# Persist the fitted scaler; the context manager guarantees the file is
# flushed and closed even if pickling fails
scale_test = scaler
with open('scale.pkl', "wb") as scaler_file:
    pickle.dump(scale_test, scaler_file)

In [13]:
# Inspect the final feature names, including the one-hot genre_* columns
X.columns

Index(['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'duration_in min/ms', 'time_signature', 'genre_Acoustic/Folk',
       'genre_Alternate', 'genre_Blues', 'genre_Bollywood', 'genre_Country',
       'genre_Hip-Hop', 'genre_Indie', 'genre_Instrumental', 'genre_Metal',
       'genre_Pop', 'genre_Rock'],
      dtype='object')

# Familiar Regressors

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.linear_model import LogisticRegression

# Logistic Regression

In [15]:
# Fit a default-configured logistic regression on the standardized training
# features. fit() returns the estimator itself, so app_test and classifier
# refer to the same fitted model.
classifier = LogisticRegression()
classifier.fit(X_train_scaled, y_train)
app_test = classifier

In [16]:
# Report the classifier's score on the training and held-out splits
print(
    f"Training Data Score: {classifier.score(X_train_scaled, y_train)}",
    f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}",
    sep="\n",
)

Training Data Score: 0.6843887571960718
Testing Data Score: 0.7085308056872038


In [17]:
# The classifier was fitted on standardized features, so it must also predict
# on the standardized test matrix; feeding it the raw X_test puts the inputs
# on a completely different scale and skews the predictions.
predictions = classifier.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,1
4,0,1
...,...,...
2949,0,1
2950,0,0
2951,0,0
2952,0,1


In [18]:
# Persist the fitted classifier; the context manager guarantees the file is
# flushed and closed even if pickling fails
with open('app.pkl', "wb") as model_file:
    pickle.dump(app_test, model_file)