In [None]:
# Loading data into the colab environment

from google.colab import files
uploaded = files.upload()

Saving ConsolidatedCountryData_Brazil.csv to ConsolidatedCountryData_Brazil.csv


In [None]:
#Importing methods to be used later on

import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Part 1 Data Acquisition

# Read data from a CSV file into a data frame
# Put name of file you uploaded in brackets if working in Google Colab
df = pd.read_csv(r'ConsolidatedCountryData_Brazil.csv')
#df = df.drop_duplicates()


# Display the first ten rows of the data frame to examine if it is an individual-level data set
print(df.head(10))

# Display the variable list
print(df.columns.values)

# Display the number of rows and the number of columns in the data set to confirm the portrait shape
# The first element of the output is the number of rows and the second is the number of columns 
print(df.shape)

       Song Name                           Artist  ...  time_signature  Approved
0       24kGoldn           Mood (feat. iann dior)  ...               4         0
1  Ariana Grande                        positions  ...               4         0
2       Ashnikko                            Daisy  ...               4         0
3        Ava Max                   Kings & Queens  ...               4         0
4            BTS                         Dynamite  ...               4         0
5        Cardi B  WAP (feat. Megan Thee Stallion)  ...               4         0
6    Chris Brown                         Go Crazy  ...               4         0
7     Conan Gray                          Heather  ...               3         0
8         DaBaby     ROCKSTAR (feat. Roddy Ricch)  ...               4         0
9      DJ Khaled            POPSTAR (feat. Drake)  ...               4         0

[10 rows x 15 columns]
['Song Name' 'Artist' 'danceability' 'energy' 'key' 'loudness' 'mode'
 'speechiness' 

In [None]:
#Categorical and Numerical list building
#Remove Artist and Song Name for Feature Selection
df = df.drop(['Artist','Song Name','key','time_signature'],axis = 1)
df_sample1 = df
# Separate all the variables into two lists for future column indexing
# One for numerical, the other for categorical 
cvar_list = ['mode','Approved']
nvar_list = ['danceability', 'energy', 'loudness', 'speechiness', 
             'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

In [None]:
# Part 4 Variable transformation

# Standardize the numerical variables 
df_sample5 = df_sample1.copy()
df_sample5[nvar_list] = (df_sample1[nvar_list] - df_sample1[nvar_list].mean())/df_sample1[nvar_list].std()

# Set the datatype for the variables in the cvar_list to be categorical in Python
# Set the datatype for the variables in the nvar_list to be numerical in Python 
df_sample6 = df_sample5.copy()
df_sample6[cvar_list] = df_sample5[cvar_list].astype('category')
df_sample6[nvar_list] = df_sample5[nvar_list].astype('float64')

# Convert the categorical variables into dummies (Step 1 of dummy coding)
df_sample7 = df_sample6.copy()
df_sample7 = pd.get_dummies(df_sample6, prefix_sep='_')

# Remove the redundant dummies (Step 2 of dummy coding)
# Placeholder variable: rdummies
rdummies = ['Approved_0']
df_sample8 = df_sample7.copy()
df_sample8 = df_sample7.drop(columns=rdummies)

# Get the remaining variable list after the variable transformation
print(df_sample8.columns.values)

# Display the milestone dataframe. Compare it with the original dataframe.
print(df_sample8)
print(df)

['danceability' 'energy' 'loudness' 'speechiness' 'acousticness'
 'instrumentalness' 'liveness' 'valence' 'tempo' 'mode_0' 'mode_1'
 'Approved_1']
     danceability    energy  loudness  ...  mode_0  mode_1  Approved_1
0        0.171802  0.522272  1.238997  ...       1       0           0
1        0.420122  1.027934  0.688680  ...       0       1           0
2        1.096103  0.149345  0.054509  ...       0       1           0
3       -0.262757  0.320007  1.010348  ...       1       0           0
4        0.489100  0.794065  0.848597  ...       1       0           0
..            ...       ...       ...  ...     ...     ...         ...
414     -2.359679 -0.014995  0.622239  ...       1       0           0
415     -0.228268 -1.677361 -1.431485  ...       0       1           0
416      0.302860 -0.261505  0.003647  ...       0       1           0
417     -1.111183  0.572838  0.150276  ...       0       1           0
418      0.737419 -0.817734  0.246502  ...       1       0           0



In [None]:
# Part 5 Data Partiton

# Required package: scikit-learn. Package name in Python: sklearn
# Required subpackage: model_selection. Required function name: train_test_split
from sklearn.model_selection import train_test_split

# Placeholder variables: df4partition, testpart_size
# test_size specifies the percentage for the test partition
df4partition = df_sample8
testpart_size = 0.2

# random_state specifies the seed for random number generator. 
# random_state = 1 unless otherwised noted
df_nontestData, df_testData = train_test_split(df4partition, test_size=testpart_size, random_state=1)

print(df_nontestData)

     danceability    energy  loudness  ...  mode_0  mode_1  Approved_1
358     -1.276730 -0.135090  0.278118  ...       0       1           0
245     -0.014438  0.408498 -0.822058  ...       0       1           0
207     -0.973228  0.016609  0.317983  ...       0       1           0
122      0.785704  0.452743  0.922828  ...       0       1           0
377      0.551180 -0.387921 -1.223455  ...       1       0           0
..            ...       ...       ...  ...     ...     ...         ...
255      0.489100 -0.179335 -0.353761  ...       1       0           0
72       0.599464 -0.267826  0.438036  ...       1       0           1
396     -0.173086 -0.166694  0.060466  ...       1       0           0
235      0.489100  0.794065  0.848597  ...       1       0           0
37       0.233882 -0.577545 -1.022298  ...       0       1           0

[335 rows x 12 columns]


In [None]:
# Part 6 Nearest neighbor 

# Required package: scikit-learn. Package name in Python: sklearn
# Required subpackage: neighbors 
# Required function name: KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Separate the predictor values and the DV values into X and y respectively
# Placeholder variable: DV
DV = 'Approved_1'
y = df_nontestData[DV]
X = df_nontestData.drop(columns=[DV])

In [None]:
#kNN ##################################################################
print('kNN')
# First we build a kNN model with pre-specified k
k = 3
kfolds = 5

clf = KNeighborsClassifier(metric='euclidean', n_neighbors=k).fit(X,y)

# Report the performance over the test partition

X_test = df_testData.drop(columns=DV)
y_test = df_testData[DV]

# y_test_actual is the actual values of the DV in the test partition
y_test_actual = df_testData[DV]

from sklearn import metrics

model_object = clf

#ATTENTION TO MATT OR OWA FROM OWA, HAD TO CHANGE THIS FROM 200 AS FUNCTION WOULDNT WORK
max_k = 150

param_grid = {'n_neighbors': list(range(1, max_k-1))}

# Set n_jobs to be -1 to run  on all CPU cores.
# The search criterion is to find the model that maximizes 
# whatever the scoring function - for this case roc_auc - returns.

from sklearn.model_selection import GridSearchCV

gridsearch = GridSearchCV(KNeighborsClassifier(metric='euclidean'), param_grid, scoring='roc_auc', cv=kfolds, n_jobs=-1)
gridsearch.fit(X,y)
clf_bestkNN = gridsearch.best_estimator_

print('AUC score')
# obtain the AUC of the k-NN model with k=5
print(metrics.roc_auc_score(y_test, clf_bestkNN.predict_proba(X_test)[:, 1]))
y_test_predicted_kNN = clf_bestkNN.predict(X_test)
# Display the confusion matrix over the test partition
print('Accuracy score')
# Display the accuracy over the test partition
print(clf_bestkNN.score(X_test, y_test_actual))
print('Optimum k')
print(clf_bestkNN.n_neighbors)

kNN
AUC score
0.8614814814814815
Accuracy score
0.9166666666666666
Optimum k
15


In [None]:
# Display the optimal k (k-th nearest neighbor)


# y_test_actual is the actual values of the DV in the test partition
y_test_actual = df_testData[DV]

# X_test is the predictor values in the test partition
X_test = df_testData.drop(columns=[DV])


# Get the AUC of the final selected k-NN model
from sklearn.metrics import roc_auc_score
print(roc_auc_score(y_test_actual, clf_bestkNN.predict_proba(X_test)[:,1]))

0.8614814814814815


In [None]:
# Part7 Score the new data

# Upload UsedCar_NewData.csv from the local drive to Colab
from google.colab import files
uploaded = files.upload()

Saving This is Kacey Musgraves.csv to This is Kacey Musgraves.csv


In [None]:
# Load the data from UsedCar_NewData.csv to a Python dataframe df_newdata
df_newdata = pd.read_csv(r'This is Kacey Musgraves.csv')

# Display the first ten rows of the data frame to examine if it is an individual-level data set
print(df_newdata.head(10))

# Display the variable list
print(df_newdata.columns.values)

# Display the number of rows and the number of columns in the data set to confirm the portrait shape
# The first element of the output is the number of rows and the second is the number of columns 
print(df_newdata.shape)

           Song Name           Artist  ...    tempo  time_signature
0            Rainbow  Kacey Musgraves  ...  140.070               4
1        Butterflies  Kacey Musgraves  ...  147.942               4
2          Slow Burn  Kacey Musgraves  ...  165.981               4
3         High Horse  Kacey Musgraves  ...  181.075               4
4       Space Cowboy  Kacey Musgraves  ...   78.532               4
5        Golden Hour  Kacey Musgraves  ...  150.033               4
6   Oh, What A World  Kacey Musgraves  ...  181.050               4
7  Follow Your Arrow  Kacey Musgraves  ...  126.022               4
8        Happy & Sad  Kacey Musgraves  ...  127.968               4
9     Lonely Weekend  Kacey Musgraves  ...  162.013               4

[10 rows x 14 columns]
['Song Name' 'Artist' 'danceability' 'energy' 'key' 'loudness' 'mode'
 'speechiness' 'acousticness' 'instrumentalness' 'liveness' 'valence'
 'tempo' 'time_signature']
(50, 14)


In [None]:
#Remove Artist and Song Name for Feature Selection
df_newdata = df_newdata.drop(['Artist','Song Name','key','time_signature'], 1)


df_sample1 = df_newdata.copy()



# Separate all the variables into two lists for future column indexing
# One for numerical, the other for categorical 
cvar_list = ['mode']
nvar_list = ['danceability', 'energy', 'loudness', 'speechiness', 
             'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

In [None]:
# Part 4 Variable transformation

# Standardize the numerical variables 
df_sample5 = df_sample1.copy()
df_sample5[nvar_list] = (df_sample1[nvar_list] - df_sample1[nvar_list].mean())/df_sample1[nvar_list].std()

# Set the datatype for the variables in the cvar_list to be categorical in Python
# Set the datatype for the variables in the nvar_list to be numerical in Python 
df_sample6 = df_sample5.copy()
df_sample6[cvar_list] = df_sample5[cvar_list].astype('category')
df_sample6[nvar_list] = df_sample5[nvar_list].astype('float64')

#For kNN model numerical values need to be standardized
from sklearn.preprocessing import StandardScaler

df_sample6[nvar_list] = StandardScaler().fit_transform(df_sample6[nvar_list])

# Convert the categorical variables into dummies (Step 1 of dummy coding)
df_sample7 = df_sample6.copy()
df_sample7 = pd.get_dummies(df_sample6, prefix_sep='_')

# Remove the redundant dummies (Step 2 of dummy coding)
# Placeholder variable: rdummies
df_sample8 = df_sample7.copy()


# Get the remaining variable list after the variable transformation
print(df_sample8.columns.values)

# Display the milestone dataframe. Compare it with the original dataframe.
print(df_sample8)
print(df_newdata)

['danceability' 'energy' 'loudness' 'speechiness' 'acousticness'
 'instrumentalness' 'liveness' 'valence' 'tempo' 'mode_0' 'mode_1']
    danceability    energy  loudness  ...     tempo  mode_0  mode_1
0       1.044788  0.563563 -0.116199  ... -0.743778       0       1
1      -0.025214  1.726169  0.698260  ... -0.864746       0       1
2      -0.787988 -0.633491 -0.533926  ...  1.535041       1       0
3       1.203699  1.063053  0.558300  ... -0.895620       0       1
4       1.627462 -0.555984  0.273536  ...  0.258496       0       1
5      -0.650265  1.553931  0.678343  ...  1.779109       1       0
6       1.479145  0.038237  0.768240  ... -0.897051       0       1
7      -1.444820 -0.375134  0.564222  ...  1.666422       0       1
8       0.080727 -0.185673  0.014071  ...  1.663986       0       1
9       0.705778 -0.444029 -1.639610  ... -0.865873       1       0
10     -2.080465 -0.599044  0.225626  ... -1.144894       1       0
11      1.097758 -1.132981  0.355358  ... -0.773647

In [None]:
X = df_sample8

df_sample8['Approved'] = clf.predict(X)

df_sample8['Approved']

0     0
1     0
2     0
3     0
4     1
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    1
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
43    0
44    0
45    0
46    0
47    1
48    0
49    0
Name: Approved, dtype: uint8

In [None]:
Pass_rate = len([i for i in df_sample8['Approved'] if i == 1])/len(df_sample8['Approved'])
Song_Count = len([i for i in df_sample8['Approved'] if i == 1])
Pass_rate
Song_Count

3

In [None]:
import csv

with open('This is Kacey Musgraves_Brazil.csv','a') as f:
    writer = csv.writer(f)
    writer.writerow([Song_Count])
    writer.writerow([Pass_rate])