### Few Hit Wonders versus Consistent Hitters
##### Joseph Scarpa (jcs140)

In [None]:
# Imports (update throughout as necessary)
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt # just to format confusion matrix

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

import re

import scipy
from scipy import stats

#### Data loading and cleaning

Notes:
* No null values found
* No duplicate values found

In [None]:
# Data loading
# One CSV per decade, read in the same order as the target names on the left.
data10, data00, data90, data80, data70, data60 = map(
    pd.read_csv,
    (
        './datasets/dataset-of-10s.csv',
        './datasets/dataset-of-00s.csv',
        './datasets/dataset-of-90s.csv',
        './datasets/dataset-of-80s.csv',
        './datasets/dataset-of-70s.csv',
        './datasets/dataset-of-60s.csv',
    ),
)

In [None]:
# Data cleaning
#
# For every decade frame: discard rows containing any missing value, then tag
# the surviving rows with a constant 'decade' column (used once the frames are
# combined into the cumulative dataset). The column is built as a list with one
# entry per remaining row.
data10 = data10.dropna().assign(decade=lambda frame: [2010] * len(frame))
data00 = data00.dropna().assign(decade=lambda frame: [2000] * len(frame))
data90 = data90.dropna().assign(decade=lambda frame: [1990] * len(frame))
data80 = data80.dropna().assign(decade=lambda frame: [1980] * len(frame))
data70 = data70.dropna().assign(decade=lambda frame: [1970] * len(frame))
data60 = data60.dropna().assign(decade=lambda frame: [1960] * len(frame))

In [None]:
# Create cumulative dataset (from songs across all decades).
# pd.concat replaces the chained DataFrame.append calls: append was deprecated
# in pandas 1.4 and removed in pandas 2.0. ignore_index=True renumbers the rows
# 0..N-1 exactly as the chained appends did.
decade_frames = [data10, data00, data90, data80, data70, data60]
data = pd.concat(decade_frames, ignore_index=True)

# Check for duplicates in dataset.
# A chain of inner merges on ['track', 'artist'] can only surface songs that
# appear in *all six* decade files, so an empty result does not show that the
# data is duplicate-free. Instead, flag every row whose (track, artist) pair
# occurs more than once anywhere in the combined frame; keep=False marks all
# members of a repeated pair, not just the later ones.
duplicate_mask = data.duplicated(subset=['track', 'artist'], keep=False)
merged = data[duplicate_mask].sort_values(['artist', 'track'])
print("Rows sharing a (track, artist) pair with another row:", len(merged))
# NOTE: duplicates are only reported here, not dropped, so every later cell
# still sees the same rows as before. Inspect `merged` before claiming the
# dataset has no duplicates.

# Only the last expression of a cell is rendered, so show the tail of the
# cumulative dataset.
data.tail()

Unnamed: 0,track,artist,uri,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,chorus_hit,sections,target,decade
41101,Le ire di Ursus,Giampiero Boneschi,spotify:track:3dVgolCVuNgecU5jovpc45,0.42,0.873,3,-11.689,1,0.1,0.808,0.879,0.363,0.183,119.821,26244,4,0.0,2,0,1960
41102,A-B-C,John Zacherle,spotify:track:0zXhzKf4raEYstebL7L42C,0.716,0.742,10,-5.727,0,0.242,0.663,0.0,0.107,0.834,135.104,142987,4,28.34316,10,0,1960
41103,Useless Landscape,Antônio Carlos Jobim,spotify:track:2uC02PCRVM6BhAaK7dgxxp,0.361,0.0944,10,-22.663,1,0.0327,0.94,0.000178,0.115,0.139,92.472,137200,4,56.63498,6,0,1960
41104,Vendedor De Assaí,Ary Lobo,spotify:track:0gpSUWZWNIRrEenaC1ZGk9,0.452,0.759,4,-7.821,0,0.175,0.658,0.0,0.738,0.853,137.976,159107,3,28.24372,9,0,1960
41105,Mambo Simphony - Remasterizado,Orquesta Casino De La Playa,spotify:track:5j0G0nkCSPlmSCSgXchMn3,0.554,0.278,5,-8.386,1,0.0349,0.928,0.922,0.136,0.428,131.401,164227,4,37.39251,8,0,1960


In [None]:
# Add custom columns that will be useful later

# Song length: duration_ms scaled by 1/1000 and rounded to whole seconds
data["duration_sec"] = data["duration_ms"].mul(1 / 1000).round()

# Loudness rounded to the nearest whole number
data["loudness_rounded"] = data["loudness"].round()

# Final expression of the cell: renders the new rounded-loudness column
data["loudness_rounded"]

0        -5.0
1       -15.0
2        -7.0
3        -6.0
4        -4.0
         ... 
41101   -12.0
41102    -6.0
41103   -23.0
41104    -8.0
41105    -8.0
Name: loudness_rounded, Length: 41106, dtype: float64

In [None]:
# For later analysis of genres. Below data is from spotify playlists.
# Snapshot the cumulative dataset to disk before the genre columns are added.
data.to_csv("./data.csv")

# For a clean copy for genre analysis (taken before the genre flags exist)
cData = data.copy()
rData = data.copy()

# Load the Genre Songs from the csv into respective dataframes
christmas_songs = pd.read_csv('./datasets/christmas_hits.csv')
country_songs = pd.read_csv('./datasets/country_hits.csv')
rb_songs = pd.read_csv('./datasets/rb_hits.csv')
metal_songs = pd.read_csv('./datasets/metal_hits.csv')
hiphop_songs = pd.read_csv('./datasets/hiphop_hits.csv')


def _in_playlist(songs, playlist):
    """Flag the rows of `songs` whose (track, artist) pair is listed in `playlist`.

    Parameters
    ----------
    songs : DataFrame with "track" and "artist" columns.
    playlist : DataFrame with "Track name" and " Artist name" columns
        (the leading space in " Artist name" is part of the CSV header).

    Returns
    -------
    Boolean Series aligned to `songs.index`.

    Track and artist are compared together as one pair. Testing them
    independently would also flag a row whose title matches one playlist entry
    and whose artist matches a different one. The playlist pairs are collected
    into a set once, so each row costs a single hash lookup instead of two list
    scans over freshly rebuilt lists.
    """
    playlist_pairs = set(zip(playlist["Track name"], playlist[" Artist name"]))
    flags = [pair in playlist_pairs for pair in zip(songs["track"], songs["artist"])]
    return pd.Series(flags, index=songs.index, dtype=bool)


# Add a boolean value column per genre indicating whether the song is in the corresponding Spotify playlist,
# e.g. "Christmas Hits" by Spotify (https://open.spotify.com/playlist/37i9dQZF1DX0Yxoavh5qJV?si=96f9678a2982444d)
genre_playlists = {
    'christmas': christmas_songs,
    'country': country_songs,
    'rb': rb_songs,
    'metal': metal_songs,
    'hiphop': hiphop_songs,
}
for genre_column, playlist in genre_playlists.items():
    data[genre_column] = _in_playlist(data, playlist)

# Number of dataset songs matched to each playlist
print("Christmas: ", int(data['christmas'].sum()))
print("Country: ", int(data['country'].sum()))
print("RB: ", int(data['rb'].sum()))
print("Metal: ", int(data['metal'].sum()))
print("Hip-Hop: ", int(data['hiphop'].sum()))


Christmas:  22
Country:  27
RB:  10
Metal:  9
Hip-Hop:  8


#### General statistics
Means, medians, modes, and standard deviations of each column, computed for each individual decade and for the overall dataset

#### Cumulative statistical analysis
Analysis of the relationship between each column and whether a song is a 'hit' or a 'flop', across all decades combined


#### Decade-related statistical analysis

Analysis of properties of songs and how they vary by decade

#### Genre-Specific Hits statistical analysis
Analysis of how the properties of hit songs differ across genres/categories of songs. Includes hypothesis tests and bootstrapping.


In [None]:
# Analysis of proportion of mode=1 for christmas hits and non-christmas hits
# Note that a mode of 1 implies "happy" songs as they are on the major scale.
# Note that a mode of 0 implies "sad" songs as they are on the minor scale

# Null hypothesis: True Proportion of happy in Christmas = True Proportion of happy in non-christmas
# Alternative hypothesis: True Proportion of happy in Christmas > True Proportion of happy in non-christmas

# Split the dataset by the 'christmas' playlist flag ...
mask_christmas = data["christmas"] == True
mask_non_christmas = data["christmas"] == False

common_christmas_songs = data[mask_christmas]
common_non_christmas_songs = data[mask_non_christmas]

# ... and keep only the hits (target == 1) on each side.
christmas_hits = common_christmas_songs[common_christmas_songs["target"] == 1]
non_christmas_hits = common_non_christmas_songs[common_non_christmas_songs["target"] == 1]

mask_christmas_mode = christmas_hits["mode"] == 1
mask_non_christmas_mode = non_christmas_hits["mode"] == 1

# Observed share of mode == 1 among Christmas hits
proportion_happy_christmas = len(christmas_hits[mask_christmas_mode])/len(christmas_hits["mode"])

# Bootstrap: draw num_bootstrap resamples of non-Christmas hits, each the same
# size as the Christmas group. Because mode is 0/1, a row mean is that
# resample's share of mode == 1. A locally seeded generator makes the printed
# p-value reproducible across runs; the seed value itself is arbitrary.
num_bootstrap = 10000
bootstrap_rng = np.random.default_rng(42)
bootstrap_samples = bootstrap_rng.choice(non_christmas_hits["mode"].to_numpy(), size=(num_bootstrap, len(christmas_hits["mode"])), replace=True)
sampling_proportions_non_christmas = np.average(bootstrap_samples, axis=1)
non_christmas_proportion_avg = np.average(sampling_proportions_non_christmas)

# Standard deviation of a 0/1 variable with success probability p: sqrt(p * (1 - p))
sd_christmas = np.sqrt(proportion_happy_christmas*(1-proportion_happy_christmas))
sd_non_christmas = np.sqrt(non_christmas_proportion_avg*(1-non_christmas_proportion_avg))

# NOTE(review): nobs2 reuses the Christmas sample size, matching the size of
# each bootstrap resample -- confirm this is the intended design.
newpval_christmas = stats.ttest_ind_from_stats(mean1=proportion_happy_christmas, std1=sd_christmas, nobs1=len(christmas_hits["mode"]), mean2=non_christmas_proportion_avg, std2=sd_non_christmas, nobs2=len(christmas_hits["mode"]))

# One-sided p-value for "Christmas > non-Christmas". Halving the two-sided
# p-value is only valid when the statistic points in the direction of the
# alternative (statistic > 0, since Christmas is group 1); otherwise the
# one-sided p-value is the complement.
two_sided_p_christmas = newpval_christmas[1]
if newpval_christmas[0] > 0:
    p_value_bootstrapping_christmas = two_sided_p_christmas/2
else:
    p_value_bootstrapping_christmas = 1 - two_sided_p_christmas/2
print(p_value_bootstrapping_christmas)

# Note: in the recorded run the halved two-sided p value was approx. 0.112. Whichever way the statistic points,
# the one-sided p value is then at least that large, which is greater than the alpha level of 0.05, so we fail to reject the null hypothesis.

0.11237203979909181


In [None]:
# Analysis of the Instrumentalness of country songs
# Null Hypothesis: Mean instrumentalness of Country = Mean instrumentalness of non-country
# Alternative Hypothesis: Mean instrumentalness of Country > Mean instrumentalness of non-country

# Split the dataset by the 'country' playlist flag ...
mask_country = data["country"] == True
mask_non_country = data["country"] == False

common_country_songs = data[mask_country]
common_non_country_songs = data[mask_non_country]

# ... and keep only the hits (target == 1) on each side.
country_hits = common_country_songs[common_country_songs["target"]==1]
non_country_hits = common_non_country_songs[common_non_country_songs["target"]==1]

# Observed mean instrumentalness of country hits
mean_instrumental_country = np.average(country_hits["instrumentalness"])

# Bootstrap: draw num_bootstrap resamples of non-country hits, each the same
# size as the country group, and average the resample means. A locally seeded
# generator makes the printed p-value reproducible across runs; the seed value
# itself is arbitrary.
num_bootstrap = 10000
bootstrap_rng = np.random.default_rng(42)
bootstrap_samples_country = bootstrap_rng.choice(non_country_hits["instrumentalness"].to_numpy(), size=(num_bootstrap, len(country_hits["instrumentalness"])), replace=True)
sampling_proportions_non_country = np.average(bootstrap_samples_country, axis=1)
mean_instrumental_non_country = np.average(sampling_proportions_non_country)

country_sd = np.std(country_hits["instrumentalness"])
non_country_sd = np.std(non_country_hits["instrumentalness"])

# NOTE(review): nobs2 reuses the country sample size, matching the size of each
# bootstrap resample -- confirm this is the intended design.
newpval_country = stats.ttest_ind_from_stats(mean1=mean_instrumental_country, std1=country_sd, nobs1=len(country_hits["instrumentalness"]), mean2=mean_instrumental_non_country, std2=non_country_sd, nobs2=len(country_hits["instrumentalness"]))

# One-sided p-value for "country > non-country". Halving the two-sided p-value
# is only valid when the statistic points in the direction of the alternative
# (statistic > 0, since country is group 1); otherwise the one-sided p-value is
# the complement.
two_sided_p_country = newpval_country[1]
if newpval_country[0] > 0:
    p_value_bootstrapping_country = two_sided_p_country/2
else:
    p_value_bootstrapping_country = 1 - two_sided_p_country/2
print(p_value_bootstrapping_country)

# Note: in the recorded run the halved two-sided p value was approx. 0.114. Whichever way the statistic points,
# the one-sided p value is then at least that large, which is greater than the alpha level of 0.05, so we fail to reject the null hypothesis.

0.113792548405481


In [None]:
# Analysis of the tempo of Instrumental (and non-instrumental) songs
# Two-tailed test
# Null Hypothesis: Mean tempo of instrumental songs = Mean tempo of non instrumental songs
# Alternative Hypothesis: Mean tempo of instrumental songs not equal to mean tempo of non-instrumental songs

# A song "isInstrumental" if the instrumentalness is greater than 0.5
# (vectorised comparison; yields the same boolean column as a row-wise apply)
data['isInstrumental'] = data["instrumentalness"] > 0.5
mask_instrumental = data["isInstrumental"] == True
mask_non_instrumental = data["isInstrumental"] == False

common_instrumental_songs = data[mask_instrumental]
common_non_instrumental_songs = data[mask_non_instrumental]

# Keep only the hits (target == 1) on each side.
instrumental_hits = common_instrumental_songs[common_instrumental_songs["target"]==1]
non_instrumental_hits = common_non_instrumental_songs[common_non_instrumental_songs["target"]==1]

# Observed mean tempo of instrumental hits
mean_tempo_instrumental = np.average(instrumental_hits["tempo"])

# Bootstrap: draw num_bootstrap resamples of non-instrumental hits, each the
# same size as the instrumental group, and average the resample means. A
# locally seeded generator makes the printed p-value reproducible across runs;
# the seed value itself is arbitrary.
num_bootstrap = 10000
bootstrap_rng = np.random.default_rng(42)
bootstrap_samples_instrumental = bootstrap_rng.choice(non_instrumental_hits["tempo"].to_numpy(), size=(num_bootstrap, len(instrumental_hits["tempo"])), replace=True)
sampling_proportions_non_instrumental = np.average(bootstrap_samples_instrumental, axis=1)
mean_tempo_non_instrumental = np.average(sampling_proportions_non_instrumental)

instrumental_sd = np.std(instrumental_hits["tempo"])
non_instrumental_sd = np.std(non_instrumental_hits["tempo"])

# NOTE(review): nobs2 reuses the instrumental sample size, matching the size of
# each bootstrap resample -- confirm this is the intended design.
newpval_instrumental = stats.ttest_ind_from_stats(mean1=mean_tempo_instrumental, std1=instrumental_sd, nobs1=len(instrumental_hits["tempo"]), mean2=mean_tempo_non_instrumental, std2=non_instrumental_sd, nobs2=len(instrumental_hits["tempo"]))

# Two-tailed test, so the two-sided p-value is used as-is (no halving).
p_value_bootstrapping_instrumental = newpval_instrumental[1]
print(p_value_bootstrapping_instrumental)

# Note: in the recorded run the p value was approx. 0.510, which is greater than the alpha level of 0.05, so we fail to reject the null hypothesis.

0.5101679643039706


#### One-hit wonders and Consistent Hitters statistical analysis

Analysis of properties of songs by artists with one-hit wonders vs. artists with multiple hit songs

#### Regression statistical analysis

#### Backwards regression statistical analysis

In [None]:
#### Backwards regression statistical analysis

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=162d76af-48a2-4f75-9aef-34643a48aa39' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>