### Holiday Statistics Analysis
#### Sharan Sokhi (ss1148)

In [None]:
# Imports (update throughout as necessary)
import numpy as np
import pandas as pd
import seaborn as sns
import math
import matplotlib.pyplot as plt
pd.options.mode.chained_assignment = None

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import plot_confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt # just to format confusion matrix

from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

import re
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest

#### Data loading and cleaning

Notes:
* No null values found
* No duplicate values found

In [None]:
# Data loading and cleaning
#
# Each decade's CSV is read, rows containing any missing value are dropped, and
# a constant `decade` column is attached so every row keeps its origin once the
# six frames are stacked into one cumulative dataset.
data10 = pd.read_csv('./datasets/dataset-of-10s.csv').dropna().assign(decade=2010)
data00 = pd.read_csv('./datasets/dataset-of-00s.csv').dropna().assign(decade=2000)
data90 = pd.read_csv('./datasets/dataset-of-90s.csv').dropna().assign(decade=1990)
data80 = pd.read_csv('./datasets/dataset-of-80s.csv').dropna().assign(decade=1980)
data70 = pd.read_csv('./datasets/dataset-of-70s.csv').dropna().assign(decade=1970)
data60 = pd.read_csv('./datasets/dataset-of-60s.csv').dropna().assign(decade=1960)

# Create cumulative dataset (from songs across all decades).
# pd.concat is used because DataFrame.append was removed in pandas 2.0; with
# ignore_index=True the result is re-indexed 0..n-1 exactly as before.
data = pd.concat(
    [data10, data00, data90, data80, data70, data60],
    ignore_index=True,
)

# Check for duplicates in dataset.
# `merged` holds every row whose (track, artist) pair occurs more than once in
# the cumulative dataset; an empty frame means there are no duplicates.
# (Chaining inner merges over all six decades could only ever surface songs
# present in *every* decade, and the repeated `_x`/`_y` suffixes make the
# chained merge fail on current pandas.)
# Rows are only reported here, never dropped, so downstream row counts are
# unaffected.
merged = data[data.duplicated(subset=['track', 'artist'], keep=False)]

# Add custom columns that will be useful later

# Duration of song in seconds (rounded to the nearest whole second)
data["duration_sec"] = np.round(data["duration_ms"]*(1/1000))

# Loudness of sound (rounded to the nearest whole unit)
data["loudness_rounded"] = np.round(data["loudness"])

# Last expression of the cell: shown as the cell output.
data["loudness_rounded"]


0        -5.0
1       -15.0
2        -7.0
3        -6.0
4        -4.0
         ... 
41101   -12.0
41102    -6.0
41103   -23.0
41104    -8.0
41105    -8.0
Name: loudness_rounded, Length: 41106, dtype: float64

In [None]:
# Normalise artist names so they can be compared with the playlist CSVs.

# Strip any parenthesised text. The match is greedy: everything from the first
# "(" to the last ")" is removed. regex=True is passed explicitly because
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently leave the names unchanged.
data['artist'] = data['artist'].str.replace(r"\(.*\)", "", regex=True)

# Keep only the text before the first " feat" (any letter case), i.e. drop the
# featured-artist suffix. A single vectorised map replaces the per-row chained
# assignment `data['artist'][i] = ...`, which is slow and depends on the index
# being exactly 0..n-1.
_feat_pattern = re.compile(" [fF][eE][aA][tT]")
data['artist'] = data['artist'].map(
    lambda name: _feat_pattern.split(name, maxsplit=1)[0]
)

  """Entry point for launching an IPython kernel.


In [None]:
# Read the two genre playlist exports (Christmas hits, country hits) into
# their own dataframes, in that order.
christmas_songs, country_songs = (
    pd.read_csv(csv_path)
    for csv_path in (
        './datasets/christmas_hits.csv',
        './datasets/country_hits.csv',
    )
)


In [None]:
# Add boolean columns indicating whether each song is in a genre playlist.
# 'christmas' refers to the playlist "Christmas Hits" by Spotify
# (https://open.spotify.com/playlist/37i9dQZF1DX0Yxoavh5qJV?si=96f9678a2982444d);
# 'country' is the analogous flag built from the country hits CSV.


def _playlist_pairs(playlist):
    """Return the set of (track, artist) pairs listed in a playlist frame.

    The playlist CSV headers are "Track name" and " Artist name" (the latter
    with a leading space, as it appears in the file).
    """
    return set(zip(playlist["Track name"], playlist[" Artist name"]))


# A song counts as a member only when its (track, artist) *pair* is on the
# playlist. Testing track and artist independently would also flag a playlist
# track title recorded by a different artist who happens to be on the playlist
# with another song. The lookup sets are built once instead of rebuilding two
# lists for every row.
_song_pairs = list(zip(data["track"], data["artist"]))
_christmas_pairs = _playlist_pairs(christmas_songs)
_country_pairs = _playlist_pairs(country_songs)

data['christmas'] = [pair in _christmas_pairs for pair in _song_pairs]
data['country'] = [pair in _country_pairs for pair in _song_pairs]

print("Christmas: ", int(data['christmas'].sum()))
print("Country: ", int(data['country'].sum()))


Christmas:  22
Country:  27


In [None]:
# Compare the share of songs with mode == 1 (treated below as "happy") among
# Christmas hits against the same share among non-Christmas hits.

# Split the cumulative dataset on the Christmas-playlist flag.
mask_christmas = data["christmas"].eq(True)
mask_non_christmas = data["christmas"].eq(False)

common_christmas_songs = data.loc[mask_christmas]
# Side effect: the matched Christmas rows are written out for inspection.
common_christmas_songs.to_csv("./c_christmas.csv")
common_non_christmas_songs = data.loc[mask_non_christmas]

# Restrict both groups to hits (target == 1).
christmas_hits = common_christmas_songs.loc[common_christmas_songs["target"].eq(1)]
non_christmas_hits = common_non_christmas_songs.loc[common_non_christmas_songs["target"].eq(1)]

mask_christmas_mode = christmas_hits["mode"].eq(1)
mask_non_christmas_mode = non_christmas_hits["mode"].eq(1)

# Observed proportion of mode == 1 songs among the Christmas hits.
n_christmas_hits = len(christmas_hits)
proportion_happy_christmas = int(mask_christmas_mode.sum()) / n_christmas_hits

# Bootstrap the non-Christmas hits: draw `num_bootstrap` resamples (with
# replacement), each the same size as the Christmas-hit group, and average the
# per-resample means of "mode".
# NOTE(review): no random seed is set, so the printed p-value varies slightly
# between runs.
num_bootstrap = 10000
bootstrap_samples = np.random.choice(
    non_christmas_hits["mode"],
    size=(num_bootstrap, n_christmas_hits),
    replace=True,
)
sampling_proportions = bootstrap_samples.mean(axis=1)
non_christmas_proportion_avg = sampling_proportions.mean()

# Standard deviation of a 0/1 variable with success probability p: sqrt(p(1-p)).
sd_christmas = np.sqrt(proportion_happy_christmas * (1 - proportion_happy_christmas))
sd_non_christmas = np.sqrt(non_christmas_proportion_avg * (1 - non_christmas_proportion_avg))

# Two-sample t-test from summary statistics; both groups are given the
# Christmas-hit sample size, matching the size of each bootstrap resample.
newpval = stats.ttest_ind_from_stats(
    mean1=proportion_happy_christmas,
    std1=sd_christmas,
    nobs1=n_christmas_hits,
    mean2=non_christmas_proportion_avg,
    std2=sd_non_christmas,
    nobs2=n_christmas_hits,
)

# Halve the two-sided p-value to obtain a one-sided one.
# NOTE(review): halving is only valid when the observed difference points in
# the direction of the alternative hypothesis - confirm the intended direction.
p_value_bootstrapping = newpval[1] / 2
print(p_value_bootstrapping)


0.10876236860567981


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=162d76af-48a2-4f75-9aef-34643a48aa39' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>