In [1]:
import pandas as pd
import numpy as np
import time
from scipy.stats import sem
from scipy.stats import norm
from scipy.stats import pearsonr
from scipy.stats import spearmanr
from pingouin import partial_corr
import matplotlib.pyplot as plt


In [5]:
# Load the CSV that the analysis functions below read through the global `df`.
# NOTE(review): this is an absolute, machine-specific path; it will only
# resolve where this exact directory layout exists.
df = pd.read_csv(
    "/Users/jamesdoucette/Desktop/Schoolwork/data-science-project/genre+release.csv"
)

In [4]:
# Columns treated as continuous measurements in the correlation functions.
continuous_vars = [
    'acousticness',
    'danceability',
    'duration_ms',
    'energy',
    'instrumentalness',
    'liveness',
    'loudness',
    'speechiness',
    'tempo',
    'valence',
]

# Columns treated as categorical and compared value-by-value.
discrete_vars = [
    'key',
    'mode',
    'time_signature',
]

# Other columns referenced by the analysis: popularity, genre, year.


In [3]:
def z_test(data1, data2, two_sided=False):
    """Z-test on the difference between the means of two samples.

    Parameters
    ----------
    data1, data2 : array-like of numbers
        The two samples whose means are compared.
    two_sided : bool, default False
        When False (the original behaviour) the returned p-value is the
        upper-tail probability of the absolute z statistic. When True that
        probability is doubled, which is the appropriate p-value when the
        direction of the difference was not fixed in advance.

    Returns
    -------
    list
        ``[mean1 > mean2, mean1, mean2, p]``.
    """
    mean1 = np.mean(data1)
    mean2 = np.mean(data2)
    diff = np.abs(mean1 - mean2)
    # Standard error of the difference: the two standard errors of the mean
    # combined in quadrature.
    se = np.sqrt(sem(data1)**2 + sem(data2)**2)
    # norm.sf(z) is the survival function 1 - cdf(z). Writing it as
    # `1 - norm.cdf(z)` cancels to exactly 0.0 once z is larger than about 8,
    # so very small p-values would be reported as 0.
    p = norm.sf(diff / se)
    if two_sided:
        p = 2 * p
    return [mean1 > mean2, mean1, mean2, p]

In [2]:
def pearson(var1,var2):
    """Run ``scipy.stats.pearsonr`` on two columns of the global ``df``.

    ``var1`` and ``var2`` are column labels; the result of ``pearsonr`` is
    returned unchanged.
    """
    first_column = df[var1]
    second_column = df[var2]
    return pearsonr(first_column, second_column)

def spearman(var1,var2):
    """Run ``scipy.stats.spearmanr`` on two columns of the global ``df``.

    ``var1`` and ``var2`` are column labels; the result of ``spearmanr`` is
    returned unchanged.
    """
    first_column = df[var1]
    second_column = df[var2]
    return spearmanr(first_column, second_column)

def print_corr(var1,var2):
    """Print the Spearman correlation between two columns of ``df``.

    Prints ``var1`` on one line, then the Spearman coefficient and its
    p-value on the next, followed by a blank line. Returns ``None``.
    """
    # Only the Spearman result is reported, so the Pearson coefficient is
    # deliberately not computed here.
    s = spearman(var1, var2)
    print(var1)
    print(f'Spearmanr {s[0]} with p value {s[1]}\n')
    
def get_genre(i):
    """Return the genre name stored at position ``i`` of the genre table.

    Standard sequence indexing applies, so an out-of-range ``i`` raises
    ``IndexError``.
    """
    genre_names = (
        'Movie', 'R&B', 'A Capella', 'Alternative', 'Country', 'Dance',
        'Electronic', 'Anime', 'Folk', 'Blues', 'Opera', 'Hip-Hop',
        "Children's Music", 'Rap', 'Indie', 'Classical', 'Pop', 'Reggae',
        'Reggaeton', 'Jazz', 'Rock', 'Ska', 'Comedy', 'Soul', 'Soundtrack',
        'World',
    )
    return genre_names[i]
    

def population_correlations():
    """Print the Spearman correlation of each feature with ``popularity``.

    Covers every column in ``continuous_vars`` and then ``year``, with no
    covariates controlled for. Output goes to stdout via ``print_corr``.
    """
    # The target column is 'popularity'; the heading names it accordingly.
    print('Correlation with popularity, no control')
    for var in continuous_vars + ['year']:
        print_corr(var, 'popularity')

    # NOTE(review): this heading is printed but nothing is reported under it;
    # confirm whether the discrete-variable tests were meant to run here.
    print('Discrete Variables')
    
def population_partial_correlations():
    """Print Spearman partial correlations with ``popularity``, controlling for ``year``.

    For each column in ``continuous_vars`` this calls ``partial_corr`` on
    the global ``df`` and prints the 'r' and 'p-val' entries of the first
    row of the result.
    """
    # The target column is 'popularity'; the heading names it accordingly.
    print('Correlation with popularity, controlling for release year')
    for var in continuous_vars:
        p = partial_corr(df, x=var, y='popularity', covar='year', method='spearman')
        print(var)
        print(f"Correlation coefficient {p['r'].values[0]} with p value {p['p-val'].values[0]}\n")
        
def control_genre():
    """Print Spearman partial correlations with ``popularity``.

    The covariates are the ``year`` and ``genrenum`` columns of the global
    ``df``; one result is printed per column in ``continuous_vars``.
    """
    print('Controlling for release year and numerical genre')
    for feature in continuous_vars:
        stats = partial_corr(
            df,
            x=feature,
            y='popularity',
            covar=['year', 'genrenum'],
            method='spearman',
        )
        print(feature)
        # First (only inspected) row of the result table.
        coefficient = stats['r'].values[0]
        p_value = stats['p-val'].values[0]
        print(f"Correlation coefficient {coefficient} with p value {p_value}\n")
        
def all_genres():
    """Print per-genre Spearman partial correlations with ``popularity``.

    For each of the 26 genre indices, the rows of the global ``df`` whose
    ``genre`` equals that index are selected, and the partial correlation
    of every column in ``continuous_vars`` with ``popularity`` (controlling
    for ``year``) is printed.
    """
    print('Controlling by release year, separating by genre')
    for i in range(26):
        print(f'Genre {get_genre(i)}')
        # The row selection depends only on the genre, so do it once per
        # genre rather than once per variable.
        # NOTE(review): this compares the 'genre' column with an integer,
        # while control_genre() uses a 'genrenum' column -- confirm which
        # column actually holds the integer genre code.
        this_genre = df[df['genre'] == i]
        for var in continuous_vars:
            p = partial_corr(this_genre, x=var, y='popularity', covar='year', method='spearman')
            print(var)
            print(f"Correlation coefficient {p['r'].values[0]} with p value {p['p-val'].values[0]}\n")
        
def year_correlations():
    """Print the Spearman correlation of each continuous column with ``year``.

    Iterates over ``continuous_vars`` and delegates the printing of each
    result to ``print_corr``.
    """
    print('Continuous Variables')
    for feature in continuous_vars:
        print_corr(feature, 'year')
        
def discrete():
    """Z-test ``popularity`` for each value of every discrete column.

    For every column in ``discrete_vars`` and every distinct value it takes,
    the ``popularity`` of rows with that value is compared against the
    ``popularity`` of all other rows using ``z_test``; the value and the
    test result are printed.
    """
    print('Testing discretes')
    for var in discrete_vars:
        vals = df[var].unique()
        print(var)
        for val in vals:
            # z_test needs one numeric sample per group, so select the
            # 'popularity' column instead of passing the whole row subsets.
            positive = df[df[var] == val]['popularity']
            negative = df[df[var] != val]['popularity']
            test = z_test(positive, negative)
            print(val, test)

        

        
    

In [None]:
# For every distinct key, z-test the popularity of tracks in that key
# against the popularity of all remaining tracks.
key_column = df['key']
for key_value in key_column.unique():
    in_key = df[key_column == key_value]['popularity']
    not_in_key = df[key_column != key_value]['popularity']
    print(key_value, z_test(in_key, not_in_key))

In [71]:
# NOTE(review): the stored output below shows identical coefficients for every
# genre, equal to the whole-dataset values printed by
# population_partial_correlations(); it appears stale -- re-run to confirm.
all_genres()

Controlling by release year, separating by genre
Genre Movie
acousticness
Correlation coefficient -0.3086195038499765 with p value 0.0

danceability
Correlation coefficient 0.2281512082213368 with p value 0.0

duration_ms
Correlation coefficient 0.10424085934014923 with p value 0.0

energy
Correlation coefficient 0.18333162273962136 with p value 0.0

instrumentalness
Correlation coefficient -0.06016981039577464 with p value 1.8562150582052655e-140

liveness
Correlation coefficient -0.11311109212112347 with p value 0.0

loudness
Correlation coefficient 0.30275576368381146 with p value 0.0

speechiness
Correlation coefficient -0.052369972113748324 with p value 7.759805428226346e-107

tempo
Correlation coefficient 0.09214358467533393 with p value 0.0

valence
Correlation coefficient 0.0707518096506821 with p value 1.4755704956873983e-193

Genre R&B
acousticness
Correlation coefficient -0.3086195038499765 with p value 0.0

danceability
Correlation coefficient 0.2281512082213368 with p valu

energy
Correlation coefficient 0.18333162273962136 with p value 0.0

instrumentalness
Correlation coefficient -0.06016981039577464 with p value 1.8562150582052655e-140

liveness
Correlation coefficient -0.11311109212112347 with p value 0.0

loudness
Correlation coefficient 0.30275576368381146 with p value 0.0

speechiness
Correlation coefficient -0.052369972113748324 with p value 7.759805428226346e-107

tempo
Correlation coefficient 0.09214358467533393 with p value 0.0

valence
Correlation coefficient 0.0707518096506821 with p value 1.4755704956873983e-193

Genre Hip-Hop
acousticness
Correlation coefficient -0.3086195038499765 with p value 0.0

danceability
Correlation coefficient 0.2281512082213368 with p value 0.0

duration_ms
Correlation coefficient 0.10424085934014923 with p value 0.0

energy
Correlation coefficient 0.18333162273962136 with p value 0.0

instrumentalness
Correlation coefficient -0.06016981039577464 with p value 1.8562150582052655e-140

liveness
Correlation coefficie

tempo
Correlation coefficient 0.09214358467533393 with p value 0.0

valence
Correlation coefficient 0.0707518096506821 with p value 1.4755704956873983e-193

Genre Ska
acousticness
Correlation coefficient -0.3086195038499765 with p value 0.0

danceability
Correlation coefficient 0.2281512082213368 with p value 0.0

duration_ms
Correlation coefficient 0.10424085934014923 with p value 0.0

energy
Correlation coefficient 0.18333162273962136 with p value 0.0

instrumentalness
Correlation coefficient -0.06016981039577464 with p value 1.8562150582052655e-140

liveness
Correlation coefficient -0.11311109212112347 with p value 0.0

loudness
Correlation coefficient 0.30275576368381146 with p value 0.0

speechiness
Correlation coefficient -0.052369972113748324 with p value 7.759805428226346e-107

tempo
Correlation coefficient 0.09214358467533393 with p value 0.0

valence
Correlation coefficient 0.0707518096506821 with p value 1.4755704956873983e-193

Genre Comedy
acousticness
Correlation coeffici

In [68]:
# NOTE(review): the stored output below lacks the heading line that
# control_genre() prints first; it appears to predate the current definition.
control_genre()

acousticness
Correlation coefficient -0.28679079961149323 with p value 0.0

danceability
Correlation coefficient 0.2110118032443134 with p value 0.0

duration_ms
Correlation coefficient 0.1061131757665399 with p value 0.0

energy
Correlation coefficient 0.1678639436350018 with p value 0.0

instrumentalness
Correlation coefficient -0.05852333314607488 with p value 5.6887608905189525e-133

liveness
Correlation coefficient -0.10630831317501654 with p value 0.0

loudness
Correlation coefficient 0.26055738820925944 with p value 0.0

speechiness
Correlation coefficient -0.037351168322597654 with p value 3.1496002492983665e-55

tempo
Correlation coefficient 0.0782412928125873 with p value 2.2855739658737314e-236

valence
Correlation coefficient 0.061056225042918834 with p value 1.418457789127113e-144



In [63]:
# Partial correlations of each continuous column with popularity, controlling for year.
population_partial_correlations()

Correlation with population, controlling for release year
acousticness
Correlation coefficient -0.3086195038499765 with p value 0.0

danceability
Correlation coefficient 0.2281512082213368 with p value 0.0

duration_ms
Correlation coefficient 0.10424085934014923 with p value 0.0

energy
Correlation coefficient 0.18333162273962136 with p value 0.0

instrumentalness
Correlation coefficient -0.06016981039577464 with p value 1.8562150582052655e-140

liveness
Correlation coefficient -0.11311109212112347 with p value 0.0

loudness
Correlation coefficient 0.30275576368381146 with p value 0.0

speechiness
Correlation coefficient -0.052369972113748324 with p value 7.759805428226346e-107

tempo
Correlation coefficient 0.09214358467533393 with p value 0.0

valence
Correlation coefficient 0.0707518096506821 with p value 1.4755704956873983e-193



In [35]:
# NOTE(review): the stored output below opens with a 'Continuous Variables'
# heading that population_correlations() does not print; it appears stale.
population_correlations()

Continuous Variables
acousticness
Spearmanr -0.31207220998424534 with p value 0.0

danceability
Spearmanr 0.23169683970386576 with p value 0.0

duration_ms
Spearmanr 0.10212238803898245 with p value 0.0

energy
Spearmanr 0.18918846854955584 with p value 0.0

instrumentalness
Spearmanr -0.13092492123603755 with p value 0.0

liveness
Spearmanr -0.11352932168025172 with p value 0.0

loudness
Spearmanr 0.31326262959525103 with p value 0.0

speechiness
Spearmanr -0.060305096343884756 with p value 4.4078202646597854e-141

tempo
Spearmanr 0.09449679811842997 with p value 0.0

valence
Spearmanr 0.06381509987629423 with p value 8.944148003243136e-158

year
Spearmanr 0.15058176383379 with p value 0.0

Discrete Variables


In [36]:
# Spearman correlation of each continuous column with release year.
year_correlations()

Continuous Variables
acousticness
Spearmanr -0.0940312099814562 with p value 0.0

danceability
Spearmanr 0.10817624865489583 with p value 0.0

duration_ms
Spearmanr -0.04733061780946501 with p value 1.3485671980970821e-87

energy
Spearmanr 0.06661693243692801 with p value 8.819570700133203e-172

instrumentalness
Spearmanr -0.056632577633137775 with p value 1.2445689492955797e-124

liveness
Spearmanr -0.032385459501692186 with p value 5.937979290485341e-42

loudness
Spearmanr 0.19427354377815564 with p value 0.0

speechiness
Spearmanr 0.0794471320006571 with p value 1.1485856829330001e-243

tempo
Spearmanr 0.05129182029186905 with p value 1.4785409188885243e-102

valence
Spearmanr -0.09861774544103145 with p value 0.0

