## Monte Carlo Simulation

In this notebook I conduct a Monte Carlo simulation: for each numeric column, I draw a random sample of 10 values 1,000 times and compute the variance of each sample. I then take the 0.05 quantile of those simulated variances and will use it as the threshold for a significant level of variance when choosing features for my models later on.

In [4]:
import pandas as pd
import numpy as np

In [5]:
# Read processed_data.csv into a DataFrame and preview the first rows.
processed_data_path = r'C:\Users\jsull\UW Work\Stat 451\Project\processed_data.csv'
df = pd.read_csv(processed_data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,master_idx,artist_name,track_name,track_id,year,genre,popularity,danceability,energy,...,Instrumental,Jazz and Blues,Latin,Miscellaneous,Pop,Reggae and Tropical,Rock,Spiritual and Religious,Theater,World
0,0,0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,2012,Acoustic,0.68,0.483,0.303,...,0,0,0,0,0,0,0,0,0,0
1,1,1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,2012,Acoustic,0.5,0.572,0.454,...,0,0,0,0,0,0,0,0,0,0
2,2,2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,2012,Acoustic,0.57,0.409,0.234,...,0,0,0,0,0,0,0,0,0,0
3,3,3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,2012,Acoustic,0.58,0.392,0.251,...,0,0,0,0,0,0,0,0,0,0
4,4,4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,2012,Acoustic,0.54,0.43,0.791,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Mean of each audio-feature column, then the unweighted average of those means.
mean_list = [
    df[feature].mean()
    for feature in (
        "danceability",
        "energy",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "loudness",
        "tempo",
        "duration_ms",
    )
]
sum(mean_list) / len(mean_list)

0.38152950974378574

In [6]:
# Monte Carlo simulation to determine the 0.05 quantile of the variance of the 10 values selected from each column

from joblib import Parallel, delayed

# Number of random draws performed per column.
num_simulations = 1000

# Numeric feature columns included in the simulation.
columns_of_interest = [
    "danceability",
    "energy",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "loudness",
    "tempo",
    "duration_ms",
]

# Drop missing values a single time per column and keep the raw arrays,
# so the simulation loops never have to touch the DataFrame again.
cleaned_columns = dict(
    (name, df[name].dropna().values) for name in columns_of_interest
)

# Function to calculate variances for a column
def calculate_variances(values, n_simulations=None, sample_size=10, rng=None):
    """Simulate the distribution of the sample variance of ``values``.

    Each simulation draws ``sample_size`` entries from ``values`` without
    replacement and records the variance of that draw computed with
    ``ddof=1``.

    Parameters
    ----------
    values : array-like
        One-dimensional collection of observations to sample from.
    n_simulations : int, optional
        Number of draws to perform. When omitted, the notebook-level
        ``num_simulations`` is used, which is what the function always did
        before this parameter existed.
    sample_size : int, optional
        Number of observations per draw (default 10). Must be at least 2,
        because a ``ddof=1`` variance of a single value is undefined.
    rng : int or numpy.random.Generator, optional
        Seed or generator used for sampling. When omitted, draws come from
        NumPy's global random state, so ``np.random.seed`` still controls
        the result.

    Returns
    -------
    list
        One variance per simulation, in the order they were drawn.

    Raises
    ------
    ValueError
        If ``sample_size`` is below 2, or if ``values`` holds fewer than
        ``sample_size`` entries (raised by the sampler itself).
    """
    if sample_size < 2:
        raise ValueError("sample_size must be at least 2 to compute a ddof=1 variance")
    if n_simulations is None:
        # Backward-compatible default: the notebook-level simulation count.
        n_simulations = num_simulations

    # rng=None keeps the legacy behaviour of sampling from the global state;
    # default_rng turns a seed into a Generator and returns an existing
    # Generator unchanged.
    sampler = np.random if rng is None else np.random.default_rng(rng)

    return [
        # Using ddof=1 for sample variance
        np.var(sampler.choice(values, sample_size, replace=False), ddof=1)
        for _ in range(n_simulations)
    ]

# Use joblib for parallel processing
def _variance_quantile(col, values, seed):
    """Return ``(col, q)``, where q is the 0.05 quantile of simulated variances.

    NumPy's global random state is seeded inside the worker before sampling,
    so the draws made for this column (and therefore the reported quantile)
    are the same on every run.
    """
    np.random.seed(seed)
    return col, np.quantile(calculate_variances(values), 0.05)


# Fixed base seed; the column at position i is simulated with variance_seed + i.
variance_seed = 451

# Each task receives only its own column's array rather than a closure over
# the whole cleaned_columns dictionary.
results = Parallel(n_jobs=-1)(
    delayed(_variance_quantile)(col, cleaned_columns[col], variance_seed + offset)
    for offset, col in enumerate(columns_of_interest)
)

# Convert list of tuples to dictionary
results_dict = dict(results)

# Print the results
print(results_dict)


{'danceability': 0.013122668888888888, 'energy': 0.026530489444444444, 'speechiness': 0.0003606191555555556, 'acousticness': 0.04500401804348366, 'instrumentalness': 0.03412134705572286, 'liveness': 0.0041157230444444455, 'valence': 0.03393465611111111, 'loudness': 0.001310219118265714, 'tempo': 0.005193332793267071, 'duration_ms': 7.275565316918107e-05}


In [None]:
import json
# Persist the per-column quantiles as indented JSON.
monte_carlo_path = r'C:\Users\jsull\UW Work\Stat 451\Project\monte_carlo.json'
with open(monte_carlo_path, 'w') as out_file:
    out_file.write(json.dumps(results_dict, indent=4))

In [8]:
def calculate_average_distance(values, n_simulations=None, rng=None):
    """Estimate the mean absolute difference between two random entries of ``values``.

    Each simulation picks two entries without replacement and records the
    absolute difference between them; the mean over all simulations is
    returned.

    Parameters
    ----------
    values : array-like
        One-dimensional collection of observations to sample from.
    n_simulations : int, optional
        Number of pairs to draw. When omitted, the notebook-level
        ``num_simulations`` is used, which is what the function always did
        before this parameter existed.
    rng : int or numpy.random.Generator, optional
        Seed or generator used for sampling. When omitted, draws come from
        NumPy's global random state, so ``np.random.seed`` still controls
        the result.

    Returns
    -------
    float
        Average absolute distance across the simulated pairs.

    Raises
    ------
    ValueError
        If ``n_simulations`` is not positive (the mean of zero distances is
        undefined), or if ``values`` holds fewer than two entries (raised by
        the sampler itself).
    """
    if n_simulations is None:
        # Backward-compatible default: the notebook-level simulation count.
        n_simulations = num_simulations
    if n_simulations < 1:
        raise ValueError("n_simulations must be a positive integer")

    # rng=None keeps the legacy behaviour of sampling from the global state;
    # default_rng turns a seed into a Generator and returns an existing
    # Generator unchanged.
    sampler = np.random if rng is None else np.random.default_rng(rng)

    distances = []
    for _ in range(n_simulations):
        # Randomly select two points
        first, second = sampler.choice(values, 2, replace=False)
        # Calculate the absolute distance between them
        distances.append(abs(second - first))
    # Calculate and return the average distance
    return np.mean(distances)

# Use joblib for parallel processing
def _average_distance(col, values, seed):
    """Return ``(col, d)``, where d is the simulated average pairwise distance.

    NumPy's global random state is seeded inside the worker before sampling,
    so the pairs drawn for this column (and therefore the reported average)
    are the same on every run.
    """
    np.random.seed(seed)
    return col, calculate_average_distance(values)


# Fixed base seed; the column at position i is simulated with distance_seed + i.
distance_seed = 4510

# Each task receives only its own column's array rather than a closure over
# the whole cleaned_columns dictionary.
results = Parallel(n_jobs=-1)(
    delayed(_average_distance)(col, cleaned_columns[col], distance_seed + offset)
    for offset, col in enumerate(columns_of_interest)
)

# Convert list of tuples to dictionary
distances_dict = dict(results)

# Print the results
print(distances_dict)


{'danceability': 0.2059687, 'energy': 0.30181103000000004, 'speechiness': 0.09452669999999999, 'acousticness': 0.39033502381, 'instrumentalness': 0.33972041278, 'liveness': 0.1976444, 'valence': 0.29951859999999997, 'loudness': 0.08479571222097318, 'tempo': 0.13033426135931808, 'duration_ms': 0.019910233020054603}


In [12]:
# Grand mean of the per-column average distances.
column_distances = [distance for distance in distances_dict.values()]
np.mean(column_distances)

0.2064565073190346

In [7]:
# Sanity check: read the saved JSON back in and show its contents.
monte_carlo_path = r'C:\Users\jsull\UW Work\Stat 451\Project\monte_carlo.json'
with open(monte_carlo_path, 'r') as in_file:
    data_loaded = json.loads(in_file.read())

print(data_loaded)

{'danceability': 0.013382394999999998, 'energy': 0.025014266666666667, 'speechiness': 0.00042638024999999994, 'acousticness': 0.0484138911527198, 'instrumentalness': 0.03146510005767226, 'liveness': 0.004487286666666666, 'valence': 0.03284678672777778, 'loudness': 0.001398039625570148, 'tempo': 0.005009213793574107, 'duration_ms': 6.857034893928311e-05}
