In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress
import plotly.express as px

In [None]:
# Reading MSD dataframe
# NOTE: unpickling can execute arbitrary code, so msd.pkl must come from a trusted source.
df = pd.read_pickle("msd.pkl")

In [None]:
# Set-up for dropping 0's and null values (the filtering itself is applied in later cells)
# Rename pitch_network_entropy to entropy, the column name used by the feature list below
df = df.rename(columns={'pitch_network_entropy': 'entropy'})
def dropZero(feature, df):
    """Return the rows of ``df`` whose ``feature`` column is not equal to 0.

    The input frame is not modified; the filtered frame is returned.
    Rows holding NaN in ``feature`` are kept, because NaN compares as
    "not equal" to 0.
    """
    keep_rows = df[feature].ne(0)
    return df[keep_rows]

In [None]:
# Numerical features shown in the scatter matrices
feature_list = ['artist_familiarity', 'artist_hotttnesss', 'tempo', 'loudness', 'year', 'entropy', 'timbre_00', 'timbre_11']

# Drop rows holding a 0 in any plotted feature. Each step rebinds df to a new
# frame instead of mutating a filtered slice in place.
for feature in feature_list:
    df = dropZero(feature, df)

# Drop rows holding a null in any plotted feature in a single pass. This removes
# the same rows as a per-feature dropna(inplace=True), but never calls an
# in-place method on a boolean-filtered slice (which can raise
# SettingWithCopyWarning).
df = df.dropna(subset=feature_list)

# define the color feature
color_feature = 'song_hotttnesss'

In [None]:
# Scatter matrix of the numerical features, dark background
fig = px.scatter_matrix(
    df,
    dimensions=feature_list,
    color=color_feature,
    template="plotly_dark",
    # Axis labels: underscores become spaces and every word is title-cased
    labels={name: name.replace('_', ' ').title() for name in feature_list},
    height=1100,
    width=1100,
)
fig.show()

In [None]:
# Scatter matrix of the numerical features, default (white) background
fig = px.scatter_matrix(
    df,
    dimensions=feature_list,
    color=color_feature,
    # Axis labels: underscores become spaces and every word is title-cased
    labels={name: name.replace('_', ' ').title() for name in feature_list},
    height=1100,
    width=1100,
)
fig.show()

In [None]:
# Dataframe for numerical features, dropping energy and danceability since values are all 0's.
# errors='ignore' keeps this cell re-runnable: df is overwritten here, so on a
# second run the two columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['energy', 'danceability'], errors='ignore')

allfeature_list = df.columns

# Drop rows holding a 0 in any remaining column. Each step rebinds df to a new
# frame instead of mutating a filtered slice in place.
for feature in allfeature_list:
    df = dropZero(feature, df)

# Drop rows holding a null in any column. allfeature_list covers every column,
# so a plain dropna() removes the same rows as a per-column
# dropna(subset=[feature], inplace=True), without the in-place call on a slice.
df = df.dropna()

## Histograms

In [None]:
# Histogram of artist_familiarity, bars colored by the 'year' column
fig = px.histogram(
    df,
    x='artist_familiarity',
    color='year',
    nbins=85,
    height=700,
    width=700,
)
fig.show()

In [None]:
# Histogram of song_hotttnesss, bars colored by the 'year' column
fig = px.histogram(
    df,
    x='song_hotttnesss',
    color='year',
    nbins=85,
    height=700,
    width=700,
)
fig.show()

In [None]:
# Histogram of song_hotttnesss, bars colored by the 'year' column, dark background
fig = px.histogram(
    df,
    x='song_hotttnesss',
    color='year',
    template="plotly_dark",
    nbins=85,
    height=700,
    width=700,
)
fig.show()

In [None]:
# Histogram of tempo, bars colored by the 'year' column
fig = px.histogram(
    df,
    x='tempo',
    color='year',
    nbins=85,
    height=700,
    width=700,
)
fig.show()

In [None]:
# Histogram of tempo, bars colored by the 'year' column, dark background
fig = px.histogram(
    df,
    x='tempo',
    color='year',
    template="plotly_dark",
    nbins=85,
    height=700,
    width=700,
)
fig.show()

In [None]:
# Histogram of timbre_01, bars colored by the 'year' column
fig = px.histogram(
    df,
    x='timbre_01',
    color='year',
    nbins=85,
    height=700,
    width=700,
)
fig.show()

In [None]:
# Sort the unique year values in ascending order
year_order = sorted(df['year'].unique())
print(year_order)

fig = px.histogram(df,
                   x='artist_familiarity',
                   color='year',
                   nbins=85,
                   template="plotly_dark",
                   height=700,
                   width=700,
                   # category_orders is keyed by column name. The ordering must be
                   # registered under 'year' (the column mapped to color); a 'color'
                   # key matches no column, so the requested order would be ignored.
                   category_orders={'year': year_order})

fig.show()

In [None]:
# Histograms for timbre, one subplot per timbre column, WHITE bg
# define the number of rows and columns in the subplot grid
num_cols = 4
num_rows = 3

# (position, column name) pairs for timbre_00 .. timbre_11. Only the names are
# needed here, so no intermediate sub-dataframe is built just to list them.
col_list = list(enumerate([f'timbre_{k:02d}' for k in range(12)]))

# define the figure size
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10))

# define the colors to use for each histogram
colors = ['red', 'blue', 'green', 'orange', 'purple', 'brown', 'gray', 'pink', 'olive', 'teal', 'navy', 'maroon']

# loop through each column and plot a histogram in the corresponding subplot
for i, column in col_list:
    # flat position -> (row, column) slot of the grid
    ax = axes[i // num_cols, i % num_cols]
    ax.hist(df[column], bins=50, color=colors[i])
    ax.set_title(column)

# Lay the grid out once, after every subplot has been drawn, instead of
# recomputing the layout on each loop iteration.
plt.tight_layout()

# save the plot as a PNG file
plt.savefig('histogram_white.png')

In [None]:
# Timbre histograms on a dark axes background, laid out on a subplot grid
num_rows = 3
num_cols = 4

# (position, column name) pairs for timbre_00 .. timbre_11
col_list = list(enumerate(df[[f'timbre_{k:02d}' for k in range(12)]]))

# one figure holding the whole grid
fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, 10))

# one color per histogram, indexed by position
colors = [
    'red', 'blue', 'green', 'orange',
    'purple', 'brown', 'gray', 'pink',
    'olive', 'teal', 'navy', 'maroon',
]

for idx, name in col_list:
    # divmod turns the flat position into the (row, column) slot of the grid
    ax = axes[divmod(idx, num_cols)]
    ax.hist(
        df[name],
        bins=25,
        color=colors[idx],
        alpha=0.7,
        label=name,
        density=True,
        histtype='stepfilled',
        linewidth=1.5,
        edgecolor='black',
        linestyle='--',
    )
    ax.set_title(name)
    ax.set_facecolor('#2C2F33')
    ax.grid(color='white', linestyle=':', axis='y')
    ax.legend()

plt.tight_layout()

# write the figure to a PNG file
plt.savefig('histogram_dark.png')

In [None]:
def scatter_plot(feature1, feature2, data=None):
    """Scatter two columns against each other with a fitted regression line.

    Points are colored by the 'song_hotttnesss' column, a least-squares line is
    drawn through them, and the correlation coefficient is shown in the title.

    Parameters
    ----------
    feature1 : str
        Column plotted on the x axis.
    feature2 : str
        Column plotted on the y axis.
    data : pandas.DataFrame, optional
        Frame to read the columns from. Defaults to the notebook-level ``df``,
        which keeps existing two-argument calls working unchanged.
    """
    if data is None:
        data = df

    # Define x and y variables and the color values
    x = data[feature1]
    y = data[feature2]
    c = data['song_hotttnesss']

    # Explicit figure/axes instead of the pyplot state machine
    fig, ax = plt.subplots()

    # Create scatter plot with color gradient based on song_hotttnesss
    points = ax.scatter(x, y, c=c, cmap='coolwarm')

    # add colorbar
    cbar = fig.colorbar(points, ax=ax)
    cbar.ax.set_ylabel('Song Hotttness')

    # add axis labels
    ax.set_xlabel(feature1)
    ax.set_ylabel(feature2)

    # Calculate the regression line. r_value is the correlation coefficient of
    # x and y, so it is reused for the title rather than recomputed separately.
    slope, intercept, r_value, _, _ = linregress(x, y)

    # Single title (an earlier placeholder title was always overwritten by this one)
    ax.set_title(f"Correlation Coefficient: {r_value:.2f}")

    # Add regression line: two endpoints are enough to draw a straight line,
    # so the line is not traced through every (unsorted) data point.
    x_ends = np.array([x.min(), x.max()])
    ax.plot(x_ends, slope * x_ends + intercept, 'r', label='fitted line')

    # Add legend
    ax.legend()

    # show the plot
    plt.show()
    
# artist_familiarity (x) against artist_hotttnesss (y)
scatter_plot(feature1='artist_familiarity', feature2='artist_hotttnesss')

In [None]:
# timbre_00 (x) against loudness (y)
scatter_plot(feature1='timbre_00', feature2='loudness')

In [None]:
# timbre_00 (x) against entropy (y)
scatter_plot(feature1='timbre_00', feature2='entropy')

In [None]:
# entropy (x) against loudness (y)
scatter_plot(feature1='entropy', feature2='loudness')