# June Code Jam 

## Introduction

## Data Overview

In [1]:
# Import necessary libraries
import re
import os
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats as st
from scipy import stats as st
import warnings
warnings.filterwarnings('ignore')

from dash import Dash, html, dcc, callback, Output, Input
import plotly.express as px

## Data Preprocessing

In [2]:
# Read the Spotify tracks CSV into the 'data' DataFrame
csv_path = 'datasets/spotify.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows to get a feel for the columns and their values
display(data.head(n=5))

Unnamed: 0,Index,Title,Artist,Top Genre,Year,Beats Per Minute (BPM),Energy,Danceability,Loudness (dB),Liveness,Valence,Length (Duration),Acousticness,Speechiness,Popularity
0,1,Sunrise,Norah Jones,adult standards,2004,157,30,53,-14,11,68,201,94,3,71
1,2,Black Night,Deep Purple,album rock,2000,135,79,50,-11,17,81,207,17,7,39
2,3,Clint Eastwood,Gorillaz,alternative hip hop,2001,168,69,66,-9,7,52,341,2,17,69
3,4,The Pretender,Foo Fighters,alternative metal,2007,173,96,43,-4,3,37,269,0,4,76
4,5,Waitin' On A Sunny Day,Bruce Springsteen,classic rock,2002,106,82,58,-5,10,87,256,1,3,59


In [4]:
# Report how many rows and columns the 'data' DataFrame holds
n_rows = len(data.index)
n_cols = len(data.columns)
print(f"The DataFrame has {n_rows} rows and {n_cols} columns")

The DataFrame has 1994 rows and 15 columns


In [5]:
# Print a concise summary of the 'data' DataFrame: one line per column with
# its non-null count and dtype
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1994 entries, 0 to 1993
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Index                   1994 non-null   int64 
 1   Title                   1994 non-null   object
 2   Artist                  1994 non-null   object
 3   Top Genre               1994 non-null   object
 4   Year                    1994 non-null   int64 
 5   Beats Per Minute (BPM)  1994 non-null   int64 
 6   Energy                  1994 non-null   int64 
 7   Danceability            1994 non-null   int64 
 8   Loudness (dB)           1994 non-null   int64 
 9   Liveness                1994 non-null   int64 
 10  Valence                 1994 non-null   int64 
 11  Length (Duration)       1994 non-null   object
 12  Acousticness            1994 non-null   int64 
 13  Speechiness             1994 non-null   int64 
 14  Popularity              1994 non-null   int64 
dtypes: i

In [6]:
# Normalise the column names to snake_case: drop special characters, turn each
# run of whitespace into a single underscore, and lower-case the result
def _to_snake_case(name):
    """Return `name` without special characters, with whitespace runs as '_' and in lowercase."""
    without_special_chars = re.sub(r'[^\w\s]', '', name)
    return re.sub(r'\s+', '_', without_special_chars).lower()

data.columns = [_to_snake_case(col) for col in data.columns]

# Confirm that the renaming took effect
data.columns

Index(['index', 'title', 'artist', 'top_genre', 'year', 'beats_per_minute_bpm',
       'energy', 'danceability', 'loudness_db', 'liveness', 'valence',
       'length_duration', 'acousticness', 'speechiness', 'popularity'],
      dtype='object')

In [7]:
# Check for duplicated songs in the dataframe.
# The 'index' column looks like a running row number (1, 2, 3, ... in the
# preview), so it differs on every row. It is therefore left out of the
# comparison: with it included, no two rows could ever be flagged as duplicates.
comparison_cols = [col for col in data.columns if col != 'index']
duplicates = data[data.duplicated(subset=comparison_cols)]
print(f"Number of duplicates: {duplicates.shape[0]}")

Number of duplicates: 0


In [8]:
# Count the missing (NaN) entries in each column of 'data'
missing_per_column = data.isna().sum()
display(missing_per_column)

index                   0
title                   0
artist                  0
top_genre               0
year                    0
beats_per_minute_bpm    0
energy                  0
danceability            0
loudness_db             0
liveness                0
valence                 0
length_duration         0
acousticness            0
speechiness             0
popularity              0
dtype: int64

In [9]:
# Tally how often each distinct value occurs in the 'length_duration' column.
# The aim is to find out why this column was read with the object dtype.
duration_counts = data['length_duration'].value_counts()
duration_counts

length_duration
215      23
243      21
241      20
217      20
236      20
         ..
1,367     1
433       1
602       1
421       1
128       1
Name: count, Length: 350, dtype: int64

In [10]:
# Flag the 'length_duration' entries that contain a comma, then count them
has_comma = data['length_duration'].str.contains(',')
num_with_comma = has_comma.sum()
print(f"Number of values with a comma: {num_with_comma}")

Number of values with a comma: 4


In [11]:
# Remove the commas from the 'length_duration' entries (e.g. '1,367' -> '1367');
# the comma is matched as a literal character, not as a regular expression
durations_without_commas = data['length_duration'].str.replace(',', '', regex=False)
data['length_duration'] = durations_without_commas

In [12]:
# Cast the 'length_duration' strings to integers and store them back in place
duration_as_int = data['length_duration'].astype(int)
data['length_duration'] = duration_as_int

# Verify that the column now reports an integer dtype
data['length_duration'].dtype

dtype('int32')

In [13]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
summary_stats = data.describe()
summary_stats

Unnamed: 0,index,year,beats_per_minute_bpm,energy,danceability,loudness_db,liveness,valence,length_duration,acousticness,speechiness,popularity
count,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0,1994.0
mean,997.5,1992.992979,120.215647,59.679539,53.238215,-9.008526,19.012036,49.408726,262.44333,28.858074,4.994985,59.52658
std,575.762538,16.116048,28.028096,22.154322,15.351507,3.647876,16.727378,24.858212,93.604387,29.011986,4.401566,14.3516
min,1.0,1956.0,37.0,3.0,10.0,-27.0,2.0,3.0,93.0,0.0,2.0,11.0
25%,499.25,1979.0,99.0,42.0,43.0,-11.0,9.0,29.0,212.0,3.0,3.0,49.25
50%,997.5,1993.0,119.0,61.0,53.0,-8.0,12.0,47.0,245.0,18.0,4.0,62.0
75%,1495.75,2007.0,136.0,78.0,64.0,-6.0,23.0,69.75,289.0,50.0,5.0,71.0
max,1994.0,2019.0,206.0,100.0,96.0,-2.0,99.0,99.0,1412.0,99.0,55.0,100.0


## Instantiating a Dash app

Our goal is to build an interactive dashboard using Plotly Dash to explore music listening trends in our Spotify dataset. Here's a basic Dash app setup:

In [16]:
demo_app = Dash()

# Offer the years in chronological order: unique() alone returns them in the
# order they first appear in the data, which makes the menu hard to scan.
# tolist() converts the NumPy integers to plain Python ints.
year_options = sorted(data['year'].unique().tolist())

demo_app.layout = [
    html.H1(children='Soon-To-Be-Titled Dashboard App', style={'textAlign':'center'}),
    # clearable=False: the callback always needs a year to filter on; a cleared
    # dropdown would send None and produce an empty chart
    dcc.Dropdown(year_options, value=2004, id='dropdown-selection', clearable=False),
    dcc.Graph(id='graph-content')
]

@callback(
    Output('graph-content', 'figure'),
    Input('dropdown-selection', 'value')
)
def update_graph(value):
    """Redraw the graph for the year selected in the dropdown.

    Parameters
    ----------
    value
        The year currently selected in the 'dropdown-selection' component.

    Returns
    -------
    A Plotly line figure of popularity against beats per minute for the
    songs of that year, ordered by beats per minute.
    """
    dff = data[data['year']==value].sort_values(by='beats_per_minute_bpm', axis=0)
    return px.line(
        dff,
        x='beats_per_minute_bpm',
        y='popularity',
        title=f'Popularity by tempo for songs from {value}',
        labels={
            'beats_per_minute_bpm': 'Beats per minute (BPM)',
            'popularity': 'Popularity',
        },
    )

# In the stand-alone app the call below sits under `if __name__ == '__main__':`;
# in the notebook it is run directly.
demo_app.run(debug=True)

This is not a particularly useful app as it stands, but it's got some scaffolding: there's a dropdown menu and a graph that reacts to it. Once we've finalized the app, we'll break out the Dash-related code along with the processed data from pandas into its own file, app.py.