In [None]:

import os
import sys
import logging
from pathlib import Path
from datetime import datetime

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml # For reading the config file

In [None]:

# Configure the root logger once for the whole notebook: INFO and above,
# each record rendered as "<timestamp> - <level> - <message>".
# NOTE(review): basicConfig is a no-op when the root logger already has handlers
# (e.g. when this cell is re-run in the same kernel) — confirm this is acceptable.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

In [None]:

def load_config(config_path: Path):
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_path: Location of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the document
        (a mapping at the top level of the file yields a ``dict``).

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
    """
    # Decode explicitly as UTF-8. Without an encoding, open() falls back to the
    # locale's code page (cp1252 on many Windows setups), which corrupts any
    # non-ASCII value stored in the config file.
    with open(config_path, 'r', encoding='utf-8') as file:
        return yaml.safe_load(file)

# Assuming config folder is at project root, one level up from 'notebooks'.
# The path is derived from the kernel's current working directory, so the
# parent of Path.cwd() must contain config/config.yaml.
# NOTE(review): this breaks if the kernel is started from another folder — confirm.
CONFIG_PATH = Path.cwd().parent / 'config' / 'config.yaml' 
CONFIG = load_config(CONFIG_PATH)
CONFIG_PATH  # bare last expression: shows the resolved path as the cell output

WindowsPath('d:/projects/spotify_emotion_prediction/config/config.yaml')

In [27]:
# Display settings for pandas DataFrames
pd.set_option('display.max_columns', None)  # None: do not truncate the list of columns
pd.set_option('display.width', 1000)  # allow up to 1000 characters per printed line

In [28]:
# Define data path using pathlib for platform independence and readability
# In a Jupyter notebook, Path.cwd() is normally the notebook's directory (e.g., <project_root>/notebooks/).
# We need to go up one level to the project root, and then navigate using the config path.
CURRENT_DIR = Path.cwd()
PROJECT_ROOT = CURRENT_DIR.parent 

# Raw data path will be read from the config file.
# CONFIG['data']['raw_data_path'] is joined onto the project root, so it is
# presumably a path relative to that root — TODO confirm against config.yaml.
RAW_DATA_PATH = PROJECT_ROOT / CONFIG['data']['raw_data_path'] 
PROJECT_ROOT  # bare last expression: shows the resolved project root as the cell output

WindowsPath('d:/projects/spotify_emotion_prediction')

In [29]:
def load_data(path: Path) -> pd.DataFrame:
    """Load a CSV file into a DataFrame, logging progress and failures.

    Args:
        path: Location of the CSV file.

    Returns:
        The DataFrame produced by ``pd.read_csv(path, low_memory=False)``.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        Exception: Any error raised by ``pd.read_csv`` is logged and re-raised.
    """
    logging.info(f"Loading data from: {path}")

    if path.exists():
        try:
            frame = pd.read_csv(path, low_memory=False)
        except Exception as exc:
            # Record the failure, then let the caller decide what to do with it.
            logging.error(f"Error loading data: {exc}")
            raise
        logging.info("Data loaded.")
        return frame

    logging.error(f"File not found: {path}")
    raise FileNotFoundError(f"File not found at: {path}")


In [30]:
def get_memory_usage(df: pd.DataFrame) -> None:
    """Print the deep memory footprint of ``df`` per column and in total (MB).

    The per-column listing leaves out the 'Index' entry reported by
    ``DataFrame.memory_usage``; the total includes it.

    Args:
        df: DataFrame to inspect.
    """
    logging.info("Checking memory usage of the DataFrame.")

    bytes_per_mb = 1024 * 1024
    # deep=True also counts the Python objects held by object-dtype columns.
    usage_mb = df.memory_usage(deep=True) / bytes_per_mb

    print("\nMemory usage by column:")
    columns_only = usage_mb.drop('Index', errors='ignore')
    report_lines = [f"{name}: {size:.2f} MB" for name, size in columns_only.items()]
    for line in report_lines:
        print(line)

    grand_total = usage_mb.sum()
    print(f"\nTotal DataFrame Memory Usage: {grand_total:.2f} MB")
    logging.info(f"DataFrame total memory usage: {grand_total:.2f} MB")


In [31]:
def display_dataframe_overview(df: pd.DataFrame, n_samples: int = 10, random_state=None) -> None:
    """Print the dimensions, first rows, last rows and a random sample of ``df``.

    Args:
        df: DataFrame to summarise.
        n_samples: Maximum number of random rows to show. Capped at the number
            of rows in ``df`` so small frames do not make ``sample`` raise.
        random_state: Forwarded to ``DataFrame.sample``; pass a fixed value to
            get the same sample on every run. ``None`` keeps the sample random.
    """
    row_count, column_count = df.shape

    print("\n### Dataset Dimensions ###")
    # Rows and columns are printed on two consecutive lines.
    print(f"Rows: {row_count} \nColumns: {column_count} ")
    logging.info(f"Dataset dimensions: Rows={row_count}, Columns={column_count}")

    print("\n### First 5 Rows ###")
    print(df.head())
    logging.info("Displayed first 5 rows.")

    print("\n### Last 5 Rows ###")
    print(df.tail())
    logging.info("Displayed last 5 rows.")

    # sample() refuses to draw more rows than exist (without replacement),
    # so never ask for more than the frame holds.
    sample_size = min(n_samples, row_count)
    print(f"\n### {sample_size} Random Samples ###")
    print(df.sample(sample_size, random_state=random_state))
    logging.info(f"Displayed {sample_size} random samples.")

In [33]:
def check_data_types_and_missing_values(df: pd.DataFrame) -> None:
    """Print column dtypes, a dtype summary and a table of missing values.

    The missing-values table lists only columns with at least one null,
    sorted by the share of missing values in descending order.

    Args:
        df: DataFrame to inspect.
    """
    print("\n### Column Names and Their Data Types ###")
    df.info()  # writes dtypes and non-null counts straight to stdout
    print("\n### Data Types Summary ###")
    print(df.dtypes.value_counts())
    logging.info("Displayed column info and data types summary.")

    print("\n### Missing Values Count ###")
    # Scan the frame for nulls once and reuse the counts for the percentage;
    # on large frames a second isnull() pass is a noticeable cost.
    missing_count = df.isnull().sum()
    missing_df = (
        pd.DataFrame({
            'Missing Count': missing_count,
            'Missing Percentage (%)': (missing_count / len(df)) * 100,
        })
        .loc[lambda frame: frame['Missing Count'] > 0]
        .sort_values(by='Missing Percentage (%)', ascending=False)
    )

    print(missing_df)
    logging.info("Reported missing values count and percentage.")

In [34]:
def check_duplicate_rows(df: pd.DataFrame) -> None:
    """Print how many rows of ``df`` are flagged by ``DataFrame.duplicated``.

    A warning is logged when at least one duplicate exists, an info record
    otherwise.

    Args:
        df: DataFrame to inspect.
    """
    print("\n### Duplicate Rows Check ###")
    duplicate_total = df.duplicated().sum()
    print(f"Number of duplicate rows: {duplicate_total}")

    if duplicate_total <= 0:
        logging.info("No duplicate rows found.")
        return
    logging.warning(f"Found {duplicate_total} duplicate rows.")


In [None]:
def get_descriptive_statistics(df: pd.DataFrame) -> None:
    """Print ``describe()`` for the numeric columns of ``df``.

    When the numeric selection is empty, only the header is printed and an
    info record is logged.

    Args:
        df: DataFrame to summarise.
    """
    print("\n### Descriptive Statistics for Numerical Features ###")

    numeric_part = df.select_dtypes(include=np.number)
    if numeric_part.empty:
        logging.info("No numerical features found for descriptive statistics.")
        return

    summary = numeric_part.describe()
    print(summary)
    logging.info("Displayed descriptive statistics for numerical features.")

In [None]:
def detect_outliers_iqr(df: pd.DataFrame) -> None:
    """Report, per numeric column, how many values fall outside the IQR fences.

    A value counts as an outlier when it is below ``Q1 - 1.5 * IQR`` or above
    ``Q3 + 1.5 * IQR``. Columns with at least one outlier are printed and
    logged as warnings.

    Args:
        df: DataFrame to inspect.
    """
    print("\n### Outlier Detection using IQR Method ###")
    numerical_cols = df.select_dtypes(include=np.number).columns

    if numerical_cols.empty:
        logging.info("No numerical columns found to detect outliers.")
        return

    outliers_found = False
    for col in numerical_cols:
        values = df[col]

        # Quartiles and the interquartile range.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1

        # When iqr is 0 both fences collapse onto the quartile value, so every
        # value different from it is counted as an outlier.
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Count on the boolean mask instead of filtering the DataFrame:
        # filtering copies every column of the frame just to read a row count.
        outliers_count = int(((values < lower_bound) | (values > upper_bound)).sum())

        if outliers_count > 0:
            print(f"{col}: {outliers_count} outliers (Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f})")
            logging.warning(f"Outliers detected in '{col}': {outliers_count} outliers.")
            outliers_found = True

    if not outliers_found:
        logging.info("No outliers detected in numerical columns.")


In [36]:
def analyze_categorical_features(df: pd.DataFrame, target_column: str) -> None:
    """Print value frequencies for object-dtype columns and for the target.

    Each object-dtype column gets a header with its number of unique values
    followed by its value counts (nulls included). The target column's
    distribution is then printed separately; if it is missing, a warning is
    logged instead.

    Args:
        df: DataFrame to inspect.
        target_column: Name of the target variable column.
    """
    print("\n### Categorical Features: Unique Values and Frequencies ###")

    for name in df.select_dtypes(include='object').columns:
        series = df[name]
        distinct = series.nunique()
        print(f"\n--- {name} (Unique values: {distinct}) ---")
        print(series.value_counts(dropna=False))
        logging.info(f"Analyzed unique values and frequencies for '{name}'.")

    # The target variable gets its own section after the per-column listing.
    if target_column not in df.columns:
        logging.warning(f"Target column '{target_column}' not found in DataFrame.")
        return

    print(f"\n### '{target_column}' column distribution (Target Variable) ###")
    print(df[target_column].value_counts(dropna=False))
    logging.info(f"Displayed distribution of the '{target_column}' target variable.")

In [None]:
def main(target_column: str = 'emotion'):
    """Run every exploration step on the raw dataset, in order.

    Loads the CSV at ``RAW_DATA_PATH`` and passes the resulting DataFrame to
    each reporting function. Any failure is logged and ends the run through
    ``sys.exit(1)``.

    Args:
        target_column: Name of the target variable handed to
            ``analyze_categorical_features``.
    """
    logging.info("Starting data exploration process for 1.0-data-exploration.ipynb.")

    try:
        df = load_data(RAW_DATA_PATH)
        get_memory_usage(df)
        display_dataframe_overview(df)
        check_data_types_and_missing_values(df)
        check_duplicate_rows(df)
        get_descriptive_statistics(df)
        detect_outliers_iqr(df)
        # analyze_categorical_features requires the target column name;
        # calling it with the DataFrame alone raises TypeError.
        analyze_categorical_features(df, target_column)
        logging.info("Data exploration completed successfully.")

    except FileNotFoundError as fnf_error:
        logging.error(f"Critical Error: {fnf_error}")
        sys.exit(1)  # Exit with an error code
    except Exception as e:
        # exc_info keeps the traceback in the log; the message alone does not
        # show where an unexpected error came from.
        logging.critical(f"An unexpected error occurred during data exploration: {e}", exc_info=True)
        sys.exit(1)  # Exit with an error code


In [None]:



# Entry Point:
# Runs the full exploration when this code is executed as the main module
# (which is the case when the cell is run in the notebook kernel).
if __name__ == "__main__":
    main()

2025-06-12 18:28:33,863 - INFO - Starting data exploration process for 1.0-data-exploration.ipynb.
2025-06-12 18:28:33,865 - INFO - Loading data from: d:\projects\spotify_emotion_prediction\src\data\raw\spotify_dataset.csv
2025-06-12 18:28:51,077 - INFO - Data loaded.
2025-06-12 18:28:51,079 - INFO - Checking memory usage of the DataFrame.
2025-06-12 18:28:52,617 - INFO - DataFrame total memory usage: 1950.00 MB
2025-06-12 18:28:52,618 - INFO - Dataset dimensions: Rows=551443, Columns=39
2025-06-12 18:28:52,629 - INFO - Displayed first 5 rows.
2025-06-12 18:28:52,636 - INFO - Displayed last 5 rows.
2025-06-12 18:28:52,665 - INFO - Displayed 10 random samples.



Memory usage by column:
Artist(s): 38.16 MB
song: 37.59 MB
text: 1249.80 MB
Length: 32.61 MB
emotion: 32.50 MB
Genre: 36.64 MB
Album: 39.83 MB
Release Date: 38.28 MB
Key: 32.79 MB
Tempo: 4.21 MB
Loudness (db): 33.55 MB
Time signature: 31.55 MB
Explicit: 31.18 MB
Popularity: 4.21 MB
Energy: 4.21 MB
Danceability: 4.21 MB
Positiveness: 4.21 MB
Speechiness: 4.21 MB
Liveness: 4.21 MB
Acousticness: 4.21 MB
Instrumentalness: 4.21 MB
Good for Party: 4.21 MB
Good for Work/Study: 4.21 MB
Good for Relaxation/Meditation: 4.21 MB
Good for Exercise: 4.21 MB
Good for Running: 4.21 MB
Good for Yoga/Stretching: 4.21 MB
Good for Driving: 4.21 MB
Good for Social Gatherings: 4.21 MB
Good for Morning Routine: 4.21 MB
Similar Artist 1: 38.17 MB
Similar Song 1: 37.59 MB
Similarity Score 1: 4.21 MB
Similar Artist 2: 38.17 MB
Similar Song 2: 37.54 MB
Similarity Score 2: 4.21 MB
Similar Artist 3: 38.16 MB
Similar Song 3: 37.56 MB
Similarity Score 3: 4.21 MB

Total DataFrame Memory Usage: 1950.00 MB

### Datase

2025-06-12 18:28:53,120 - INFO - Displayed column info and data types summary.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 551443 entries, 0 to 551442
Data columns (total 39 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   Artist(s)                       551443 non-null  object 
 1   song                            551427 non-null  object 
 2   text                            551443 non-null  object 
 3   Length                          551443 non-null  object 
 4   emotion                         551443 non-null  object 
 5   Genre                           551443 non-null  object 
 6   Album                           551391 non-null  object 
 7   Release Date                    551443 non-null  object 
 8   Key                             551443 non-null  object 
 9   Tempo                           551443 non-null  int64  
 10  Loudness (db)                   551443 non-null  object 
 11  Time signature                  551435 non-null  object 
 12  Explicit        

2025-06-12 18:28:54,123 - INFO - Reported missing values count and percentage.


                Missing Count  Missing Percentage (%)
Album                      52                0.009430
Similar Song 2             20                0.003627
Similar Song 1             19                0.003446
song                       16                0.002901
Time signature              8                0.001451
Similar Song 3              4                0.000725

### Duplicate Rows Check ###




Number of duplicate rows: 53391

### Descriptive Statistics for Numerical Features ###


2025-06-12 18:28:59,139 - INFO - Displayed descriptive statistics for numerical features.


               Tempo     Popularity         Energy   Danceability   Positiveness    Speechiness       Liveness   Acousticness  Instrumentalness  Good for Party  Good for Work/Study  Good for Relaxation/Meditation  Good for Exercise  Good for Running  Good for Yoga/Stretching  Good for Driving  Good for Social Gatherings  Good for Morning Routine  Similarity Score 1  Similarity Score 2  Similarity Score 3
count  551443.000000  551443.000000  551443.000000  551443.000000  551443.000000  551443.000000  551443.000000  551443.000000     551443.000000   551443.000000        551443.000000                   551443.000000      551443.000000     551443.000000             551443.000000     551443.000000               551443.000000             551443.000000       551443.000000       551443.000000       551443.000000
mean      120.513567      32.258106      62.661874      59.195460      47.738330      11.663980      19.655658      25.748436          7.152255        0.061809             0.074931    

2025-06-12 18:28:59,895 - INFO - Outliers detected in 'Tempo': 4 outliers.
2025-06-12 18:28:59,896 - INFO - Outliers detected in 'Popularity': 8574 outliers.
2025-06-12 18:28:59,897 - INFO - Outliers detected in 'Danceability': 611 outliers.
2025-06-12 18:28:59,898 - INFO - Outliers detected in 'Speechiness': 51711 outliers.
2025-06-12 18:28:59,898 - INFO - Outliers detected in 'Liveness': 36698 outliers.
2025-06-12 18:28:59,901 - INFO - Outliers detected in 'Instrumentalness': 127328 outliers.
2025-06-12 18:28:59,901 - INFO - Outliers detected in 'Good for Party': 34084 outliers.
2025-06-12 18:28:59,902 - INFO - Outliers detected in 'Good for Work/Study': 41320 outliers.
2025-06-12 18:28:59,904 - INFO - Outliers detected in 'Good for Relaxation/Meditation': 16969 outliers.
2025-06-12 18:28:59,904 - INFO - Outliers detected in 'Good for Exercise': 103304 outliers.
2025-06-12 18:28:59,905 - INFO - Outliers detected in 'Good for Running': 29691 outliers.
2025-06-12 18:28:59,905 - INFO - 

Tempo: 4 outliers (Lower bound: 32.50, Upper bound: 204.50)
Popularity: 8574 outliers (Lower bound: -14.50, Upper bound: 77.50)
Danceability: 611 outliers (Lower bound: 9.50, Upper bound: 109.50)
Speechiness: 51711 outliers (Lower bound: -12.50, Upper bound: 31.50)
Liveness: 36698 outliers (Lower bound: -12.50, Upper bound: 47.50)
Instrumentalness: 127328 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Party: 34084 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Work/Study: 41320 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Relaxation/Meditation: 16969 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Exercise: 103304 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Running: 29691 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Yoga/Stretching: 11847 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Driving: 31872 outliers (Lower bound: 0.00, Upper bound: 0.00)
Good for Social Gatherings: 6757 outliers (Lower bound: 0.00, Upp

2025-06-12 18:29:00,211 - INFO - Analyzed unique values and frequencies for 'Artist(s)'.



--- Artist(s) (Unique values: 127334) ---
Artist(s)
Victor J Sefo,Lisi,Mwayz,Sefos.Beats    1603
L.A.B.                                  1428
The Exponents                            726
Corrella                                 706
TEMM DOGG                                701
                                        ... 
Jacob Bellens                              1
Jacob Baron                                1
Jacob Banks,WESTSIDE BOOGIE                1
Jacob Banks,Timbaland                      1
코케                                         1
Name: count, Length: 127334, dtype: int64

--- song (Unique values: 317906) ---


2025-06-12 18:29:00,531 - INFO - Analyzed unique values and frequencies for 'song'.


song
The Chinese Aint Do Tiananmen Square I Did    1603
Heataz                                         593
King Of The Jungle                             575
Hooker Got a Boyfriend                         469
On Da Roadz Freestyle                          418
                                              ... 
If I Ever Get Back To Georgia                    1
I'll Go On Alone                                 1
I'm Glad I Got To See You Once Again             1
I'm Glad I'm On The Inside Looking Out           1
Memories Smiling Tears Remix                     1
Name: count, Length: 317907, dtype: int64


2025-06-12 18:29:03,738 - INFO - Analyzed unique values and frequencies for 'text'.
2025-06-12 18:29:03,771 - INFO - Analyzed unique values and frequencies for 'Length'.
2025-06-12 18:29:03,838 - INFO - Analyzed unique values and frequencies for 'emotion'.



--- text (Unique values: 497496) ---
text
[Hook] Crazy cash, sick cash Tell me, what am I gonna do with all this money? Stupid cash, retarded cash Tell me, what am I gonna do with all this money? Unnecessary cash, let it rain cash Tell me, what am I gonna do with all this money? So much cash, it's too much cash Tell me, what am I gonna do with all this money?  [Verse 1] Uh-uh I done went and got wealth Now these boys wishin' that I was in bad health Models choosin' up Like I'm the last man on Earth Meanwhile, I got these boys feelings hurt Tall pretty nigga it ain't fair I got it all Killer hops, with cash to blow up a mall Life ain't fair for y'all So just watch me ball Will you be a hater like folks that wanna see me fall? I hope they all stall Just watch my new ride crawl Holla' at me at the whole court for a hoop call Bring yo homies even if they can't stand me Just don't take it personal when I get that grammy [Hook] Crazy cash, sick cash Tell me, what am I gonna do with all this

2025-06-12 18:29:03,906 - INFO - Analyzed unique values and frequencies for 'Genre'.
2025-06-12 18:29:04,088 - INFO - Analyzed unique values and frequencies for 'Album'.


Genre
hip hop                           262070
pop                                11451
country                             7392
folk                                4897
jazz                                3913
                                   ...  
electronic,k-pop,pop                   1
electronic,drum and bass,house         1
hip-hop,funk,soul                      1
pop,math rock,indie rock               1
alternative,punk,rock                  1
Name: count, Length: 3097, dtype: int64

--- Album (Unique values: 154681) ---
Album
685 (Remix)                                                                  1603
Greatest Hits                                                                 683
Everything                                                                    602
Road From 26                                                                  599
312 DAY                                                                       593
                                                

2025-06-12 18:29:04,172 - INFO - Analyzed unique values and frequencies for 'Release Date'.
2025-06-12 18:29:04,219 - INFO - Analyzed unique values and frequencies for 'Key'.
2025-06-12 18:29:04,278 - INFO - Analyzed unique values and frequencies for 'Loudness (db)'.
2025-06-12 18:29:04,301 - INFO - Analyzed unique values and frequencies for 'Time signature'.
2025-06-12 18:29:04,336 - INFO - Analyzed unique values and frequencies for 'Explicit'.



--- Release Date (Unique values: 14299) ---
Release Date
1st January 2006      2730
1st January 2011      2466
1st January 2013      2454
1st January 2008      2172
1st January 2007      2096
                      ... 
14th January 1999        1
23rd February 1968       1
21st October 1995        1
16th July 1968           1
10th December 2006       1
Name: count, Length: 14299, dtype: int64

--- Key (Unique values: 24) ---
Key
C Maj     47745
G Maj     47465
D Maj     44568
C# Maj    44409
A Maj     33849
G# Maj    26045
B min     25772
E min     22348
F Maj     22298
A min     21094
A# min    20261
B Maj     19764
E Maj     19633
F# min    19529
F# Maj    19470
F min     19073
C# min    18583
A# Maj    16250
G min     14558
C min     12224
D min     10455
G# min    10293
D# Maj     9233
D# min     6524
Name: count, dtype: int64

--- Loudness (db) (Unique values: 3746) ---
Loudness (db)
-6db        10611
-7db         8986
-5db         8221
-8db         7010
-4db         6528
        

2025-06-12 18:29:04,500 - INFO - Analyzed unique values and frequencies for 'Similar Artist 1'.



--- Similar Artist 1 (Unique values: 90280) ---
Similar Artist 1
Victor J Sefo,Lisi,Mwayz,Sefos.Beats    1610
L.A.B.                                  1500
The Exponents                            756
Corrella                                 755
Kora                                     721
                                        ... 
redknobs                                   1
Daniel Sahuleka                            1
Erin Kirby                                 1
CHRIS VAYLE                                1
CZ Wang,Neo Image,Separated At Birth       1
Name: count, Length: 90280, dtype: int64

--- Similar Song 1 (Unique values: 218726) ---


2025-06-12 18:29:04,732 - INFO - Analyzed unique values and frequencies for 'Similar Song 1'.
2025-06-12 18:29:04,877 - INFO - Analyzed unique values and frequencies for 'Similar Artist 2'.


Similar Song 1
MAD vs G2 | 2021 Summer    1603
Dont Watch That             593
Still Tryna Get It          575
8 Mile Road                 470
Snakes                      451
                           ... 
Humble Neighborhood           1
Manhood                       1
End  Start Again              1
Photo Copied                  1
NEVER EXISTED                 1
Name: count, Length: 218727, dtype: int64

--- Similar Artist 2 (Unique values: 87533) ---
Similar Artist 2
Victor J Sefo,Lisi,Mwayz,Sefos.Beats    1615
L.A.B.                                  1513
Corrella                                 802
The Exponents                            780
Kora                                     745
                                        ... 
The Solo Committee                         1
Jim Brickman,Martina McBride               1
Steffany Gretzinger,Bobby Strand           1
The Keystones                              1
James Lloyd                                1
Name: count, Length: 87533, dt

2025-06-12 18:29:05,128 - INFO - Analyzed unique values and frequencies for 'Similar Song 2'.
2025-06-12 18:29:05,268 - INFO - Analyzed unique values and frequencies for 'Similar Artist 3'.


Similar Song 2
Hit the Road                   1603
Somebody                        657
All The Way G                   575
Shots on the Hood of My Car     469
My Nigga                        418
                               ... 
Look At Her Remix                 1
What About Now?                   1
None of Dem                       1
Born Never Asked                  1
Closer To Your Heart              1
Name: count, Length: 207671, dtype: int64

--- Similar Artist 3 (Unique values: 86963) ---
Similar Artist 3
Victor J Sefo,Lisi,Mwayz,Sefos.Beats                  1618
L.A.B.                                                1503
Corrella                                               835
The Exponents                                          795
Kora                                                   760
                                                      ... 
Fabolous,Jeremih                                         1
Wu-Tang Clan,Inspectah Deck,Masta Killa,Cappadonna       1
P-Lo,Kehl

2025-06-12 18:29:05,466 - INFO - Analyzed unique values and frequencies for 'Similar Song 3'.
2025-06-12 18:29:05,480 - INFO - Displayed distribution of the 'emotion' target variable.
2025-06-12 18:29:05,481 - INFO - Data exploration completed successfully.


Similar Song 3
J. Cole No Role Modelz                                     1603
IDGAF                                                       614
Method Man  Black Thought Sway In The Morning Freestyle     593
Only One Road                                               469
In My Bag                                                   425
                                                           ... 
Ambassel                                                      1
Get to know you girl                                          1
Shadows of da Gods                                            1
CHERRY BOMB Performance Ver.                                  1
Light in Places                                               1
Name: count, Length: 206001, dtype: int64

### 'emotion' column distribution (Target Variable) ###
emotion
joy          209009
sadness      171078
anger        109678
fear          28097
love          27963
surprise       5592
True             17
Love              3
pink          