In [23]:
import pandas as pd
import numpy as np

# Define functions for specific cleaning tasks
def make_categorical_column(df, column_name, *, bins=4, labels=None):
    """
    Bin a numerical column into labelled categories.

    The result is stored in a new column named "<column_name>_categorical",
    which is added to `df` in place.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The name of the numerical column to bin.
    - bins (int or sequence of numbers, keyword-only): Forwarded to `pd.cut`.
      An int requests that many equal-width bins spanning the column's
      observed min/max, so the category boundaries depend on the data.
      A sequence supplies explicit bin edges, which keeps the categories
      comparable between datasets. Defaults to 4.
    - labels (list of str, keyword-only): One label per bin. Defaults to
      ['Low', 'Moderate', 'High', 'Very High'], which matches the default
      of 4 bins.

    Returns:
    - DataFrame: The same DataFrame object, with the categorical column added.

    Raises:
    - ValueError: Raised by `pd.cut` when the number of labels does not
      match the number of bins.
    """
    if labels is None:
        labels = ['Low', 'Moderate', 'High', 'Very High']
    df[f"{column_name}_categorical"] = pd.cut(df[column_name], bins=bins, labels=labels)
    return df

def move_column(df, column_name, new_position):
    """
    Move a column to a specific position in the DataFrame (in place).

    The column is removed and re-inserted, so `new_position` is interpreted
    against the columns that remain after the removal.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The name of the column to move.
    - new_position (int): The new column index.

    Returns:
    - DataFrame: The same DataFrame object, with the column moved.

    Raises:
    - KeyError: If `column_name` is not a column of `df`.
    - Exception: Whatever `DataFrame.insert` raises for an invalid
      `new_position`; in that case the column is put back where it was
      before the error propagates.
    """
    # Remember where the column started so a failed move can be undone.
    original_position = df.columns.get_loc(column_name)
    column = df.pop(column_name)
    try:
        df.insert(new_position, column_name, column)
    except Exception:
        # Without this, a bad position would leave the caller's DataFrame
        # silently missing the column that was just popped.
        can_restore = (
            isinstance(original_position, (int, np.integer))
            and column_name not in df.columns
        )
        if can_restore:
            df.insert(int(original_position), column_name, column)
        raise
    return df

def split_column_to_n_columns(df, column_name, n, new_column_prefix):
    """
    Split a column of ', '-separated values into exactly `n` columns,
    replacing the original column and keeping the new columns in the same position.

    At most `n - 1` splits are performed per value, so when a value has more
    than `n` parts the last new column holds the unsplit remainder
    (e.g. 'A, B, C, D' with n=3 gives 'A', 'B', 'C, D'). Values with fewer
    than `n` parts leave the trailing columns empty (missing).

    Parameters:
    - df (DataFrame): The DataFrame containing the column to split.
    - column_name (str): The name of the column to split.
    - n (int): The number of new columns to create.
    - new_column_prefix (str): Prefix for the names of the new columns;
      they are named "<prefix>_1" ... "<prefix>_n".

    Returns:
    - DataFrame: A new DataFrame with the split columns replacing the
      original column. The input DataFrame is not modified.
    """
    # Find the original column index to insert new columns in the same position
    col_idx = df.columns.get_loc(column_name)

    # Split the column, limiting the number of splits so at most `n` parts come back
    split_cols = df[column_name].str.split(', ', n=n-1, expand=True).iloc[:, :n]

    # `expand=True` only creates as many columns as the longest row needs.
    # Pad with empty columns so there are always exactly `n`; otherwise the
    # rename below fails with a length mismatch when no row has `n` parts.
    split_cols = split_cols.reindex(columns=range(n))
    split_cols.columns = [f"{new_column_prefix}_{i+1}" for i in range(n)]

    # Drop the original column (returns a new DataFrame)
    df = df.drop(columns=[column_name])

    # Insert new columns at the original column position
    for i, new_col in enumerate(split_cols.columns):
        df.insert(col_idx + i, new_col, split_cols[new_col])

    return df
    
def clean_project_data(df):
    """
    Run the full cleaning pipeline on the Spotify tracks DataFrame.

    Steps, in order: drop duplicate rows, split 'artists' and 'track_genre'
    into three columns each, drop rows missing any audio feature, add a
    "<feature>_categorical" column per audio feature, rescale 'popularity'
    by 1/100 and categorize it, and replace 'duration_ms' with
    'duration_minutes' (rounded to 2 decimals).

    Parameters:
    - df (DataFrame): The original Spotify DataFrame.

    Returns:
    - DataFrame: The cleaned DataFrame.
    """
    audio_features = [
        'danceability', 'energy', 'acousticness',
        'instrumentalness', 'liveness', 'speechiness', 'valence'
    ]

    # De-duplicate first so later steps work on unique rows only
    cleaned = df.drop_duplicates()

    # Expand the comma-separated fields into three columns apiece
    for source_column, prefix in (('artists', 'artist'), ('track_genre', 'track_genre')):
        cleaned = split_column_to_n_columns(cleaned, source_column, 3, prefix)

    # Rows lacking any audio feature cannot be categorized, so discard them
    cleaned = cleaned.dropna(subset=audio_features)
    cleaned = cleaned.reset_index(drop=True)

    for feature in audio_features:
        cleaned = make_categorical_column(cleaned, feature)

    # Rescale popularity by 1/100 before binning it
    cleaned['popularity'] = cleaned['popularity'] / 100
    cleaned = make_categorical_column(cleaned, 'popularity')

    # Duration: milliseconds -> minutes, rounded to two decimals
    minutes = cleaned['duration_ms'] / 60000
    cleaned['duration_minutes'] = minutes.round(2)
    # NOTE(review): position 8 is hard-coded; presumably it is where
    # 'duration_ms' sits in the expected schema - confirm against the input file.
    cleaned = move_column(cleaned, 'duration_minutes', 8)
    cleaned = cleaned.drop(columns=['duration_ms'])

    # 'key', 'loudness' and 'mode' are intentionally kept; drop them here
    # if they turn out to be unnecessary for the analysis.
    return cleaned

# Read the raw tracks file and run it through the cleaning pipeline
df = pd.read_csv('KD_random_tracks.csv')
cleaned_df = clean_project_data(df)

# Show the dtype of every column in the cleaned DataFrame
print(cleaned_df.dtypes)

# Report how many missing values remain in each column
missing_per_column = cleaned_df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Show the range of every numeric column as a sanity check
numeric_df = cleaned_df.select_dtypes(include='number')

print("\nMax values in numeric columns:")
print(numeric_df.max())

print("\nMin values in numeric columns:")
print(numeric_df.min())

# List the distinct values found in a few key categorical columns
categorical_columns = [
    'popularity_categorical',
    'explicit',
    'danceability_categorical',
    'energy_categorical',
    'speechiness_categorical',
]

for col in categorical_columns:
    distinct_values = cleaned_df[col].unique()
    print(f"\nUnique values in column '{col}':")
    print(distinct_values)

# Persist the cleaned data (without the index) and render it in the notebook
cleaned_df.to_csv('cleaned_spotify_data.csv', index=False)
display(cleaned_df)


track_id                          object
artist_1                          object
artist_2                          object
artist_3                          object
album_name                        object
release_date                      object
album_image_url                   object
track_name                        object
duration_minutes                 float64
popularity                       float64
explicit                            bool
available_markets                 object
track_external_url                object
track_genre_1                     object
track_genre_2                     object
track_genre_3                     object
artist_popularity                float64
artist_followers                 float64
artist_image_url                  object
artist_external_url               object
danceability                     float64
energy                           float64
key                              float64
loudness                         float64
mode            

Unnamed: 0,track_id,artist_1,artist_2,artist_3,album_name,release_date,album_image_url,track_name,duration_minutes,popularity,...,analysis_url,time_signature,danceability_categorical,energy_categorical,acousticness_categorical,instrumentalness_categorical,liveness_categorical,speechiness_categorical,valence_categorical,popularity_categorical
0,21B4gaTWnTkuSh77iWEXdS,Sabrina Carpenter,,,Short n' Sweet,2024-08-23,https://i.scdn.co/image/ab67616d0000b273fd8d7a...,Juno,3.72,0.88,...,https://api.spotify.com/v1/audio-analysis/21B4...,4.0,High,High,Low,Low,Low,Low,High,Very High
1,0QkWikH5Z3U0f79T9iuF6c,Lady Gaga,,,Born This Way,2011-01-01,https://i.scdn.co/image/ab67616d0000b2734ba15b...,Judas,4.15,0.73,...,https://api.spotify.com/v1/audio-analysis/0QkW...,4.0,High,Very High,Low,Low,Moderate,Low,High,High
2,2mWfVxEo4xZYDaz0v7hYrN,Clairo,,,Charm,2024-07-12,https://i.scdn.co/image/ab67616d0000b273193c2f...,Juna,3.25,0.82,...,https://api.spotify.com/v1/audio-analysis/2mWf...,4.0,High,High,Low,Low,Low,Low,Moderate,Very High
3,1aXV8GrmQLvgoFtBPERP7E,Eyedress,,,Jealous,2019-12-06,https://i.scdn.co/image/ab67616d0000b2734de4da...,Jealous,2.04,0.83,...,https://api.spotify.com/v1/audio-analysis/1aXV...,4.0,Moderate,Very High,Low,Very High,Moderate,Low,High,Very High
4,7KoyXL9zghiNpXkb5iVDyj,Junior H,,,Atrapado en un Sueño,2020-03-27,https://i.scdn.co/image/ab67616d0000b2739c076f...,Jueves 10,4.80,0.69,...,https://api.spotify.com/v1/audio-analysis/7Koy...,3.0,High,High,Moderate,Low,Low,Low,High,High
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13991,2OEDRIC3KxuX2Y9i0b0ikQ,Five Finger Death Punch,,,Got Your Six,2015-09-04,https://i.scdn.co/image/ab67616d0000b27392e4cf...,Jekyll and Hyde,3.45,0.18,...,,4.0,High,Very High,Low,Low,Low,Low,High,Low
13992,5mhW03yGpDpkYHs9aZaxSE,Zac Brown Band,,,JEKYLL + HYDE,2015-04-28,https://i.scdn.co/image/ab67616d0000b2731185c8...,Loving You Easy,2.59,0.57,...,,4.0,High,High,Low,Low,Low,Low,Moderate,High
13993,7l6yY4kttGsr96vM9vY7XX,Jekalyn Carr,,,Changing Your Story,2020-10-23,https://i.scdn.co/image/ab67616d0000b273c1f91c...,Jehovah Jireh - Live,6.23,0.43,...,,4.0,Moderate,High,Low,Low,Moderate,Low,Low,Moderate
13994,1engbDJQ7UNLnDv4EtH9Mn,Jonathan Thulin,Rapture Ruckus,,Science Fiction,2015-03-17,https://i.scdn.co/image/ab67616d0000b2736cc73d...,Jekyll And Hyde,3.09,0.35,...,,4.0,High,High,Low,Low,Low,Low,High,Moderate
