In [None]:
import pandas as pd
import numpy as np

# Define functions for specific cleaning tasks
def make_categorical_column(df, column_name):
    """
    Bin a numerical column into four labelled categories.

    The values of ``column_name`` are passed to ``pd.cut`` with ``bins=4``,
    which splits the observed value range into four equal-width intervals.
    The intervals are labelled, lowest to highest:
    ['Low', 'Moderate', 'High', 'Very High'].

    The result is written to a new column named
    "<column_name>_categorical". The input DataFrame is modified in place.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The name of the numerical column to bin.

    Returns:
    - DataFrame: The same DataFrame object, now holding the new column.
    """
    category_names = ['Low', 'Moderate', 'High', 'Very High']
    target_column = f"{column_name}_categorical"

    binned = pd.cut(df[column_name], bins=4, labels=category_names)
    df[target_column] = binned
    return df

def move_column(df, column_name, new_position):
    """
    Move a column to a specific position in the DataFrame (in place).

    The target position is validated *before* the column is removed, so a
    bad position never leaves the caller's DataFrame with the column missing.

    Parameters:
    - df (DataFrame): The DataFrame to modify.
    - column_name (str): The name of the column to move.
    - new_position (int): The new column index, counted after the column has
      been removed (valid range: 0 .. number of columns - 1).

    Returns:
    - DataFrame: The same DataFrame object with the column moved.

    Raises:
    - KeyError: If `column_name` is not a column of `df`.
    - IndexError: If `new_position` is outside the valid range.
    """
    if column_name in df.columns:
        # Once the column is popped, one fewer column remains, so the largest
        # valid insertion index is len(columns) - 1.
        last_valid = len(df.columns) - 1
        if not 0 <= new_position <= last_valid:
            raise IndexError(
                f"new_position {new_position} is out of bounds for "
                f"column '{column_name}' (valid range: 0..{last_valid})"
            )

    # pop() raises KeyError for an unknown column without modifying df.
    column = df.pop(column_name)
    df.insert(new_position, column_name, column)
    return df

def split_column_to_n_columns(df, column_name, n, new_column_prefix):
    """
    Split a column of ', '-separated values into exactly `n` separate columns,
    replacing the original column and keeping the new columns in its position.

    Each new column holds a single value: column k contains only the k-th
    separated value. Values beyond the n-th are discarded. Rows (or whole
    columns) with fewer than `n` values are filled with missing values, so
    the result always has `n` new columns.

    The input DataFrame is not modified; a new DataFrame is returned.

    Parameters:
    - df (DataFrame): The DataFrame containing the column to split.
    - column_name (str): The name of the column to split.
    - n (int): The number of new columns to create.
    - new_column_prefix (str): Prefix for the names of the new columns
      (columns are named "<prefix>_1" .. "<prefix>_n").

    Returns:
    - DataFrame: The DataFrame with new split columns replacing the original column.
    """
    # Remember where the original column sits so the new ones take its place
    col_idx = df.columns.get_loc(column_name)

    # expand=True yields one integer-labelled column per separated value
    split_cols = df[column_name].str.split(', ', expand=True)

    # Force exactly `n` columns: extra parts are dropped, and missing columns
    # (when no row has `n` values) are added as all-NaN instead of making the
    # rename below fail with a length mismatch.
    split_cols = split_cols.reindex(columns=range(n))

    # Rename the columns with the specified prefix
    split_cols.columns = [f"{new_column_prefix}_{i + 1}" for i in range(n)]

    # Drop the original column (returns a new DataFrame; `df` is untouched)
    result = df.drop(columns=[column_name])

    # Insert the new columns in place of the original column
    for offset, new_col in enumerate(split_cols.columns):
        result.insert(col_idx + offset, new_col, split_cols[new_col])

    return result

def numerical_summary_df(df):
    """
    Build a table of descriptive statistics for the numerical columns of `df`.

    One output row is produced per numerical column (as selected by
    ``select_dtypes(include=['number'])``), holding its mean, median,
    standard deviation, minimum, and maximum.

    Parameters:
    - df (DataFrame): The DataFrame containing columns to summarize.

    Returns:
    - DataFrame: Columns 'Column', 'Mean', 'Median', 'Standard Deviation',
      'Minimum', 'Maximum'; one row per numerical column of `df`.
    """
    numeric_names = list(df.select_dtypes(include=['number']).columns)

    return pd.DataFrame({
        'Column': numeric_names,
        'Mean': [df[name].mean() for name in numeric_names],
        'Median': [df[name].median() for name in numeric_names],
        'Standard Deviation': [df[name].std() for name in numeric_names],
        'Minimum': [df[name].min() for name in numeric_names],
        'Maximum': [df[name].max() for name in numeric_names],
    })

def complete_summary_df(df):
    """
    Build a one-row-per-column summary of `df`.

    Every row records the column name, its dtype, and its missing-value
    count. The remaining statistics depend on the column kind:

    For numerical columns (per ``pd.api.types.is_numeric_dtype``):
      - Mean, Median, Minimum, Maximum are filled in;
        Unique Count and Most Frequent are left as None.

    For non-numerical columns:
      - Unique Count and Most Frequent (first mode, or None when the column
        has no mode) are filled in; Mean, Median, Minimum, Maximum are None.

    Parameters:
    - df (DataFrame): The DataFrame containing columns to summarize.

    Returns:
    - DataFrame: Columns 'Column', 'Type', 'Mean', 'Median', 'Minimum',
      'Maximum', 'Unique Count', 'Most Frequent', 'Missing Count'.
    """
    # Statistics whose values depend on whether the column is numerical
    stat_names = (
        'Mean', 'Median', 'Minimum', 'Maximum', 'Unique Count', 'Most Frequent'
    )
    summary_data = {
        key: [] for key in ('Column', 'Type', *stat_names, 'Missing Count')
    }

    for name in df.columns:
        series = df[name]
        summary_data['Column'].append(name)
        summary_data['Type'].append(series.dtype)

        if pd.api.types.is_numeric_dtype(series):
            stats = (
                series.mean(), series.median(), series.min(), series.max(),
                None, None,
            )
        else:
            modes = series.mode()
            most_frequent = None if modes.empty else modes[0]
            stats = (None, None, None, None, series.nunique(), most_frequent)

        for key, value in zip(stat_names, stats):
            summary_data[key].append(value)
        summary_data['Missing Count'].append(series.isna().sum())

    return pd.DataFrame(summary_data)
    
def clean_project_data(df):
    """
    Clean and transform the Spotify dataset for analysis.

    Steps, in order:
      1. Drop duplicate rows.
      2. Split 'artists' into artist_1..artist_3 and 'track_genre' into
         track_genre_1..track_genre_3.
      3. Drop rows with missing values in any of the audio-feature columns
         listed below, then reset the index.
      4. Add a "<name>_categorical" column for each audio feature.
      5. Divide 'popularity' and 'artist_popularity' by 100 (presumably
         rescaling 0-100 scores to 0-1 -- confirm against the data) and add
         a categorical column for each.
      6. Replace 'duration_ms' with 'duration_minutes' (rounded to 2
         decimals), placed at column position 8 before 'duration_ms' is
         dropped.

    Parameters:
    - df (DataFrame): The original Spotify DataFrame.

    Returns:
    - DataFrame: The cleaned DataFrame.
    """
    audio_features = [
        'danceability', 'energy', 'acousticness',
        'instrumentalness', 'liveness', 'speechiness', 'valence'
    ]

    # Deduplicate first, then expand the multi-valued text columns
    cleaned = df.drop_duplicates()
    for source_column, prefix in (('artists', 'artist'),
                                  ('track_genre', 'track_genre')):
        cleaned = split_column_to_n_columns(cleaned, source_column, 3, prefix)

    # Rows lacking any audio feature cannot be binned, so discard them
    cleaned = cleaned.dropna(subset=audio_features).reset_index(drop=True)
    for feature in audio_features:
        cleaned = make_categorical_column(cleaned, feature)

    # Rescale the popularity scores before binning them
    for score_column in ('popularity', 'artist_popularity'):
        cleaned[score_column] = cleaned[score_column] / 100
        cleaned = make_categorical_column(cleaned, score_column)

    # Duration: milliseconds -> minutes, positioned before the ms column goes
    cleaned['duration_minutes'] = (cleaned['duration_ms'] / 60000).round(2)
    cleaned = move_column(cleaned, 'duration_minutes', 8)

    # Optional: 'key', 'loudness' and 'mode' could also be dropped here if
    # they turn out to be unnecessary for the analysis.
    return cleaned.drop(columns=['duration_ms'])

# Load the raw track data from CSV and run it through the cleaning pipeline
df = pd.read_csv('KD_random_tracks.csv')
cleaned_df = clean_project_data(df)

# Print the dtype of every column in the cleaned DataFrame
print(cleaned_df.dtypes)

# Print the number of missing values per column. This only reports the
# counts for manual inspection; nothing here enforces that they are zero.
print("\nMissing values per column:")
print(cleaned_df.isna().sum())

# Print the per-column maximum and minimum of all numeric columns so the
# value ranges can be checked by eye
print("\nMax values in numeric columns:")
print(cleaned_df.select_dtypes(include='number').max())

print("\nMin values in numeric columns:")
print(cleaned_df.select_dtypes(include='number').min())

# Print the distinct values found in a few selected columns.
# NOTE(review): 'explicit' is not created by the cleaning functions above;
# presumably it comes straight from the CSV -- confirm it exists there.
categorical_columns = [
    'popularity_categorical', 'explicit', 'danceability_categorical', 
    'energy_categorical', 'speechiness_categorical'
]

for col in categorical_columns:
    print(f"\nUnique values in column '{col}':")
    print(cleaned_df[col].unique())

# Save the cleaned DataFrame to a CSV file (row index omitted)
cleaned_df.to_csv('cleaned_spotify_data.csv', index=False)
# NOTE(review): `display` is not imported in the visible code; presumably it
# is supplied by the notebook (IPython) environment -- confirm before running
# this as a plain Python script.
display(cleaned_df)

# Build the per-column summary table and write it to an Excel workbook
# (row index omitted)
summary_df = complete_summary_df(cleaned_df)
summary_df.to_excel('cleaned_df_summary.xlsx', index=False)