In [2]:
# We start by importing the required libraries and defining the paths the data is read from and the outputs are written to
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re
import logging
import pandas as pd


# Relative locations of the input dataset, the cleaned output file and the figure folder.
FILE_PATH = "../Data/Train.csv"
OUTPUT_PATH = "../Data/2_cleaned_songs_dataset.csv"
IMAGE_DIR = "../images"

# Columns that clean_data() box-plots and then removes from the dataset.
DROP_COLUMNS = ["age", "len"]

# Root logger: INFO level, each record prefixed with a timestamp and the level name.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [3]:

# Sanitize text values by removing special characters that some titles might contain.
def sanitize_data_values(df):
    """Strip special characters from the string cells of every object-dtype column.

    Every character other than an ASCII letter, digit, underscore or space is
    removed. Cells that are not ``str`` instances (for example the float NaN
    that marks a missing value) are passed through unchanged; calling
    ``re.sub`` on them would raise ``TypeError``, and this function runs
    before missing values are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sanitize. Its object columns are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    disallowed = re.compile(r'[^a-zA-Z0-9_ ]')

    def _strip(value):
        # Only real strings can be regex-substituted; leave NaN/None/numbers as they are.
        return disallowed.sub('', value) if isinstance(value, str) else value

    for column in df.select_dtypes(include=['object']).columns:
        df[column] = df[column].map(_strip)
    return df

# Helper for clean_data: make a column name usable as part of a file name.
def _filename_safe(name):
    """Return ``name`` as text with path separators and other file-name-invalid characters replaced by ``_``."""
    return re.sub(r'[\\/:*?"<>|]', '_', str(name))


# clean_data performs the full cleaning pass on the dataset and saves the diagnostic plots.
def clean_data():
    """Clean the dataset at ``FILE_PATH`` and write the result to ``OUTPUT_PATH``.

    Steps, in order:

    1. Read the CSV and sanitize its text values with ``sanitize_data_values``.
    2. Drop rows with missing values, then duplicate rows.
    3. Save one boxplot per column listed in ``DROP_COLUMNS`` (when present),
       then drop those columns.
    4. Save one histogram per remaining column.
    5. Write the cleaned frame to ``OUTPUT_PATH`` without the index.

    Figures are written under ``IMAGE_DIR``, which is created if it does not
    exist. Progress is reported with ``print``. Returns ``None``.
    """
    # savefig/to_csv do not create missing folders, so make sure they exist first.
    os.makedirs(IMAGE_DIR, exist_ok=True)
    output_dir = os.path.dirname(OUTPUT_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df = pd.read_csv(FILE_PATH)
    print(f"The initial shape: {df.shape}")

    df = sanitize_data_values(df)

    # Drop rows with missing values, then exact duplicate rows.
    df = df.dropna().drop_duplicates()
    print(f"The shape after dropping missing values and duplicates: {df.shape}")

    # Save boxplots for the columns in DROP_COLUMNS before they are dropped.
    for i, column in enumerate(DROP_COLUMNS, start=1):
        if column not in df.columns:
            continue
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(x=df[column], ax=ax)
        ax.set_title(f"Boxplot of '{column}'")
        fig.tight_layout()
        fig.savefig(os.path.join(IMAGE_DIR, f"2.{i}_boxplot_{_filename_safe(column)}.png"))
        plt.close(fig)

    # Drop the configured columns; names that are absent are ignored.
    df = df.drop(columns=DROP_COLUMNS, errors="ignore")
    print(f"The dropped columns: {DROP_COLUMNS}")

    # Plot the final distribution of every remaining column. Figure numbering
    # continues after the boxplot slots instead of assuming exactly two of them.
    # NOTE(review): text columns with many distinct values get one bar per value
    # and may be very slow to draw - consider restricting this loop to numeric columns.
    first_hist_index = len(DROP_COLUMNS) + 1
    for i, column in enumerate(df.columns, start=first_hist_index):
        fig, ax = plt.subplots(figsize=(10, 6))
        # A kernel density estimate is only meaningful for numeric data.
        show_kde = pd.api.types.is_numeric_dtype(df[column])
        sns.histplot(df[column], bins=30, kde=show_kde, ax=ax)
        ax.set_title(f"Final Distribution of '{column}' (After Cleaning)")
        ax.set_xlabel(column)
        ax.set_ylabel("Frequency")
        fig.tight_layout()
        # Column names may contain '/' or other characters that are not valid in a file name.
        fig.savefig(os.path.join(IMAGE_DIR, f"2.{i}_final_distribution_{_filename_safe(column)}.png"))
        plt.close(fig)

    # Save the cleaned dataset.
    df.to_csv(OUTPUT_PATH, index=False)
    print(f"The cleaned file has been saved to: {OUTPUT_PATH}")
    print(f"The final shape is as follows: {df.shape}")


In [None]:
# Run the cleaning pipeline when this code is executed as the main module
# (the recorded output below shows it executing in the notebook).
if __name__ == "__main__":
    clean_data()

The initial shape: (28362, 24)
The shape after dropping missing values and duplicates: (28362, 24)
The dropped columns: ['age', 'len']
