# Spotify Tracks Dataset - Data Cleaning & EDA

In [None]:

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:

# Read the raw track data from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="spotify_tracks.csv")

# Preview the first five rows (shown as the cell output when run in a notebook)
df.head(n=5)


In [None]:

# Basic Information about the dataset
# Uses pandas' built-in summary report for the frame loaded above; the call
# prints its report rather than returning a value to inspect.
df.info()


In [None]:

# Count missing entries per column: build the boolean "is missing" mask,
# then total it column by column
missing_mask = df.isna()
missing_mask.sum()


In [None]:

# Fill missing values
# Numeric columns are filled with their median; text (object) columns are
# filled with their most frequent value.
#
# The filled column is assigned back to `df` explicitly. Calling
# `df[col].fillna(..., inplace=True)` mutates an intermediate Series: pandas
# 2.x warns about that chained in-place form, and with Copy-on-Write enabled
# it leaves `df` untouched.
for col in df.select_dtypes(include=['number']).columns:
    df[col] = df[col].fillna(df[col].median())

for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    # A column with no non-null values has an empty mode; skip it instead of
    # failing on the lookup of the first mode.
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])

# Check again to confirm
df.isnull().sum()


In [None]:

# Normalise text columns: lowercase every value, then trim surrounding
# whitespace. Columns whose dtype is not "object" are passed through unchanged.
df = df.apply(
    lambda column: column
    if column.dtype != "object"
    else column.str.lower().str.strip()
)


In [None]:

# Handling Outliers using IQR Method
# Work on the numeric columns only. Comparing the full frame (which also holds
# text columns) against bounds indexed by the numeric columns alone is a
# misaligned DataFrame-vs-Series comparison, which recent pandas versions
# reject with "Operands are not aligned".
numeric_df = df.select_dtypes(include=['number'])

Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# Tukey fences: values more than 1.5 * IQR outside the quartiles are outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Remove outliers
# A row is dropped when ANY of its numeric values falls outside its column's
# fences. NOTE(review): a column whose IQR is 0 keeps only rows equal to that
# column's quartile value -- confirm this is intended for discrete columns.
is_outlier = ((numeric_df < lower_bound) | (numeric_df > upper_bound)).any(axis=1)
df = df[~is_outlier]


In [None]:

# Danceability distribution: 30-bin histogram with a KDE curve overlaid,
# drawn on an explicitly created 6x4 figure
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df["danceability"], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Danceability")
plt.show()


In [None]:

# Horizontal boxplot of the energy column on an explicitly created 6x4 figure
fig, ax = plt.subplots(figsize=(6, 4))
sns.boxplot(x=df["energy"], ax=ax)
ax.set_title("Boxplot of Energy")
plt.show()


In [None]:

# Scatter of energy (x axis) against loudness (y axis) on an explicitly
# created 6x4 figure
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=df["energy"], y=df["loudness"], ax=ax)
ax.set_title("Energy vs Loudness")
plt.show()


In [None]:

# Save cleaned dataset
# Writes the current contents of `df` to "cleaned_spotify_tracks.csv";
# index=False keeps the DataFrame's row index out of the output file.
df.to_csv("cleaned_spotify_tracks.csv", index=False)
