# In [None]:
# Step 1: Load Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from scipy.stats import zscore

# Step 2: Upload the dataset from the local machine into the Colab runtime
import io

from google.colab import files

# files.upload() opens a browser file picker and returns a dict that maps each
# uploaded file name to the raw bytes of that file.
uploaded = files.upload()

# Fallback location, used only when nothing was uploaded in this run (for
# example when the file is already present in the runtime).
# Replace this path with your actual file location if it differs.
DATASET_PATH = "/content/customer_behavior_analytcis.csv"

if uploaded:
    # Read the bytes that were just uploaded instead of a hard-coded path, so
    # loading does not depend on the uploaded file having one exact name.
    # If several files were selected, the first one is used.
    uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
    df = pd.read_csv(io.BytesIO(uploaded_bytes))
else:
    df = pd.read_csv(DATASET_PATH)

# Step 3: Data Exploration
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would emit an extra "None" line after the report.
df.info()
print(df.head())

# Step 4: Handle Missing Values
# A median only exists for numeric data, so work on the numeric subset and
# fill each gap with the median of its own column.
numeric_df = df.select_dtypes(include=np.number)
column_medians = numeric_df.median()
numeric_df = numeric_df.fillna(column_medians)
# Write the imputed numeric columns back into the main DataFrame
df[numeric_df.columns] = numeric_df

# Step 5: Drop Unnecessary Columns
# The customer ID is removed because it is not needed
df = df.drop(columns=['customer_id'])

# Step 6: Check for Duplicates
# Flag every row that repeats an earlier row, report the count, then keep
# only the first occurrence of each distinct row.
is_duplicate = df.duplicated()
duplicate_rows = is_duplicate.sum()
print(f"Duplicate Rows: {duplicate_rows}")
df = df[~is_duplicate]

# Step 7: Detect and Remove Outliers Using Z-Score
outlier_threshold = 3  # Typically, a Z-score above 3 is considered an outlier

# Z-scores are only defined for numeric data; restricting to numeric columns
# keeps this step from raising if a text/categorical column is present.
numeric_columns = df.select_dtypes(include=np.number).columns

# Absolute z-score of every numeric cell, computed column by column (axis=0).
# Wrapping the result in a DataFrame keeps row and column labels attached.
z_scores = pd.DataFrame(
    np.abs(zscore(df[numeric_columns].to_numpy(dtype=float), axis=0)),
    index=df.index,
    columns=numeric_columns,
)

# One mask drives both the report and the filter so they always agree.
# A constant column has zero spread and yields NaN z-scores; NaN > threshold
# is False, so such a column flags nothing instead of discarding every row.
is_outlier = z_scores > outlier_threshold
outliers = is_outlier.sum(axis=0)
print(f"Outliers per column: \n{outliers}")

# Remove outliers: keep the rows with no flagged value in any numeric column.
# .copy() makes df_cleaned independent of df for any later modification.
df_cleaned = df[~is_outlier.any(axis=1)].copy()

# Step 8: Summary Statistics After Cleaning
print(df_cleaned.describe())

# Save Cleaned Data
df_cleaned.to_csv("cleaned_customer_behavior.csv", index=False)
print("Cleaned dataset saved as 'cleaned_customer_behavior.csv'")