# Customer Data Cleaning - CP610 Deliverable #2

**Purpose**: Clean and prepare customer data for analysis  
**Input**: `../datasources/Customers_v4.csv`  
**Output**: `../output_data/Customers_cleaned.csv`  
**Strategy**: See `../CLEANING_STRATEGY.md` for detailed methodology

---

## Table of Contents
1. [Load & Initial Exploration](#1)
2. [Data Quality Assessment](#2)
3. [Data Cleaning Operations](#3)
4. [Data Validation](#4)
5. [Export Cleaned Data](#5)
6. [Data Quality Report](#6)

---
## 1. Load & Initial Exploration

In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Display settings: passing None removes the cap on displayed columns and
# unsets the fixed display width.
for option_name in ('display.max_columns', 'display.width'):
    pd.set_option(option_name, None)

# Record which library versions produced the outputs in this notebook
pandas_version = pd.__version__
numpy_version = np.__version__

print("Libraries imported successfully!")
print(f"Pandas version: {pandas_version}")
print(f"NumPy version: {numpy_version}")

Libraries imported successfully!
Pandas version: 2.3.3
NumPy version: 2.0.2


In [2]:
# Read the raw customer extract into `df`; the later cells all start from this frame
source_path = '../datasources/Customers_v4.csv'
df = pd.read_csv(source_path)

# Summarise what was loaded: dimensions and deep memory footprint in KB
row_count, column_count = df.shape
memory_kb = df.memory_usage(deep=True).sum() / 1024
rule = "=" * 60

print(rule)
print("CUSTOMER DATA LOADED")
print(rule)
print(f"Dataset shape: {row_count} rows × {column_count} columns")
print(f"Memory usage: {memory_kb:.2f} KB")
print(rule)

CUSTOMER DATA LOADED
Dataset shape: 1000 rows × 9 columns
Memory usage: 454.64 KB


In [3]:
# First look at the raw frame: schema, then the first and last ten rows
def _section_header(title):
    """Print a blank line, then `title` framed by two 60-character rules."""
    rule = "=" * 60
    print("\n" + rule)
    print(title)
    print(rule)


# Column names, non-null counts and dtypes
_section_header("DATASET INFORMATION")
df.info()

# Both ends of the frame, each under its own header
for title, take_rows in (("FIRST 10 ROWS", df.head), ("LAST 10 ROWS", df.tail)):
    _section_header(title)
    display(take_rows(10))


DATASET INFORMATION
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Customer ID       1000 non-null   object 
 1   Customer Name     1000 non-null   object 
 2   Gender            1000 non-null   object 
 3   City              1000 non-null   object 
 4   Province/State    1000 non-null   object 
 5   Region            1000 non-null   object 
 6   Membership Level  1000 non-null   object 
 7   Customer Age      1000 non-null   int64  
 8   Tenure (Years)    1000 non-null   float64
dtypes: float64(1), int64(1), object(7)
memory usage: 70.4+ KB

FIRST 10 ROWS


Unnamed: 0,Customer ID,Customer Name,Gender,City,Province/State,Region,Membership Level,Customer Age,Tenure (Years)
0,CUST_0001,Jia Brown,Female,San Diego,CA,West,Standard,44,5.1
1,CUST_0002,Victor Rodriguez,Male,Detroit,MI,Midwest,Platinum,60,1.6
2,CUST_0003,Nadia Jones,Female,Dallas,TX,South,Standard,50,6.6
3,CUST_0004,Jack Park,Male,San Antonio,TX,South,Standard,40,10.0
4,CUST_0005,Scarlett Roberts,Female,Atlanta,GA,South,Standard,39,4.5
5,CUST_0006,Mei Anderson,Female,Miami,FL,South,Standard,58,3.5
6,CUST_0007,Valentina Young,Female,Dallas,TX,South,Platinum,41,4.0
7,CUST_0008,James Ramirez,Male,Portland,OR,West,Standard,23,1.2
8,CUST_0009,Hamza Perez,Male,New York,NY,Northeast,Gold,32,8.8
9,CUST_0010,Benjamin Taylor,Male,Austin,TX,South,Standard,64,4.1



LAST 10 ROWS


Unnamed: 0,Customer ID,Customer Name,Gender,City,Province/State,Region,Membership Level,Customer Age,Tenure (Years)
990,CUST_0991,Liam Garcia,Male,Baltimore,MD,Northeast,Gold,42,9.1
991,CUST_0992,Matthew Young,Male,Las Vegas,NV,West,Gold,69,7.2
992,CUST_0993,Mateo Martinez,Male,Columbus,OH,Midwest,Gold,47,5.8
993,CUST_0994,Naomi Williams,Female,Jacksonville,FL,South,Standard,62,8.0
994,CUST_0995,Camila Khan,Female,Houston,TX,South,Gold,66,8.6
995,CUST_0996,Elizabeth Carter,Female,San Diego,CA,West,Gold,45,4.6
996,CUST_0997,Andrew Khan,Male,Detroit,MI,Midwest,Standard,44,4.8
997,CUST_0998,Victor Walker,Male,Boston,MA,Northeast,Standard,48,3.9
998,CUST_0999,Zoe Carvalho,Female,Denver,CO,West,Standard,38,9.7
999,CUST_1000,Amir Roy,Male,Philadelphia,PA,Northeast,Standard,52,7.0


In [4]:
# Describe every column (include='all' covers the object columns as well as the numeric ones)
rule = "=" * 60
print(rule, "SUMMARY STATISTICS", rule, sep="\n")

summary_table = df.describe(include='all')
display(summary_table)

SUMMARY STATISTICS


Unnamed: 0,Customer ID,Customer Name,Gender,City,Province/State,Region,Membership Level,Customer Age,Tenure (Years)
count,1000,1000,1000,1000,1000,1000,1000,1000.0,1000.0
unique,1000,946,2,45,29,6,3,,
top,CUST_0001,Julian Lewis,Female,New York,TX,South,Standard,,
freq,1,3,511,74,159,312,609,,
mean,,,,,,,,45.087,4.9696
std,,,,,,,,14.605504,2.867766
min,,,,,,,,20.0,0.0
25%,,,,,,,,33.0,2.5
50%,,,,,,,,45.0,4.9
75%,,,,,,,,58.0,7.5


---
## 2. Data Quality Assessment

In [5]:
# Count nulls per column and express each count as a percentage of all rows
rule = "=" * 60
print(rule, "MISSING VALUES ANALYSIS", rule, sep="\n")

missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100
missing_summary = pd.DataFrame({'Missing_Count': missing, 'Percentage': missing_pct})
total_missing = missing.sum()

print(missing_summary)
print(f"\nTotal missing values: {total_missing}")

# Only list the affected columns when there is something to report
if total_missing <= 0:
    print("\n✅ No missing values found!")
else:
    print("\n⚠️ Columns with missing values:")
    print(missing_summary.loc[missing_summary['Missing_Count'] > 0])

MISSING VALUES ANALYSIS
                  Missing_Count  Percentage
Customer ID                   0         0.0
Customer Name                 0         0.0
Gender                        0         0.0
City                          0         0.0
Province/State                0         0.0
Region                        0         0.0
Membership Level              0         0.0
Customer Age                  0         0.0
Tenure (Years)                0         0.0

Total missing values: 0

✅ No missing values found!


In [6]:
# Check for duplicate Customer IDs
#
# A row counts as a duplicate when its Customer ID already appeared in an
# earlier row. The count comes from `duplicated()` rather than from
# "total rows minus nunique()": `nunique()` skips NaN, so the subtraction would
# report a row with a missing ID as a duplicate even though the listing below
# would not show it. Counting this way also matches what Step 1
# (`drop_duplicates(keep='first')`) removes and what the quality report records.
print("="*60)
print("DUPLICATE CUSTOMER IDs CHECK")
print("="*60)

customer_ids = df['Customer ID']
total_rows = len(df)
unique_ids = customer_ids.nunique()
duplicate_count = int(customer_ids.duplicated(keep='first').sum())

print(f"Total rows: {total_rows:,}")
print(f"Unique Customer IDs: {unique_ids:,}")
print(f"Duplicate Customer IDs: {duplicate_count:,}")

if duplicate_count > 0:
    print(f"\n⚠️ Found {duplicate_count} duplicate Customer ID(s)")

    # Show every row that shares its Customer ID with another row
    # (keep=False flags all members of each group, not just the repeats)
    duplicates = df[df.duplicated(subset=['Customer ID'], keep=False)].sort_values('Customer ID')
    print("\nDuplicate Customer IDs:")
    display(duplicates)
else:
    print("\n✅ No duplicate Customer IDs found!")

DUPLICATE CUSTOMER IDs CHECK
Total rows: 1,000
Unique Customer IDs: 1,000
Duplicate Customer IDs: 0

✅ No duplicate Customer IDs found!


In [7]:
# Validate Customer Age
print("="*60)
print("CUSTOMER AGE VALIDATION")
print("="*60)

age_stats = df['Customer Age'].describe()
print(age_stats)

# Flag ages outside 18-100 (both bounds inclusive). The mask is kept as a
# standalone Series instead of being stored as a column on `df`, so this
# diagnostic check does not modify the raw frame.
age_valid = df['Customer Age'].between(18, 100)
invalid_ages = df[~age_valid]

print(f"\nAge range: {df['Customer Age'].min()} - {df['Customer Age'].max()}")
print(f"Invalid ages (< 18 or > 100): {len(invalid_ages)}")

if len(invalid_ages) > 0:
    print("\n⚠️ Customers with invalid ages:")
    display(invalid_ages[['Customer ID', 'Customer Name', 'Customer Age']])
else:
    print("\n✅ All ages are within valid range (18-100)")

CUSTOMER AGE VALIDATION
count    1000.000000
mean       45.087000
std        14.605504
min        20.000000
25%        33.000000
50%        45.000000
75%        58.000000
max        70.000000
Name: Customer Age, dtype: float64

Age range: 20 - 70
Invalid ages (< 18 or > 100): 0

✅ All ages are within valid range (18-100)


In [8]:
# Validate Tenure
print("="*60)
print("TENURE VALIDATION")
print("="*60)

tenure_stats = df['Tenure (Years)'].describe()
print(tenure_stats)

# Flag tenure outside 0-50 years (both bounds inclusive). The mask is kept as a
# standalone Series instead of being stored as a column on `df`, so this
# diagnostic check does not modify the raw frame.
tenure_valid = df['Tenure (Years)'].between(0, 50)
invalid_tenure = df[~tenure_valid]

print(f"\nTenure range: {df['Tenure (Years)'].min()} - {df['Tenure (Years)'].max()}")
print(f"Invalid tenure (< 0 or > 50): {len(invalid_tenure)}")

if len(invalid_tenure) > 0:
    print("\n⚠️ Customers with invalid tenure:")
    display(invalid_tenure[['Customer ID', 'Customer Name', 'Tenure (Years)']])
else:
    print("\n✅ All tenure values are within valid range (0-50)")

TENURE VALIDATION
count    1000.000000
mean        4.969600
std         2.867766
min         0.000000
25%         2.500000
50%         4.900000
75%         7.500000
max        10.000000
Name: Tenure (Years), dtype: float64

Tenure range: 0.0 - 10.0
Invalid tenure (< 0 or > 50): 0

✅ All tenure values are within valid range (0-50)


In [10]:
# Profile the categorical columns: distinct count plus a frequency table for each
rule = "=" * 60
print(rule, "CATEGORICAL VALUES ANALYSIS", rule, sep="\n")

categorical_cols = ['Gender', 'Region', 'Membership Level']

for col in categorical_cols:
    column_values = df[col]
    distinct_count = column_values.nunique()
    frequency_table = column_values.value_counts().to_string()

    print(f"\n{col}:")
    print(f"  Unique values: {distinct_count}")
    print("  Value counts:")
    print(frequency_table)
    print()

CATEGORICAL VALUES ANALYSIS

Gender:
  Unique values: 2
  Value counts:
Gender
Female    511
Male      489


Region:
  Unique values: 6
  Value counts:
Region
South             312
West              268
Northeast         182
Midwest           138
Eastern Canada     62
Western Canada     38


Membership Level:
  Unique values: 3
  Value counts:
Membership Level
Standard    609
Gold        243
Platinum    148



---
## 3. Data Cleaning Operations

In [11]:
# Clone the loaded frame; the cleaning steps below reassign `df_clean`, not `df`
df_clean = df.copy()
starting_rows = len(df_clean)

rule = "=" * 60
print(rule, "STARTING DATA CLEANING", rule, sep="\n")
print(f"Starting with {starting_rows} rows\n")

STARTING DATA CLEANING
Starting with 1000 rows



In [12]:
# Step 1: Drop rows whose Customer ID repeats an earlier row (the first occurrence stays)
rows_before = len(df_clean)
df_clean = df_clean.drop_duplicates(subset=['Customer ID'], keep='first')
rows_after = len(df_clean)
rows_dropped = rows_before - rows_after

print("✓ Step 1: Remove duplicate Customer IDs")
print(f"  Rows removed: {rows_dropped}")
print(f"  Rows remaining: {rows_after:,}\n")

✓ Step 1: Remove duplicate Customer IDs
  Rows removed: 0
  Rows remaining: 1,000



In [13]:
# Step 2: Handle missing values in critical fields
# Rows lacking a Customer ID or a Customer Name are dropped outright.
before_missing = len(df_clean)
df_clean = df_clean.dropna(subset=['Customer ID', 'Customer Name'])
after_missing = len(df_clean)

print("✓ Step 2: Remove rows with missing Customer ID or Name")
print(f"  Rows removed: {before_missing - after_missing}")
print(f"  Rows remaining: {after_missing:,}\n")

# Impute missing ages with median (if any)
if df_clean['Customer Age'].isnull().any():
    median_age = df_clean['Customer Age'].median()
    missing_age_count = df_clean['Customer Age'].isnull().sum()
    # Assign the filled Series back to the column. Calling
    # fillna(..., inplace=True) on `df_clean['Customer Age']` is chained
    # assignment on an intermediate object: pandas 2.x warns about it and,
    # with Copy-on-Write enabled, it leaves `df_clean` unchanged.
    df_clean['Customer Age'] = df_clean['Customer Age'].fillna(median_age)
    print(f"✓ Step 2a: Imputed {missing_age_count} missing age values with median ({median_age})\n")
else:
    print("✓ Step 2a: No missing age values to impute\n")

✓ Step 2: Remove rows with missing Customer ID or Name
  Rows removed: 0
  Rows remaining: 1,000

✓ Step 2a: No missing age values to impute



In [14]:
# Step 3: Standardize text fields (leading/trailing whitespace is stripped)
text_columns = ['Customer Name', 'Gender', 'City', 'Province/State', 'Region', 'Membership Level']

print("✓ Step 3: Standardize text fields")

# Columns from the list that are absent from df_clean are skipped
for col in (name for name in text_columns if name in df_clean.columns):
    trimmed_values = df_clean[col].str.strip()
    df_clean[col] = trimmed_values
    print(f"  - Trimmed whitespace in '{col}'")

print()

✓ Step 3: Standardize text fields
  - Trimmed whitespace in 'Customer Name'
  - Trimmed whitespace in 'Gender'
  - Trimmed whitespace in 'City'
  - Trimmed whitespace in 'Province/State'
  - Trimmed whitespace in 'Region'
  - Trimmed whitespace in 'Membership Level'



In [15]:
# Step 4: Keep only rows with age in 18-100 and tenure in 0-50 (bounds inclusive)
before_validation = len(df_clean)

# Age filter runs first so the tenure count below only covers rows that survived it
age_in_range = df_clean['Customer Age'].between(18, 100)
df_clean = df_clean.loc[age_in_range]
after_age = len(df_clean)

tenure_in_range = df_clean['Tenure (Years)'].between(0, 50)
df_clean = df_clean.loc[tenure_in_range]
after_tenure = len(df_clean)

removed_for_age = before_validation - after_age
removed_for_tenure = after_age - after_tenure

print("✓ Step 4: Remove invalid ages and tenure")
print(f"  Rows removed (invalid age): {removed_for_age}")
print(f"  Rows removed (invalid tenure): {removed_for_tenure}")
print(f"  Rows remaining: {after_tenure:,}\n")

# Discard the helper flag columns if present; errors='ignore' makes this a no-op otherwise
helper_columns = ['age_valid', 'tenure_valid']
df_clean = df_clean.drop(columns=helper_columns, errors='ignore')

✓ Step 4: Remove invalid ages and tenure
  Rows removed (invalid age): 0
  Rows removed (invalid tenure): 0
  Rows remaining: 1,000



---
## 4. Data Validation

In [17]:
# Final validation checks
#
# Each check prints its own result line and, when it fails, is recorded in
# `failed_checks`. The closing banner reports success only when that list is
# empty, so the summary can no longer claim "passed" after a failed check.
print("="*60)
print("FINAL VALIDATION CHECKS")
print("="*60)


def _status_mark(passed):
    """Return the prefix for a check line: '✓' when it passed, '✗' otherwise."""
    return "✓" if bool(passed) else "✗"


failed_checks = []

# Check 1: Customer ID uniqueness
is_unique = df_clean['Customer ID'].is_unique
print(f"{_status_mark(is_unique)} Customer IDs unique: {is_unique}")
if not is_unique:
    print("  ⚠️ WARNING: Customer IDs are not unique!")
    failed_checks.append("Customer ID uniqueness")

# Check 2: No missing critical fields
critical_fields = ['Customer ID', 'Customer Name']
missing_critical = df_clean[critical_fields].isnull().sum().sum()
print(f"{_status_mark(missing_critical == 0)} Missing values in critical fields: {missing_critical}")
if missing_critical != 0:
    failed_checks.append("missing critical fields")

# Check 3: Valid age range
valid_age = df_clean['Customer Age'].between(18, 100).all()
print(f"{_status_mark(valid_age)} All ages within valid range (18-100): {valid_age}")
if not valid_age:
    failed_checks.append("age range")

# Check 4: Valid tenure range
valid_tenure = df_clean['Tenure (Years)'].between(0, 50).all()
print(f"{_status_mark(valid_tenure)} All tenure within valid range (0-50): {valid_tenure}")
if not valid_tenure:
    failed_checks.append("tenure range")

# Check 5: Data types (informational only; nothing is asserted about them)
print("\n✓ Data types:")
print(df_clean.dtypes.to_string())

print("\n" + "="*60)
if failed_checks:
    print(f"❌ VALIDATION FAILED: {', '.join(failed_checks)}")
else:
    print("✅ ALL VALIDATION CHECKS PASSED!")
print("="*60)

FINAL VALIDATION CHECKS
✓ Customer IDs unique: True
✓ Missing values in critical fields: 0
✓ All ages within valid range (18-100): True
✓ All tenure within valid range (0-50): True

✓ Data types:
Customer ID          object
Customer Name        object
Gender               object
City                 object
Province/State       object
Region               object
Membership Level     object
Customer Age          int64
Tenure (Years)      float64

✅ ALL VALIDATION CHECKS PASSED!


---
## 5. Export Cleaned Data

In [None]:
# Write the cleaned frame to CSV, creating the destination folder when it is missing
import os

output_file = '../output_data/Customers_cleaned.csv'
output_dir = os.path.dirname(output_file)
os.makedirs(output_dir, exist_ok=True)

# index=False leaves the row index out of the file
df_clean.to_csv(output_file, index=False)

exported_rows = len(df_clean)
exported_columns = len(df_clean.columns)
rule = "=" * 60

print(rule, "CLEANED DATA EXPORTED", rule, sep="\n")
print(f"Output file: {output_file}")
print(f"Rows exported: {exported_rows:,}")
print(f"Columns: {exported_columns}")
print(rule)

---
## 6. Data Quality Report

In [None]:
# Generate comprehensive quality report
#
# Compares the loaded frame (`df`) with the cleaned frame (`df_clean`), prints
# the figures, and writes the same figures to a text file next to the cleaned CSV.
original_rows = len(df)
cleaned_rows = len(df_clean)

# Guard the ratio: an empty input frame would otherwise raise ZeroDivisionError
if original_rows:
    percentage_retained = f"{(cleaned_rows / original_rows * 100):.2f}%"
else:
    percentage_retained = "N/A"

quality_report = {
    'Dataset': 'Customer Data',
    'Original_Rows': original_rows,
    'Cleaned_Rows': cleaned_rows,
    'Rows_Removed': original_rows - cleaned_rows,
    'Percentage_Retained': percentage_retained,
    'Duplicate_Customer_IDs_Removed': df.duplicated(subset=['Customer ID']).sum(),
    'Missing_Values_Original': df.isnull().sum().sum(),
    'Missing_Values_Cleaned': df_clean.isnull().sum().sum(),
    'Unique_Customers': df_clean['Customer ID'].nunique(),
    'Age_Range': f"{df_clean['Customer Age'].min():.0f} - {df_clean['Customer Age'].max():.0f}",
    'Tenure_Range': f"{df_clean['Tenure (Years)'].min():.1f} - {df_clean['Tenure (Years)'].max():.1f}",
    'Unique_Regions': df_clean['Region'].nunique(),
    'Unique_Membership_Levels': df_clean['Membership Level'].nunique()
}

# Render the "key: value" lines once so the console and the file always agree
report_lines = [f"{key}: {value}" for key, value in quality_report.items()]

# Print report
print("="*60)
print("CUSTOMER DATA QUALITY REPORT")
print("="*60)
for line in report_lines:
    print(line)
print("="*60)

# Save report to file
report_file = '../output_data/Customer_Quality_Report.txt'
# Explicit encoding so the file does not depend on the platform default
with open(report_file, 'w', encoding='utf-8') as f:
    f.write("CUSTOMER DATA QUALITY REPORT\n")
    f.write("="*60 + "\n\n")
    f.writelines(line + "\n" for line in report_lines)
    f.write("\n" + "="*60 + "\n")
    f.write("\nCleaning completed successfully!\n")
    f.write("Cleaned data available at: output_data/Customers_cleaned.csv\n")

print(f"\n✅ Quality report saved to: {report_file}")