# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: DBSCAN for Data Deduplication

**Steps**:
1. Data Set: Download a dataset containing duplicate entries for event registrations.
2. DBSCAN Clustering: Apply the DBSCAN algorithm to cluster similar registrations.
3. Identify Duplicates: Detect duplicates based on density of the clusters.
4. Refinement: Validate clusters and remove any erroneous duplicates.

In [1]:
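# Install required libraries (the leading "!" is Jupyter/IPython shell syntax; remove this line when running as a plain .py script)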
!pip install -q scikit-learn pandas

# Import necessary modules
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
import numpy as np
import unittest

# Toy registration table used to exercise the pipeline. Rows 0, 2 and 6 repeat
# the same name/email/phone triple; every other row is unique.
data = dict(
    name=['Alice', 'Bob', 'Alice', 'Charlie', 'David', 'Eve', 'Alice'],
    email=[
        'alice@example.com',
        'bob@example.com',
        'alice@example.com',
        'charlie@example.com',
        'david@example.com',
        'eve@example.com',
        'alice@example.com',
    ],
    phone=['123', '456', '123', '789', '101', '112', '123'],
)

# Function to load data safely and check for issues
def load_data(data):
    """Build a DataFrame from ``data`` and reject input with missing values.

    Args:
        data: Anything ``pd.DataFrame`` accepts (here, a dict of equal-length
            lists).

    Returns:
        The constructed DataFrame, or an empty DataFrame when construction
        fails or any cell is null. Failures are reported with ``print``
        rather than raised, so callers must check ``.empty``.
    """
    try:
        frame = pd.DataFrame(data)
        has_missing = frame.isnull().values.any()
        if has_missing:
            raise ValueError("Input data contains missing values.")
    except Exception as e:
        # Every failure is folded into the same "empty frame" sentinel.
        print(f"Error loading data: {e}")
        return pd.DataFrame()
    else:
        return frame

# Load data and validate
# load_data reports every failure by printing the reason and returning an
# empty frame, so an empty result is turned into a hard error here.
df = load_data(data)
if df.empty:
    raise ValueError("Data loading failed. Empty DataFrame returned.")

# Preprocessing function with validation and error handling
def preprocess_data(df):
    """Derive the numeric features used for clustering.

    The work is done on a copy, so the caller's frame is left untouched
    whether preprocessing succeeds or fails.

    Args:
        df: DataFrame with a ``phone`` column of numeric strings and an
            ``email`` column of strings.

    Returns:
        A new DataFrame in which ``phone`` is numeric and an ``email_length``
        column has been added, or an empty DataFrame if anything goes wrong
        (for example a phone value that cannot be parsed). The reason is
        printed rather than raised, so callers must check ``.empty``.
    """
    try:
        # Copy first: assigning columns on the argument itself would leak a
        # half-processed frame back to the caller, even on the failure path.
        result = df.copy()

        # Unparseable phone values become NaN here and are rejected below.
        result['phone'] = pd.to_numeric(result['phone'], errors='coerce')

        # Add email_length as a feature
        result['email_length'] = result['email'].apply(len)

        if result[['phone', 'email_length']].isnull().values.any():
            raise ValueError("Data contains NaN values after preprocessing.")

        return result
    except Exception as e:
        print(f"Error in preprocessing: {e}")
        return pd.DataFrame()

# Apply preprocessing
# preprocess_data signals failure with an empty frame, so stop here if it did.
df = preprocess_data(df)
if df.empty:
    raise ValueError("Preprocessing failed. Empty DataFrame returned.")

# Define DBSCAN parameters
# NOTE: apply_dbscan standardizes the features before clustering, so EPSILON
# is a distance in standardized units, not in raw phone / email-length units.
EPSILON = 0.5  # Max distance between two samples for them to be considered as in the same neighborhood
MIN_SAMPLES = 2  # The minimum number of points required to form a cluster

# DBSCAN clustering function with error handling
def apply_dbscan(df, eps=None, min_samples=None):
    """Cluster rows with DBSCAN on the standardized numeric features.

    The result is a copy of ``df`` with a ``cluster`` column added; the
    caller's frame is not modified.

    Args:
        df: DataFrame containing numeric ``phone`` and ``email_length``
            columns.
        eps: Neighbourhood radius passed to DBSCAN, measured on the
            standardized features. Defaults to the module-level ``EPSILON``.
        min_samples: Minimum neighbourhood size passed to DBSCAN. Defaults to
            the module-level ``MIN_SAMPLES``.

    Returns:
        A new DataFrame with an integer ``cluster`` label per row, or an
        empty DataFrame if scaling or clustering fails. The reason is printed
        rather than raised, so callers must check ``.empty``.
    """
    # Resolve the defaults at call time so that re-assigning the module-level
    # constants (common in a notebook) is honoured by later calls.
    eps = EPSILON if eps is None else eps
    min_samples = MIN_SAMPLES if min_samples is None else min_samples

    try:
        # Standardize the numerical features
        features = StandardScaler().fit_transform(df[['phone', 'email_length']])

        # Apply DBSCAN clustering
        model = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
        labels = model.fit_predict(features)

        # fit_predict yields one integer label per row, so no NaN check is
        # needed. Attach the labels to a copy to avoid mutating the argument.
        result = df.copy()
        result['cluster'] = labels
        return result
    except Exception as e:
        print(f"Error in DBSCAN clustering: {e}")
        return pd.DataFrame()

# Apply DBSCAN
# apply_dbscan signals failure with an empty frame, so stop here if it did.
df = apply_dbscan(df)
if df.empty:
    raise ValueError("DBSCAN failed. Empty DataFrame returned.")

# Identify potential duplicates based on cluster
def identify_duplicates(df):
    """Return the rows of ``df`` whose ``cluster`` value is greater than -1.

    Rows labelled -1 are left out; every other row is treated as a potential
    duplicate.
    """
    in_cluster = df['cluster'].gt(-1)
    return df.loc[in_cluster]

# Identify duplicates
duplicates = identify_duplicates(df)

# Print the results: the labelled table first, then only the clustered rows
print("\nOriginal Data with Clusters:", df.loc[:, ['name', 'email', 'cluster']], sep="\n")
print("\nPotential Duplicates:", duplicates, sep="\n")

# Refining duplicates (removing noise, if needed)
# NOTE(review): this is the same selection as `duplicates` above, and no
# per-cluster validation happens here. In the captured run a row with a
# different name/phone lands in the same cluster as the repeated rows, so
# checking exact field matches within each cluster may be needed -- confirm
# the intended refinement rule.
refined_df = df.loc[df['cluster'].gt(-1)]
print("\nRefined Data (without noise):", refined_df, sep="\n")

# Unit tests for the functions
class TestDataProcessing(unittest.TestCase):
    """Smoke tests for the load -> preprocess -> cluster -> identify pipeline."""

    def test_load_data_valid(self):
        """Complete input yields a non-empty frame."""
        records = {
            'name': ['Alice', 'Bob'],
            'email': ['alice@example.com', 'bob@example.com'],
            'phone': ['123', '456'],
        }
        loaded = load_data(records)
        self.assertFalse(loaded.empty)

    def test_load_data_invalid(self):
        """A missing value makes load_data fall back to an empty frame."""
        records = {
            'name': ['Alice', None],
            'email': ['alice@example.com', 'bob@example.com'],
            'phone': ['123', '456'],
        }
        loaded = load_data(records)
        self.assertTrue(loaded.empty)

    def test_preprocess_data(self):
        """Preprocessing leaves no NaN in the feature columns."""
        records = {'name': ['Alice'], 'email': ['alice@example.com'], 'phone': ['123']}
        prepared = preprocess_data(load_data(records))
        features = prepared[['phone', 'email_length']]
        self.assertFalse(features.isnull().values.any())

    def test_dbscan_cluster(self):
        """Clustering adds a 'cluster' column."""
        records = {
            'name': ['Alice', 'Bob'],
            'email': ['alice@example.com', 'bob@example.com'],
            'phone': ['123', '456'],
        }
        clustered = apply_dbscan(preprocess_data(load_data(records)))
        self.assertIn('cluster', clustered.columns)

    def test_identify_duplicates(self):
        """Two identical registrations are reported as potential duplicates."""
        records = {
            'name': ['Alice', 'Bob', 'Alice'],
            'email': ['alice@example.com', 'bob@example.com', 'alice@example.com'],
            'phone': ['123', '456', '123'],
        }
        clustered = apply_dbscan(preprocess_data(load_data(records)))
        found = identify_duplicates(clustered)
        self.assertGreater(len(found), 0)

# Run unit tests
# argv=[''] hands unittest an explicit argument list instead of the host
# process's command line, and exit=False stops unittest.main from calling
# sys.exit() when the run finishes, so the session keeps going.
if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


.....
----------------------------------------------------------------------
Ran 5 tests in 0.016s

OK



Original Data with Clusters:
      name                email  cluster
0    Alice    alice@example.com        0
1      Bob      bob@example.com       -1
2    Alice    alice@example.com        0
3  Charlie  charlie@example.com       -1
4    David    david@example.com        0
5      Eve      eve@example.com       -1
6    Alice    alice@example.com        0

Potential Duplicates:
    name              email  phone  email_length  cluster
0  Alice  alice@example.com    123            17        0
2  Alice  alice@example.com    123            17        0
4  David  david@example.com    101            17        0
6  Alice  alice@example.com    123            17        0

Refined Data (without noise):
    name              email  phone  email_length  cluster
0  Alice  alice@example.com    123            17        0
2  Alice  alice@example.com    123            17        0
4  David  david@example.com    101            17        0
6  Alice  alice@example.com    123            17        0
Error lo