# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Deduplication Using K-means Clustering

**Steps**:
1. Data Set: Download (or simulate) a dataset containing duplicate customer records.
2. Preprocess: Encode categorical fields and standardize the features so that no single feature dominates the distance calculation used for clustering.
3. Apply K-means: Use K-means clustering to find and group similar customer records.
4. Identify Duplicates: Identify and remove duplicates within clusters.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import random

# Step 1: Build a small in-memory customer table that contains near-duplicates.
# Alice, Bob and Charlie each appear twice with slightly different incomes;
# David, Eve and Frank appear once.
np.random.seed(42)  # seed NumPy's global RNG
customer_columns = ('name', 'age', 'income')
customer_records = [
    ('Alice', 25, 50000),
    ('Bob', 30, 60000),
    ('Charlie', 35, 70000),
    ('Alice', 25, 50500),
    ('Bob', 30, 60200),
    ('Charlie', 35, 69800),
    ('David', 40, 80000),
    ('Eve', 45, 85000),
    ('Frank', 50, 90000),
]
# Transpose the row-wise records into a column-name -> list-of-values mapping.
data = {
    column: list(values)
    for column, values in zip(customer_columns, zip(*customer_records))
}
df = pd.DataFrame(data)

# Step 2: Assemble the feature matrix used for clustering.
# The name column is one-hot encoded, placed to the right of the numeric
# columns, and every resulting column is then standardized.
df_encoded = pd.get_dummies(df[['name']])
df_features = df[['age', 'income']].join(df_encoded)
scaler = StandardScaler()
scaler.fit(df_features)
scaled_features = scaler.transform(df_features)

# Step 3: Apply K-means clustering to group similar customer records.
# The requested cluster count is capped at the number of rows because KMeans
# raises a ValueError when there are fewer samples than clusters.
# n_init is pinned explicitly: its default changed from 10 to 'auto' in
# scikit-learn 1.4 (with a FutureWarning in 1.2/1.3), so leaving it implicit
# makes the cluster assignment depend on the installed library version.
n_clusters = min(6, len(scaled_features))
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
df['cluster'] = kmeans.fit_predict(scaled_features)

# Step 4: Identify duplicates within clusters.
# For every cluster with more than one member, the record closest to the
# cluster's mean (age, income) is kept as the representative and all other
# members are flagged as duplicates.
# NOTE: every non-representative member of a cluster is dropped, so this
# assumes each cluster holds records of a single customer. If there are fewer
# clusters than distinct customers, different customers that share a cluster
# are removed as well.
feature_cols = ['age', 'income']
duplicates = []
for _, cluster_data in df.groupby('cluster', sort=False):
    if len(cluster_data) < 2:
        continue  # a singleton cluster has nothing to deduplicate
    points = cluster_data[feature_cols].to_numpy(dtype=float)
    centroid = points.mean(axis=0, keepdims=True)
    # X is the single centroid and Y the cluster members, so the one returned
    # index is the position (within this cluster) of the member nearest to the
    # centroid. Passing them the other way round always yields index 0.
    nearest_pos, _ = pairwise_distances_argmin_min(centroid, points)
    representative = cluster_data.index[nearest_pos[0]]
    cluster_duplicates = cluster_data.index.drop(representative).tolist()
    duplicates.extend(cluster_duplicates)

dedup_df = df.drop(index=duplicates).reset_index(drop=True)

# Report the table before and after deduplication; the cluster label is only
# shown for the original records.
report_sections = (
    ("Original records:", df[['name', 'age', 'income', 'cluster']]),
    ("\nDeduplicated records:", dedup_df[['name', 'age', 'income']]),
)
for heading, frame in report_sections:
    print(heading)
    print(frame)
