# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Hierarchical Clustering for Deduplication

**Steps**:
1. Data Set: Obtain a dataset containing duplicate employee information.
2. Perform Clustering: Use hierarchical agglomerative clustering to group similar employee records.
3. Evaluate Duplicates: Determine duplicates by analyzing the clusters formed.
4. Clean Data: Remove duplicate employee records found during clustering.

In [None]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Simulate employee dataset with duplicates
# Each record is (employee_id, name, age, department). Ids 104, 105 and 108
# repeat the name/age/department of ids 101, 102 and 103 respectively.
field_names = ('employee_id', 'name', 'age', 'department')
employee_rows = [
    (101, 'John Doe', 30, 'Sales'),
    (102, 'Jane Smith', 28, 'HR'),
    (103, 'Jake Long', 32, 'IT'),
    (104, 'John Doe', 30, 'Sales'),
    (105, 'Jane Smith', 28, 'HR'),
    (106, 'Sam Ray', 45, 'Finance'),
    (107, 'Anna Grey', 38, 'Marketing'),
    (108, 'Jake Long', 32, 'IT'),
]
# Transpose the row-wise records into a column-name -> list-of-values mapping.
data = {
    field: list(values)
    for field, values in zip(field_names, zip(*employee_rows))
}
df = pd.DataFrame(data)

# Step 2: Preprocess: Encode categorical variables, standardize numerical
# One-hot encode the text columns so that every feature is numeric.
df_encoded = pd.get_dummies(df[['name', 'department']])
# Place age ahead of the indicator columns; both frames share df's row index,
# so the join lines rows up one-to-one.
df_features = df[['age']].join(df_encoded)
# Learn the per-column scaling statistics, then apply them to the same matrix.
scaler = StandardScaler().fit(df_features)
X_scaled = scaler.transform(df_features)

# Step 3: Hierarchical Clustering
# n_clusters=None lets distance_threshold decide how many clusters are formed.
model = AgglomerativeClustering(
    linkage='ward',
    distance_threshold=1.5,
    n_clusters=None,
)
model.fit(X_scaled)
# One cluster label per input row, in the same order as df's rows.
labels = model.labels_
df['cluster'] = labels

# Step 4: Evaluate Duplicates - remove non-representative from duplicate clusters
# The first row of each cluster is kept as its representative; every later row
# carrying an already-seen cluster label is flagged as a duplicate. A single
# vectorised pass is used rather than filtering the whole frame once per cluster.
is_duplicate = df.duplicated(subset='cluster', keep='first')
# Index labels of the rows that will be discarded, in row order.
duplicates = df.index[is_duplicate].tolist()

# Select by boolean mask (rather than dropping by label) so the result stays
# correct even if the index were to contain repeated labels.
dedup_df = df.loc[~is_duplicate].reset_index(drop=True)

# Report the clustered input followed by the cleaned result.
employee_columns = ['employee_id', 'name', 'age', 'department']
print("Original Data with Clusters:", df[employee_columns + ['cluster']], sep="\n")
# The leading "\n" in the heading leaves a blank line between the two tables.
print("\nDeduplicated Employee Data:", dedup_df[employee_columns], sep="\n")
