# Data Deduplication using Clustering
**Objective**: Learn and implement data deduplication techniques.

**Task**: Hierarchical Clustering for Deduplication

**Steps**:
1. Data Set: Obtain a dataset containing duplicate employee information.
2. Perform Clustering: Use hierarchical agglomerative clustering to cluster the employee
records.
3. Evaluate Duplicates: Determine duplicates by analyzing the clusters formed.
4. Clean Data: Remove duplicate employee records found during clustering.

In [2]:
# write your code from here
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset. The path is relative to the current working directory, so
# run this from the repository root; pd.read_csv raises FileNotFoundError when
# the file cannot be found.
df = pd.read_csv('src/Module 11/AI-Driven Data Quality & Anomaly Detection/employees.csv')

# Reject an empty dataset up front: scaling and clustering cannot run on zero
# rows, and failing here gives a clearer message than the downstream error.
if df.empty:
    raise ValueError("Data is empty. Please provide a dataset with employee records.")

# Missing values anywhere in the frame are rejected rather than imputed.
if df.isnull().values.any():
    raise ValueError("Data contains missing values. Please handle them before proceeding.")

# Select numerical features for clustering (adjust column names as per your dataset)
numerical_features = ['age', 'salary', 'years_at_company']
df_numerical = df[numerical_features]

# Standardize the data so that no single feature dominates the distance
# computation purely because of its scale.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df_numerical)

# Apply Hierarchical Agglomerative Clustering. With n_clusters=None the number
# of clusters is not fixed in advance; merging stops at distance_threshold.
clustering = AgglomerativeClustering(n_clusters=None, distance_threshold=1.5)
df['cluster'] = clustering.fit_predict(scaled_data)

# Function to find duplicates within clusters (using distance threshold for similarity)
def find_duplicates_within_cluster(df, cluster_column='cluster', features=None, threshold=0.1):
    """Return the index labels of redundant rows found inside each cluster.

    Rows are compared only with other rows of the same cluster. Two rows are
    treated as duplicates when the Euclidean distance between their feature
    values is strictly below ``threshold``. For every such pair only the
    later row (in DataFrame order) is reported, so dropping the returned
    labels keeps one representative of each group of duplicates instead of
    deleting every copy.

    Args:
        df: DataFrame holding the feature columns and the cluster labels.
        cluster_column: Name of the column containing the cluster labels.
        features: Columns used for the distance computation. When ``None``,
            the module-level ``numerical_features`` list is used.
        threshold: Distance below which two rows count as duplicates. The
            distance is computed on the values as stored in ``df`` (no
            scaling is applied here).

    Returns:
        A list of unique index labels of the rows to drop, in the order in
        which they were discovered.
    """
    if features is None:
        # Backward-compatible default: the feature list defined by the script.
        features = numerical_features
    feature_columns = list(features)

    redundant = []
    for _, members in df.groupby(cluster_column, sort=False):
        if len(members) < 2:
            continue  # a single row has nothing to be a duplicate of

        values = members[feature_columns].to_numpy(dtype=float)
        # Pairwise Euclidean distances via broadcasting: shape (n, n).
        distances = np.linalg.norm(values[:, None, :] - values[None, :, :], axis=2)

        # Strict upper triangle => each unordered pair (i, j) with i < j once.
        close_pairs = np.triu(distances < threshold, k=1)
        # Column positions are the *later* row of each close pair.
        later_positions = np.unique(np.nonzero(close_pairs)[1])
        redundant.extend(members.index[later_positions].tolist())

    # De-duplicate while preserving discovery order.
    return list(dict.fromkeys(redundant))

# Identify the redundant rows cluster by cluster, then drop them by index label
duplicate_indices = find_duplicates_within_cluster(df)
df_cleaned = df.drop(index=duplicate_indices)

# Scatter plot of the remaining records, coloured by cluster assignment
ax = sns.scatterplot(
    data=df_cleaned,
    x='age',
    y='salary',
    hue='cluster',
    palette='viridis',
)
ax.set_title('Employee Clusters after Deduplication')
ax.set_xlabel('Age')
ax.set_ylabel('Salary')
plt.show()

# Report how many records there were before and after deduplication
print(f"Original dataset size: {len(df)}")
print(f"Cleaned dataset size (after deduplication): {len(df_cleaned)}")


FileNotFoundError: [Errno 2] No such file or directory: 'src/Module 11/AI-Driven Data Quality & Anomaly Detection/employees.csv'