In [1]:
# Question: Advanced Deduplication Using Machine Learning
# Description: Implement ML-based deduplication based on feature similarity.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Sample data: records and whether they're duplicates (for training)
data = pd.DataFrame({
    'record1': ['Apple Inc.', 'Microsoft Corp.', 'Apple Incorporated', 'Google LLC', 'Microsoft Corporation'],
    'record2': ['Apple Incorporated', 'Microsoft Corporation', 'Apple Inc.', 'Alphabet Inc.', 'Google LLC'],
    'is_duplicate': [1, 1, 1, 0, 0]  # Labels
})

# Combine all records to fit TF-IDF
all_texts = pd.concat([data['record1'], data['record2']])
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_texts)

# Split TF-IDF back into record1 and record2 vectors
record1_vecs = tfidf_matrix[:len(data)]
record2_vecs = tfidf_matrix[len(data):]

# Compute similarity
similarity_scores = [cosine_similarity(record1_vecs[i], record2_vecs[i])[0][0] for i in range(len(data))]
data['similarity'] = similarity_scores

# Train model on similarity
X = data[['similarity']]
y = data['is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# Prediction on new unseen records
new_data = pd.DataFrame({
    'record1': ['Apple Inc.', 'Meta Platforms'],
    'record2': ['Apple Incorporated', 'Facebook Inc.']
})

# TF-IDF transformation
combined = pd.concat([new_data['record1'], new_data['record2']])
tfidf_new = vectorizer.transform(combined)
vec1 = tfidf_new[:len(new_data)]
vec2 = tfidf_new[len(new_data):]

# Compute similarity
new_sim = [cosine_similarity(vec1[i], vec2[i])[0][0] for i in range(len(new_data))]
new_data['similarity'] = new_sim

# Predict duplication
new_data['is_duplicate'] = model.predict(new_data[['similarity']])

print(new_data)


          record1             record2  similarity  is_duplicate
0      Apple Inc.  Apple Incorporated    0.407945             1
1  Meta Platforms       Facebook Inc.    0.000000             1
