## 1. Load and Explore Raw Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw name-pair table from disk
df = pd.read_csv('../data/raw/name_pairs.csv')

# Preview the top of the table
print("First few rows:", df.head(), sep="\n")

# Structural summary: dimensions, per-column dtypes, per-column null counts
print(
    f"\nDataset shape: {df.shape}",
    f"\nData types:\n{df.dtypes}",
    f"\nMissing values:\n{df.isnull().sum()}",
    sep="\n",
)

In [None]:
# Report the per-class row counts and the share of rows labelled 1
print("Class distribution:", df['label'].value_counts(), sep="\n")
print(f"\nClass balance ratio: {df['label'].sum() / len(df):.2%} similar")

# Bar chart of the same counts, drawn on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='label', data=df, palette=['lightcoral', 'lightgreen'], ax=ax)
ax.set_xlabel('Label (0: Not Similar, 1: Similar)')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
fig.tight_layout()
plt.show()

## 2. Text Preprocessing and Cleaning

In [None]:
import re

def clean_text(text):
    """Normalize a name for comparison.

    Steps: fold accented/compatibility characters to their base form,
    lowercase, drop everything except ``a-z``, ``0-9`` and whitespace, then
    collapse whitespace runs to a single space and trim both ends.

    Args:
        text: The raw name. Non-string values are converted with ``str()``;
            ``None`` and float NaN are treated as a missing name.

    Returns:
        The cleaned string; empty when the input is missing or contains no
        characters that survive the filter.
    """
    import unicodedata  # local import keeps the function self-contained

    # Missing values must not turn into the literal names "none" / "nan";
    # otherwise two missing names would compare as identical strings.
    # (NaN is the only float that is not equal to itself.)
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # NFKD splits an accented letter into base letter + combining mark; the
    # character filter below then removes only the mark, so "José" becomes
    # "jose" instead of losing the whole letter ("jos").
    text = unicodedata.normalize('NFKD', str(text)).lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Add a cleaned counterpart for each of the two name columns
df['name_1_clean'] = df['name_1'].apply(clean_text)
df['name_2_clean'] = df['name_2'].apply(clean_text)

# Show raw -> cleaned for the first few pairs. head() clamps to the number
# of rows available, so this also works on datasets with fewer than 5 rows
# (positional indexing over a fixed range(5) would raise IndexError there).
print("Cleaning examples:")
preview = df.head(5)
for raw_1, clean_1, raw_2, clean_2 in zip(
    preview['name_1'], preview['name_1_clean'],
    preview['name_2'], preview['name_2_clean'],
):
    print(f"  {raw_1} -> {clean_1}")
    print(f"  {raw_2} -> {clean_2}")
    print()

## 3. Feature Engineering with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Character n-gram TF-IDF: 2- and 3-character substrings, vocabulary capped
# at 500 features.
vectorizer = TfidfVectorizer(
    analyzer='char',
    ngram_range=(2, 3),
    max_features=500,
    lowercase=False,  # Already cleaned
    norm='l2',  # explicit: the row-wise cosine below relies on unit-length rows
)

# Fit on all names from both columns so they share one vocabulary.
# NOTE(review): this learns the vocabulary/IDF weights from every row,
# including rows that later end up in the test split. Fit on training rows
# only if a strictly leak-free evaluation is required.
all_names = np.concatenate([df['name_1_clean'].values, df['name_2_clean'].values])
vectorizer.fit(all_names)

# Transform each column into a sparse (n_rows, n_features) matrix
vec1 = vectorizer.transform(df['name_1_clean'])
vec2 = vectorizer.transform(df['name_2_clean'])

# Cosine similarity of row i in vec1 with row i in vec2. Because every row is
# L2-normalised (all-zero rows stay zero), cosine reduces to the dot product,
# so one sparse element-wise multiply + row sum replaces a Python loop that
# called cosine_similarity once per row.
similarities = np.asarray(vec1.multiply(vec2).sum(axis=1), dtype=float).ravel()

print(f"Similarity score range: [{similarities.min():.3f}, {similarities.max():.3f}]")
print(f"Mean similarity: {similarities.mean():.3f}")
print(f"\nFeature matrix shape: {similarities.shape}")

In [None]:
# Visualize similarity distribution by class
df['similarity'] = similarities

# Overlaid histograms are only comparable when both use the same bin edges.
# Passing bins=15 to each hist() call separately would bin each class over
# its own min..max range, giving different bar widths and positions.
bin_edges = np.histogram_bin_edges(df['similarity'], bins=15)

plt.figure(figsize=(10, 4))

# Left panel: per-class histograms on shared bins
plt.subplot(1, 2, 1)
plt.hist(df.loc[df['label'] == 0, 'similarity'], bins=bin_edges, alpha=0.6,
         label='Not Similar', color='lightcoral')
plt.hist(df.loc[df['label'] == 1, 'similarity'], bins=bin_edges, alpha=0.6,
         label='Similar', color='lightgreen')
plt.xlabel('Cosine Similarity')
plt.ylabel('Frequency')
plt.title('Similarity Distribution by Class')
plt.legend()

# Right panel: box plot of the score for each label value
plt.subplot(1, 2, 2)
sns.boxplot(x='label', y='similarity', data=df, palette=['lightcoral', 'lightgreen'])
plt.xlabel('Label (0: Not Similar, 1: Similar)')
plt.ylabel('Similarity Score')
plt.title('Similarity by Class')

plt.tight_layout()
plt.show()

## 4. Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Target vector, and the cosine score reshaped into a one-column feature matrix
y = df['label'].values
X = similarities.reshape(-1, 1)

# Hold out 20% of the rows; stratifying on y keeps the label proportions
# the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=42, test_size=0.2
)

# Sizes of the two parts, then per-label counts (index = label value)
print(
    f"Train set: {len(X_train)} samples",
    f"Test set: {len(X_test)} samples",
    sep="\n",
)
print(
    f"\nTrain label distribution: {np.bincount(y_train)}",
    f"Test label distribution: {np.bincount(y_test)}",
    sep="\n",
)

## 5. Model Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

# 'balanced' weights computed from the training labels, keyed by class value,
# to offset class imbalance
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight = dict(zip(classes, weights))

print(f"Class weights: {class_weight}")

# Build the weighted logistic regression and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting chain)
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight=class_weight,
).fit(X_train, y_train)

print("Model training complete.")

## 6. Model Evaluation

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Predicted labels for the held-out split
y_pred = model.predict(X_test)

# The three scalar scores share one call signature; zero_division=0 is
# passed to each of them, as it is to the report below
precision, recall, f1 = (
    metric(y_test, y_pred, zero_division=0)
    for metric in (precision_score, recall_score, f1_score)
)
cm = confusion_matrix(y_test, y_pred)

# Headline numbers followed by the raw confusion matrix
print(
    "=== Model Evaluation ===",
    f"Precision: {precision:.3f}",
    f"Recall:    {recall:.3f}",
    f"F1 Score:  {f1:.3f}",
    f"\nConfusion Matrix:",
    cm,
    sep="\n",
)

# Per-class breakdown
print(f"\nDetailed Report:")
print(classification_report(y_test, y_pred, target_names=['Not Similar', 'Similar'], zero_division=0))

In [None]:
# Draw the confusion matrix as an annotated heatmap on an explicit axes
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=['Not Similar', 'Similar'],
    yticklabels=['Not Similar', 'Similar'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
fig.tight_layout()
plt.show()

## 7. Inference and Predictions

In [None]:
def predict_similarity(name_1, name_2):
    """Score one pair of names with the fitted vectorizer and classifier.

    Each name is passed through ``clean_text`` and the module-level
    ``vectorizer``; the cosine similarity of the two vectors is then given
    to the module-level ``model`` as a single-feature sample.

    Args:
        name_1: First raw name.
        name_2: Second raw name.

    Returns:
        A ``(score, decision, confidence)`` tuple: the cosine similarity,
        the model's predicted label converted to ``bool``, and the value in
        column 1 of ``predict_proba`` for this sample (the probability of
        label 1, assuming the model was fitted on 0/1 labels).
    """
    # Clean and vectorize each name on its own
    v1, v2 = (
        vectorizer.transform([clean_text(raw_name)])
        for raw_name in (name_1, name_2)
    )

    # The model expects a 2-D array: one sample, one feature
    score = cosine_similarity(v1, v2)[0, 0]
    features = np.array([[score]])

    predicted_label = model.predict(features)[0]
    positive_proba = model.predict_proba(features)[0][1]

    return score, bool(predicted_label), positive_proba

# A few hand-picked pairs to run through the full prediction path
test_pairs = [
    ('ApplePay', 'Apple Pay'),
    ('Google', 'Alphabet'),
    ('Nike', 'Niike'),
]

print("=== Prediction Examples ===")
for name_1, name_2 in test_pairs:
    score, decision, confidence = predict_similarity(name_1, name_2)
    # One print per pair; the leading "\n" separates consecutive pairs
    print(
        f"\n{name_1} vs {name_2}",
        f"  Similarity: {score:.3f}",
        f"  Decision: {'Similar' if decision else 'Not Similar'}",
        f"  Confidence: {confidence:.3f}",
        sep="\n",
    )