In [1]:
import pandas as pd

# Read the train / validation / test CSV splits, in that order
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./data/neural_column_mapping_medium_train.csv",
        "./data/neural_column_mapping_medium_val.csv",
        "./data/neural_column_mapping_medium_test.csv",
    )
)

# Report how many rows each split holds (thousands-separated)
print(f"Training examples: {len(train_df):,}")
print(f"Validation examples: {len(val_df):,}")
print(f"Test examples: {len(test_df):,}")

Training examples: 110,481
Validation examples: 13,810
Test examples: 13,811


In [2]:
import torch
import torch.nn as nn
from transformers import AutoModel

class SiameseColumnMapper(nn.Module):
    """Siamese matcher scoring whether a raw column name maps to a standard one.

    Both names are embedded by the same (shared) transformer encoder; the two
    pooled embeddings are combined into a pair-feature vector that a small MLP
    turns into a probability in [0, 1].

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)

        # forward() concatenates four hidden-sized vectors, so the first
        # linear layer must accept 4 * hidden_size inputs. Reading the width
        # from the encoder config keeps the head valid for any encoder.
        emb_size = self.encoder.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(emb_size * 4, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, raw_tokens, standard_tokens):
        """Score a batch of (raw, standard) column-name pairs.

        Args:
            raw_tokens: Mapping of encoder keyword arguments for the raw names.
            standard_tokens: Mapping of encoder keyword arguments for the
                standard names.

        Returns:
            The classifier output for the concatenated pair features; the
            final ``Sigmoid`` keeps every value in [0, 1].
        """
        # Encode both inputs with the shared encoder
        raw_emb = self.encoder(**raw_tokens).pooler_output
        std_emb = self.encoder(**standard_tokens).pooler_output

        # Element-wise interaction features between the two embeddings
        diff = torch.abs(raw_emb - std_emb)
        prod = raw_emb * std_emb

        # [raw, std, |raw - std|, raw * std] along the feature axis
        features = torch.cat([raw_emb, std_emb, diff, prod], dim=1)

        return self.classifier(features)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
class CrossEncoderMapper(nn.Module):
    """Cross-encoder that scores one jointly tokenized (raw, standard) pair.

    The encoder sees both names in a single sequence and a linear layer maps
    the pooled representation to a match probability.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
    """

    def __init__(self, model_name='distilbert-base-uncased'):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder config rather than assuming a width.
        self.classifier = nn.Linear(self.encoder.config.hidden_size, 1)

    def forward(self, input_tokens):
        """Return sigmoid match scores for a batch of tokenized pairs.

        Args:
            input_tokens: Mapping of encoder keyword arguments for inputs of
                the form "[raw_col] [SEP] [standard_col]".

        Returns:
            Tensor of sigmoid scores, one row per input sequence.
        """
        outputs = self.encoder(**input_tokens)

        # Not every encoder exposes a pooled output (DistilBERT models return
        # only hidden states). Prefer it when present, otherwise fall back to
        # the hidden state of the first token.
        pooled = getattr(outputs, 'pooler_output', None)
        if pooled is None:
            pooled = outputs.last_hidden_state[:, 0]

        return torch.sigmoid(self.classifier(pooled))

In [4]:
class MultiTaskColumnMapper(nn.Module):
    """Shared-encoder model with similarity, domain and confidence heads.

    Args:
        num_domains: Number of classes predicted by the domain head.
        model_name: Identifier passed to ``AutoModel.from_pretrained``. The
            default uses the fully namespaced hub ID, matching the other
            models in this notebook.
    """

    def __init__(self, num_domains=7, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        super().__init__()
        self.shared_encoder = AutoModel.from_pretrained(model_name)

        # Head widths follow the encoder's hidden size: the pair heads consume
        # four concatenated embeddings, the domain head a single embedding.
        emb_size = self.shared_encoder.config.hidden_size

        # Task-specific heads
        self.similarity_head = nn.Linear(emb_size * 4, 1)  # Main matching task
        self.domain_head = nn.Linear(emb_size, num_domains)  # Domain classification
        self.confidence_head = nn.Linear(emb_size * 4, 1)  # Confidence estimation

    def forward(self, raw_tokens, standard_tokens, domain_labels=None):
        """Run all three heads on a batch of (raw, standard) pairs.

        Args:
            raw_tokens: Mapping of encoder keyword arguments for the raw names.
            standard_tokens: Mapping of encoder keyword arguments for the
                standard names.
            domain_labels: Accepted for interface compatibility; not used in
                the forward computation.

        Returns:
            Dict with ``'similarity'`` (sigmoid score), ``'domain'`` (raw
            logits computed from the raw-name embedding only) and
            ``'confidence'`` (sigmoid score).
        """
        raw_emb = self.shared_encoder(**raw_tokens).pooler_output
        std_emb = self.shared_encoder(**standard_tokens).pooler_output

        # Pair features: [raw, std, |raw - std|, raw * std]
        diff = torch.abs(raw_emb - std_emb)
        prod = raw_emb * std_emb
        sim_features = torch.cat([raw_emb, std_emb, diff, prod], dim=1)

        # Multi-task outputs
        similarity = torch.sigmoid(self.similarity_head(sim_features))
        domain_pred = self.domain_head(raw_emb)
        confidence = torch.sigmoid(self.confidence_head(sim_features))

        return {
            'similarity': similarity,
            'domain': domain_pred,
            'confidence': confidence
        }

In [5]:
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def train_model(model, train_loader, val_loader, epochs=10):
    """Train ``model`` with binary cross-entropy and validate after every epoch.

    Every batch is read as a mapping with ``'raw_tokens'``,
    ``'standard_tokens'`` and ``'labels'`` entries. The first two are passed
    positionally to ``model``; its output is treated as a match probability
    and thresholded at 0.5 during validation.

    Whenever validation accuracy improves, the current weights are written to
    ``best_column_mapper.pth``.

    Args:
        model: Module called as ``model(raw_tokens, standard_tokens)``.
        train_loader: Sized iterable of training batches.
        val_loader: Iterable of validation batches.
        epochs: Number of passes over ``train_loader``.

    Returns:
        The same ``model`` object with the weights of the final epoch (which
        is not necessarily the best checkpoint saved to disk).
    """
    optimizer = optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.BCELoss()

    best_accuracy = 0

    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0

        for batch in train_loader:
            optimizer.zero_grad()

            outputs = model(batch['raw_tokens'], batch['standard_tokens'])
            # reshape(-1) instead of squeeze(): a batch holding one example
            # must stay 1-D so its size still matches the label tensor.
            loss = criterion(outputs.reshape(-1), batch['labels'].float())

            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation
        model.eval()
        val_predictions = []
        val_labels = []

        with torch.no_grad():
            for batch in val_loader:
                outputs = model(batch['raw_tokens'], batch['standard_tokens'])
                # Keep predictions 1-D so extend() can iterate them even for
                # a single-example batch.
                predictions = (outputs.reshape(-1) > 0.5).cpu().numpy()

                val_predictions.extend(predictions)
                val_labels.extend(batch['labels'].cpu().numpy())

        # Metrics
        accuracy = accuracy_score(val_labels, val_predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(
            val_labels, val_predictions, average='binary'
        )

        print(f"Epoch {epoch+1}/{epochs}")
        print(f"  Train Loss: {train_loss/len(train_loader):.4f}")
        print(f"  Val Accuracy: {accuracy:.4f}")
        print(f"  Val F1: {f1:.4f}")

        # Checkpoint only on strict improvement of validation accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            torch.save(model.state_dict(), 'best_column_mapper.pth')

    return model

In [19]:
class SiameseColumnMapper(nn.Module):
    """Shared-encoder pair classifier for raw vs. standard column names.

    Each name is embedded by the same transformer; the pair is described by
    both embeddings plus their absolute difference and element-wise product,
    and an MLP ending in a sigmoid turns that description into a score.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2'):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        # Width of one embedding, taken from the loaded encoder's config
        hidden = self.encoder.config.hidden_size

        head_layers = [
            nn.Linear(4 * hidden, 256),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid(),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def _pooled(self, tokens):
        """Return the encoder's pooled output for one tokenized batch."""
        return self.encoder(**tokens).pooler_output

    def forward(self, raw_tokens, standard_tokens):
        """Return the classifier score for each (raw, standard) pair."""
        left = self._pooled(raw_tokens)
        right = self._pooled(standard_tokens)

        # [raw, std, |raw - std|, raw * std] joined along the feature axis
        pair_features = torch.cat(
            (left, right, torch.abs(left - right), left * right),
            dim=1,
        )
        return self.classifier(pair_features)


In [20]:
from torch.utils.data import Dataset

class ColumnMappingDataset(Dataset):
    """Dataset yielding tokenized (raw, standard) column-name pairs with labels.

    Rows are read from the ``raw_column_name``, ``standard_column_name`` and
    ``is_match`` columns of the given dataframe. Both names are tokenized
    separately and padded/truncated to ``max_length``.
    """

    def __init__(self, dataframe, tokenizer_name='sentence-transformers/all-MiniLM-L6-v2', max_length=32):
        # Positional index 0..n-1 so __getitem__ can rely on iloc
        self.df = dataframe.reset_index(drop=True)
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def _encode(self, text):
        """Tokenize one string and drop the leading batch axis of each tensor."""
        encoded = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        return {name: tensor.squeeze(0) for name, tensor in encoded.items()}

    def __getitem__(self, idx):
        record = self.df.iloc[idx]
        raw_name = str(record['raw_column_name'])
        std_name = str(record['standard_column_name'])
        target = float(record['is_match'])

        raw_encoded = self._encode(raw_name)
        std_encoded = self._encode(std_name)

        return {
            'raw_tokens': raw_encoded,
            'standard_tokens': std_encoded,
            'label': torch.tensor(target, dtype=torch.float),
        }

def collate_fn(batch):
    """Stack per-example items into ``(raw_batch, std_batch, labels)``.

    Each item is a dict with ``'raw_tokens'`` and ``'standard_tokens'``
    (dicts of tensors) and a ``'label'`` tensor. The key set of the first
    item decides which token fields are stacked.
    """
    def _stack_group(group):
        field_names = batch[0][group]
        return {
            name: torch.stack([example[group][name] for example in batch])
            for name in field_names
        }

    raw_batch = _stack_group('raw_tokens')
    std_batch = _stack_group('standard_tokens')
    label_batch = torch.stack([example['label'] for example in batch])
    return raw_batch, std_batch, label_batch


In [21]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm

def train_siamese(model, train_loader, val_loader, epochs=5, lr=2e-5, device='cuda'):
    """Train a pair-scoring model with BCE loss and validate after every epoch.

    Loaders must yield ``(batch_raw, batch_std, labels)`` tuples where the
    first two are dicts of tensors and ``labels`` is a float tensor. The model
    output is treated as a match probability and thresholded at 0.5 for the
    validation metrics.

    Whenever validation accuracy improves, the current weights are written to
    ``best_siamese_column_mapper.pth``.

    Args:
        model: Module called as ``model(batch_raw, batch_std)``.
        train_loader: Sized iterable of training batches.
        val_loader: Iterable of validation batches.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for AdamW.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object with the weights of the final epoch (which
        is not necessarily the best checkpoint saved to disk).
    """
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.BCELoss()

    best_val_acc = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch_raw, batch_std, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            batch_raw = {k: v.to(device) for k, v in batch_raw.items()}
            batch_std = {k: v.to(device) for k, v in batch_std.items()}
            labels = labels.to(device)

            optimizer.zero_grad()
            # reshape(-1) instead of squeeze(): a batch holding one example
            # must stay 1-D so its size still matches the label tensor.
            outputs = model(batch_raw, batch_std).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)

        # Validation
        model.eval()
        val_preds = []
        val_labels = []
        with torch.no_grad():
            for batch_raw, batch_std, labels in val_loader:
                batch_raw = {k: v.to(device) for k, v in batch_raw.items()}
                batch_std = {k: v.to(device) for k, v in batch_std.items()}
                labels = labels.to(device)

                # 1-D scores so extend() can iterate them for any batch size
                outputs = model(batch_raw, batch_std).reshape(-1)
                preds = (outputs > 0.5).float()
                val_preds.extend(preds.cpu().numpy())
                val_labels.extend(labels.cpu().numpy())

        val_acc = accuracy_score(val_labels, val_preds)
        precision, recall, f1, _ = precision_recall_fscore_support(val_labels, val_preds, average='binary')

        print(f"Epoch {epoch+1} | Loss: {avg_loss:.4f} | Val Acc: {val_acc:.4f} | F1: {f1:.4f}")

        # Save best model (strict improvement of validation accuracy only)
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), 'best_siamese_column_mapper.pth')
            print("Best model saved!")

    return model


In [22]:
import pandas as pd
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

# Load dataset
train_df = pd.read_csv("./data/neural_column_mapping_medium_train.csv")
val_df = pd.read_csv("./data/neural_column_mapping_medium_val.csv")

# Prepare datasets and loaders
train_dataset = ColumnMappingDataset(train_df)
val_dataset = ColumnMappingDataset(val_df)

# Only the training loader is shuffled; validation keeps file order
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Initialize model
model = SiameseColumnMapper(model_name='sentence-transformers/all-MiniLM-L6-v2')

# Use the GPU when one is visible, otherwise fall back to CPU so the cell
# still runs on machines without CUDA
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Train
trained_model = train_siamese(model, train_loader, val_loader, epochs=5, device=device)


Epoch 1: 100%|██████████| 3453/3453 [03:47<00:00, 15.17it/s]


Epoch 1 | Loss: 0.2260 | Val Acc: 0.9642 | F1: 0.9198
Best model saved!


Epoch 2: 100%|██████████| 3453/3453 [03:50<00:00, 14.98it/s]


Epoch 2 | Loss: 0.0806 | Val Acc: 0.9755 | F1: 0.9443
Best model saved!


Epoch 3: 100%|██████████| 3453/3453 [03:45<00:00, 15.30it/s]


Epoch 3 | Loss: 0.0548 | Val Acc: 0.9779 | F1: 0.9504
Best model saved!


Epoch 4: 100%|██████████| 3453/3453 [03:50<00:00, 15.01it/s]


Epoch 4 | Loss: 0.0411 | Val Acc: 0.9794 | F1: 0.9534
Best model saved!


Epoch 5: 100%|██████████| 3453/3453 [03:37<00:00, 15.90it/s]


Epoch 5 | Loss: 0.0345 | Val Acc: 0.9811 | F1: 0.9569
Best model saved!


In [23]:
import torch
from transformers import AutoTokenizer

# Rebuild the architecture, then restore the saved checkpoint onto the CPU
model = SiameseColumnMapper(model_name='sentence-transformers/all-MiniLM-L6-v2')
best_weights = torch.load('best_siamese_column_mapper.pth', map_location='cpu')
model.load_state_dict(best_weights)
model.eval()  # inference mode: the classifier's dropout layer is disabled

# Tokenizer for the same checkpoint the encoder was loaded from
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')


In [24]:
def predict_top_k(model, tokenizer, raw_column, standard_columns, k=3, device='cpu', max_length=32):
    """
    Predict top-k standard columns for a raw column.

    The raw name is scored against every candidate, one pair per forward
    pass, and the highest-scoring candidates are returned.

    Args:
        model: Module called as ``model(raw_tokens, std_tokens)`` that returns
            a single-element tensor for a single pair.
        tokenizer: Callable tokenizer returning a mapping of tensors.
        raw_column: Raw column name to map.
        standard_columns: Iterable of candidate standard column names.
        k: Maximum number of matches to return.
        device: Device the model and inputs are moved to.
        max_length: Padding/truncation length used for tokenization.

    Returns:
        List of at most ``k`` dicts ``{'standard_column', 'confidence'}``
        sorted by descending confidence. Ties keep their input order.
    """
    model.to(device)

    candidates = list(standard_columns)
    if not candidates:
        return []

    def _encode(text):
        encoded = tokenizer(
            text, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt'
        )
        # The batch dimension of size 1 is kept (the model expects batched
        # input); tensors are only moved to the target device.
        return {name: tensor.to(device) for name, tensor in encoded.items()}

    # Score in eval mode so dropout cannot perturb the confidences, then put
    # the model back into whatever mode the caller had it in.
    was_training = model.training
    model.eval()
    results = []
    try:
        # The raw column is the same for every pair: tokenize it once.
        raw_tokens = _encode(raw_column)

        with torch.no_grad():
            for std_col in candidates:
                score = model(raw_tokens, _encode(std_col)).item()
                results.append({'standard_column': std_col, 'confidence': score})
    finally:
        model.train(was_training)

    # Sort by confidence, best first
    results.sort(key=lambda x: x['confidence'], reverse=True)
    return results[:k]


In [36]:
# Example raw column: the source-side name we want to map
raw_col = "warranty"

# List of all possible standard columns (the candidate pool scored against
# raw_col by predict_top_k below)
standard_cols = [
    "merchant_id",
    "warehouse_location",
    "selling_price",
    "order_id",
    "account_number",
    "transaction_amount",
    "fees",
    "opening_date",
    "manager_id",
    "product_rating",
    "account_status",
    "registration_date",
    "color",
    "shipping_city",
    "tax_amount",
    "authorization_code",
    "item_id",
    "customer_id",
    "order_total",
    "account_balance",
    "hire_date",
    "customer_name",
    "performance_rating",
    "brand_name",
    "transaction_type",
    "product_category",
    "currency_code",
    "order_source",
    "chargeback_flag",
    "product_price",
    "customer_type",
    "actual_delivery",
    "shipping_zip",
    "employee_id",
    "swift_code",
    "unit_cost",
    "discount_amount",
    "transaction_date",
    "salary",
    "item_category",
    "batch_number",
    "model_number",
    "reference_number",
    "interest_rate",
    "product_name",
    "order_status",
    "last_sold_date",
    "quantity_available",
    "customer_state",
    "created_date",
    "last_updated",
    "payment_method",
    "termination_date",
    "delivery_date",
    "product_id",
    "shipping_cost",
    "customer_segment",
    "routing_number",
    "supplier_id",
    "iban",
    "date_of_birth",
    "last_login_date",
    "review_count",
    "years_experience",
    "annual_fee",
    "account_holder_name",
    "order_date",
    "shipping_address",
    "item_code",
    "occupation",
    "account_id",
    "gender",
    "transaction_id",
    "account_type",
    "closing_date",
    "employee_name",
    "education_level",
    "product_description",
    "card_last_four",
    "item_subcategory",
    "skill_level",
    "lead_time_days",
    "phone_extension",
    "office_location",
    "employment_status",
    "benefits_eligible",
    "customer_city",
    "customer_email",
    "minimum_balance",
    "item_name",
    "reorder_point",
    "monthly_fee",
    "last_received_date",
    "expiration_date",
    "product_weight",
    "order_priority",
    "vacation_days",
    "quantity_on_hand",
    "order_notes",
    "risk_score",
    "product_dimensions",
    "card_type",
    "overdraft_limit",
    "loyalty_points",
    "customer_status",
    "material",
    "emergency_contact",
    "branch_code",
    "customer_zip",
    "processing_time",
    "terminal_id",
    "quantity_reserved",
    "department",
    "transaction_status",
    "product_cost",
    "last_transaction_date",
    "reorder_quantity",
    "coupon_code",
    "income_range",
    "inventory_quantity",
    "customer_country",
    "warranty_period",
    "customer_address",
    "vendor_id",
    "size",
    "shipping_state",
    "estimated_delivery",
    "job_title",
    "preferred_language",
    "customer_phone",
    "employee_email",
    "markup_percentage",
    "account_currency",
    "net_amount",
    "sick_days",
]

# Score raw_col against every candidate and keep the three best
top_matches = predict_top_k(
    model,
    tokenizer,
    raw_col,
    standard_cols,
    k=3,
)

# One result dict per line, best match first
for match in top_matches:
    print(match)

{'standard_column': 'warranty_period', 'confidence': 0.99979168176651}
{'standard_column': 'shipping_state', 'confidence': 0.03150755539536476}
{'standard_column': 'order_date', 'confidence': 0.03122042305767536}
