In [None]:
'''
Imports
'''

# General
import json
import numpy as np
from typing import List, Dict
from collections import Counter
import re
import pandas as pd
try:
  import wandb
except:
  ! pip install wandb
  import wandb


try:
  from datasets import Dataset
except:
  ! pip install datasets
  from datasets import Dataset

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score, recall_score, f1_score



# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset

# Transformers
try:
    from transformers import (
        T5TokenizerFast,
        T5Tokenizer,
        T5ForConditionalGeneration,
        Trainer,
        TrainingArguments,
        DataCollatorForSeq2Seq,
        Seq2SeqTrainer,
        Seq2SeqTrainingArguments
    )
except:
    ! pip install transformers
    from transformers import (
        T5TokenizerFast,
        T5Tokenizer,
        T5ForConditionalGeneration,
        Trainer,
        TrainingArguments,
        DataCollatorForSeq2Seq,
        Seq2SeqTrainer,
        Seq2SeqTrainingArguments
    )

! pip install accelerate
! pip install fuzzywuzzy[speedup]

In [None]:
import os

# Uncomment to stop W&B from syncing to the server during this session.
#os.environ["WANDB_MODE"] = "offline"

# `google.colab` only exists inside a Colab runtime. Import it inside the same
# guard that protects the mount, so running this cell anywhere else does not
# fail with ModuleNotFoundError before the guard is even evaluated.
if "COLAB_GPU" in os.environ:
    from google.colab import drive
    drive.mount('/content/drive')

# Folder that receives the prediction JSON files written by the run functions.
# NOTE(review): this is a Colab/Drive path; adjust it when running elsewhere.
output_dir = "/content/drive/MyDrive/t5_outputs"
os.makedirs(output_dir, exist_ok=True)

In [None]:
"""
Formats raw email chain and structured tenant profile data for T5 model training or inference.

Args:
    - data (List[Dict]): A list of dictionaries, where each dictionary contains:
        - "email_chain": A list of emails (each with a "body" key).
        - "tenant_profile": A nested dictionary with tenant metadata.

Returns:
    - List[Dict[str, str]]: A list of dictionaries, each containing:
        - "input_text": A prompt string (email conversation).
        - "target_text": A semicolon-delimited string encoding the tenant profile fields.
"""
def _profile_section(parent, key):
    """Return ``parent[key]`` when it is a dict, otherwise an empty dict."""
    section = parent.get(key) if isinstance(parent, dict) else None
    return section if isinstance(section, dict) else {}


def _join_values(values):
    """Render a list-valued profile field as a "; "-separated string."""
    if values is None:
        return ""
    if isinstance(values, str):
        # A bare string would otherwise be joined character by character.
        return values
    return "; ".join(str(value) for value in values if value is not None)


def _as_text(value):
    """Stringify a scalar; a missing (None) value becomes "" instead of "None"."""
    return "" if value is None else str(value)


def format_for_t5(data: List[Dict]) -> List[Dict[str, str]]:
    """Turn raw records into (prompt, target) text pairs for T5.

    Args:
        data: Records with an "email_chain" list (each email has a "body")
            and an optional nested "tenant_profile" dict.

    Returns:
        One dict per record with:
            - "input_text": "Extract tenant profile: " followed by all email
              bodies joined by spaces, with newlines replaced by spaces.
            - "target_text": "Key: value" pairs joined by "; ". Fields whose
              value is empty/falsy are omitted.

    Raises:
        KeyError: if a record has no "email_chain" or an email has no "body".

    Notes:
        - Sections or values that are ``None`` (JSON ``null``) are treated as
          missing rather than crashing or emitting the literal "None".
        - List-valued fields are joined with the same "; " that separates
          fields, so a target can look like "MustHaves: a; b; Budget: c".
    """
    extracted = []
    for item in data:
        email_text = " ".join(email["body"].replace("\n", " ") for email in item["email_chain"])

        profile = _profile_section(item, "tenant_profile")
        representative = _profile_section(profile, "Tenant Representative Details")
        company = _profile_section(profile, "Company Details")
        prefs = _profile_section(profile, "Property Preferences")
        timing = _profile_section(prefs, "Moving Timing")

        # Insertion order of this dict defines the field order of the target.
        fields = {
            "FirstName": representative.get("First Name", ""),
            "LastName": representative.get("Last Name", ""),
            "Email": representative.get("Email", ""),
            "Phone": representative.get("Phone", ""),
            "CompanyName": company.get("Company Name", ""),
            "Industry": company.get("Industry", ""),
            "CompanySize": company.get("Company Size", ""),
            "GrowthStage": company.get("Growth Stage", ""),
            "CurrentNeighborhood": _join_values(company.get("Current Neighborhood", [])),
            "FirstInteraction": profile.get("First Interaction", ""),
            "LastInteraction": profile.get("Last Interaction", ""),
            "DecisionMakerRole": profile.get("Decision-Maker Role", ""),
            "PropertyType": prefs.get("Property Type", ""),
            "PreferredNeighborhood": _join_values(prefs.get("Preferred Neighborhood", [])),
            "Budget": prefs.get("Estimated or Stated Budget", ""),
            "MustHaves": _join_values(prefs.get("Must-Haves", [])),
            "NiceToHaves": _join_values(prefs.get("Nice-to-Haves", [])),
            "SpaceSize": prefs.get("Space Size", ""),
            "PreferredLeaseTerm": prefs.get("Preferred Lease Term", ""),
            "MovingTerm": prefs.get("Moving Term", ""),
            # Stringified so that a numeric 0 survives the truthiness filter below.
            "min_months": _as_text(timing.get("min_months", "")),
            "max_months": _as_text(timing.get("max_months", "")),
            "MovingTimeline": profile.get("Moving Timeline", ""),
            "PainPoint": _join_values(profile.get("Pain Points", [])),
            "UrgencyScore": _as_text(profile.get("Urgency Score", "")),
            "Outcome": profile.get("Outcome", ""),
            "Personality": profile.get("Tenant Personality", "")
        }

        target_text = "; ".join(f"{k}: {v}" for k, v in fields.items() if v)
        extracted.append({
            "input_text": f"Extract tenant profile: {email_text}",
            "target_text": target_text
        })
    return extracted

In [None]:
"""
Tokenizes input-output text pairs for T5 model training or evaluation.

Args:
    - examples (List[Dict[str, str]]): A list of dictionaries where each dict contains:
        - "input_text": the prompt text for the model.
        - "target_text": the expected output for the prompt.
    - tokenizer: A HuggingFace tokenizer compatible with T5 (e.g., T5Tokenizer or T5TokenizerFast).
    - max_input_length (int, optional): Maximum token length for input sequences. Defaults to 512.
    - max_target_length (int, optional): Maximum token length for target sequences. Defaults to 128.

Returns:
    - Dict[str, torch.Tensor]: A dictionary containing tokenized inputs and labels, including:
        - "input_ids": token IDs for input_texts
        - "attention_mask": attention mask for inputs
        - "labels": token IDs for target_texts
"""

def tokenize_t5_examples(examples: List[Dict[str, str]], tokenizer, max_input_length=512, max_target_length=128):
    """Tokenize a batch of prompt/target pairs in one call each.

    Args:
        examples: Dicts with "input_text" and "target_text" keys.
        tokenizer: A Hugging Face tokenizer callable that accepts ``text_target``.
        max_input_length: Length inputs are padded/truncated to.
        max_target_length: Length targets are padded/truncated to.

    Returns:
        The tokenizer output for the inputs, with an added "labels" entry
        holding the token IDs of the targets. Labels are stored exactly as the
        tokenizer produced them, including any padding IDs.
    """
    model_inputs = tokenizer(
        [ex["input_text"] for ex in examples],
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    # `text_target=` replaces the deprecated `as_target_tokenizer()` context
    # manager. Targets get their own call because their max length differs
    # from the inputs'.
    labels = tokenizer(
        text_target=[ex["target_text"] for ex in examples],
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [None]:
"""
A custom PyTorch Dataset class for formatting and tokenizing email-based tenant profile data for T5 models.

Methods:
    __len__(): Returns the number of examples in the dataset.
    __getitem__(idx): Returns tokenized input and label tensors for the example at index `idx`.

Returns:
    - Dict[str, List[int]]: A dictionary containing tokenized fields:
        - "input_ids": Token IDs for the input prompt
        - "attention_mask": Mask to ignore padding tokens in the input
        - "labels": Token IDs for the expected output (target text)
"""

class T5EmailDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one formatted example per lookup.

    Args:
        examples: Sequence of dicts with "input_text" and "target_text".
        tokenizer: Hugging Face tokenizer callable that accepts ``text_target``.
        max_input_length: Length inputs are padded/truncated to.
        max_target_length: Length targets are padded/truncated to.
    """

    def __init__(self, examples, tokenizer, max_input_length=512, max_target_length=128):
        self.examples = examples
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Number of examples held by the dataset."""
        return len(self.examples)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its inputs plus a "labels" entry."""
        example = self.examples[idx]

        # Tokenize input text
        model_inputs = self.tokenizer(
            example["input_text"],
            max_length=self.max_input_length,
            padding="max_length",
            truncation=True
        )

        # Tokenize target text as labels. `text_target=` replaces the
        # deprecated `as_target_tokenizer()` context manager.
        labels = self.tokenizer(
            text_target=example["target_text"],
            max_length=self.max_target_length,
            padding="max_length",
            truncation=True
        )

        # Labels are returned exactly as the tokenizer produced them: already
        # padded to max_target_length, with padding IDs left in place (nothing
        # here converts them to -100).
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs


In [None]:
'''
Initialize Tokenizer, Model and Data Collator
'''
# Notebook-wide globals: the dataset construction, metric decoding and the
# run functions further down all read `tokenizer`, `model` and `data_collator`
# by these exact names.

# Tokenizer for the "t5-base" checkpoint with model_max_length raised to 1024.
# NOTE(review): `truncation=True` is passed to from_pretrained here, while the
# tokenization calls elsewhere in this notebook pass `truncation=True`
# themselves; confirm this keyword has any effect at load time.
tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=1024, truncation=True)
# Pretrained "t5-base" weights. This single object is shared by every later
# cell that references `model`.
model = T5ForConditionalGeneration.from_pretrained("t5-base")
# Batch collator handed to the Trainer; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


In [None]:
'''
Read and Process each file
'''

# Directory that holds train.json / val.json / test.json.
# When running from the GitHub repo, set this to "../3. Data Split".
DATA_DIR = "."


def _load_json(filename):
    """Read one JSON data split from DATA_DIR."""
    # Be explicit about UTF-8 so that reading the email text does not depend
    # on the platform's default encoding.
    with open(os.path.join(DATA_DIR, filename), encoding="utf-8") as f:
        return json.load(f)


train_data = _load_json("train.json")
val_data = _load_json("val.json")
test_data = _load_json("test.json")

# Convert the raw records into {"input_text", "target_text"} pairs.
train = format_for_t5(train_data)
print(train[0])  # quick sanity check of one formatted example
val = format_for_t5(val_data)
test = format_for_t5(test_data)

# Wrap each split so that examples are tokenized lazily on access.
train_dataset = T5EmailDataset(train, tokenizer)
val_dataset = T5EmailDataset(val, tokenizer)
test_dataset = T5EmailDataset(test, tokenizer)


In [None]:
"""
Parse a T5 output string into a dictionary of field-value pairs.

Args:
    text (str): A semicolon-separated key-value string like 'Budget: $45k/month; SpaceSize: 3,500 sqft'
Returns:
    dict: Parsed dictionary of extracted values
"""
def parse_t5_output(text):
    """Parse a "Key: value; Key: value" string into a dict.

    Args:
        text (str): Semicolon-separated key/value string, e.g.
            'Budget: $45k/month; SpaceSize: 3,500 sqft'. Empty or None input
            yields an empty dict.

    Returns:
        dict: Stripped keys mapped to stripped values. When a key repeats, the
        last occurrence wins.

    Notes:
        List-valued fields are written with the same "; " delimiter that
        separates fields (e.g. 'MustHaves: parking; AC; Budget: 5k'). A segment
        without ":" is therefore treated as a continuation of the preceding
        field's value rather than being discarded. A segment without ":" that
        appears before any key is ignored.
    """
    entity_dict = {}
    if not text:
        return entity_dict

    current_key = None
    for segment in text.split(";"):
        if ":" in segment:
            # Split on the first colon only so values may contain ":" themselves.
            key, value = segment.split(":", 1)
            current_key = key.strip()
            entity_dict[current_key] = value.strip()
            continue

        continuation = segment.strip()
        if current_key is None or not continuation:
            continue
        previous = entity_dict[current_key]
        entity_dict[current_key] = f"{previous}; {continuation}" if previous else continuation
    return entity_dict

In [None]:
"""
Evaluates field-level precision, recall, and F1 score for T5 model outputs in structured tenant profile extraction.

Args:
    - decoded_preds (List[str]): List of decoded prediction strings from the T5 model.
    - decoded_labels (List[str]): List of decoded ground truth target strings.
    - fields (List[str]): List of field names (keys) to evaluate (e.g., "Budget", "CompanyName", etc.).

Returns:
        - Precision: TP / (TP + FP)
        - Recall: TP / (TP + FN)
        - F1-Score: Harmonic mean of precision and recall
        - Support: Number of non-empty ground-truth entries for that field
"""

def evaluate_t5_structured_fieldwise(decoded_preds, decoded_labels, fields):
    """Score predictions against labels field by field and print a table.

    Each prediction/label string is parsed with ``parse_t5_output``. For every
    requested field, values are compared by exact string equality after
    stripping whitespace:
        - label present and equal to prediction      -> true positive
        - label present, prediction present, differ  -> false positive AND false negative
        - label present, prediction missing          -> false negative
        - label missing, prediction present          -> false positive

    Args:
        decoded_preds: Decoded prediction strings.
        decoded_labels: Decoded ground-truth strings, aligned with the predictions.
        fields: Field names to evaluate.

    Returns:
        dict with "<field>_precision", "<field>_recall", "<field>_f1" for every
        field (rounded to 4 places) plus "macro_precision", "macro_recall" and
        "macro_f1", the unweighted means over fields (0.0 when there are no fields).
    """
    tallies = {name: {"tp": 0, "fp": 0, "fn": 0, "support": 0} for name in fields}

    for predicted_text, gold_text in zip(decoded_preds, decoded_labels):
        predicted = parse_t5_output(predicted_text)
        gold = parse_t5_output(gold_text)

        for name in fields:
            gold_value = gold.get(name, "").strip()
            predicted_value = predicted.get(name, "").strip()
            tally = tallies[name]

            if not gold_value:
                # Nothing expected: any prediction is spurious.
                if predicted_value:
                    tally["fp"] += 1
                continue

            tally["support"] += 1
            if predicted_value == gold_value:
                tally["tp"] += 1
                continue

            # Wrong or missing prediction: always a miss, and additionally a
            # false positive when something was predicted.
            if predicted_value:
                tally["fp"] += 1
            tally["fn"] += 1

    results = {}
    precisions, recalls, f1_scores = [], [], []

    # Compute and display metrics
    print(f"{'Field':<25} {'Precision':>10} {'Recall':>10} {'F1-Score':>10} {'Support':>10}")
    for name, tally in tallies.items():
        tp, fp, fn = tally["tp"], tally["fp"], tally["fn"]
        support = tally["support"]

        precision = tp / (tp + fp) if (tp + fp) else 0
        recall = tp / (tp + fn) if (tp + fn) else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0

        print(f"{name:<25} {precision:10.2f} {recall:10.2f} {f1:10.2f} {support:10}")

        results[f"{name}_precision"] = round(precision, 4)
        results[f"{name}_recall"] = round(recall, 4)
        results[f"{name}_f1"] = round(f1, 4)

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    if precisions:
        results["macro_precision"] = round(np.mean(precisions), 4)
        results["macro_recall"] = round(np.mean(recalls), 4)
        results["macro_f1"] = round(np.mean(f1_scores), 4)
    else:
        results["macro_precision"] = 0.0
        results["macro_recall"] = 0.0
        results["macro_f1"] = 0.0
    return results


In [None]:
"""
Used with Hugging Face Trainer to decode predictions before metrics.
Selects the token with the highest logit for each position.
"""
def preprocess_logits_for_metrics(logits, labels):
    """Reduce logits to the highest-scoring token ID at each position.

    Args:
        logits: A logits tensor, or a tuple whose first element is that tensor.
        labels: Label IDs; passed through unchanged.

    Returns:
        Tuple ``(pred_ids, labels)`` where ``pred_ids`` is the argmax of the
        logits over the last dimension.
    """
    scores = logits[0] if isinstance(logits, tuple) else logits
    # The last dimension is reduced, leaving one ID per position.
    return torch.argmax(scores, dim=-1), labels

In [None]:
"""
Extracts and returns a sorted list of unique field/entity keys from structured T5 prediction and label strings.

Args:
    - decoded_preds (List[str]): List of predicted output strings from the T5 model.
    - decoded_labels (List[str]): List of corresponding target label strings.

Returns:
    - List[str]: Sorted list of unique keys extracted from the ground truth label strings.

"""
def extract_entity_keys(decoded_preds, decoded_labels):
    """Collect the field names that appear in the label strings.

    A key is any run of word/whitespace characters that starts with a word
    character and is immediately followed by ":"; keys are stripped and
    lower-cased. The unique keys found in the predictions and in the labels
    are both printed for inspection, but only the label keys are returned.

    Args:
        decoded_preds: Decoded prediction strings.
        decoded_labels: Decoded ground-truth strings.

    Returns:
        Sorted list of unique keys found in ``decoded_labels``.
    """
    def collect_keys(entries):
        # Flatten every regex hit across all entries into one list.
        return [
            hit.strip().lower()
            for entry in entries
            for hit in re.findall(r'(\w[\w\s]*?):', entry)
        ]

    pred_keys = sorted(set(collect_keys(decoded_preds)))
    label_keys = sorted(set(collect_keys(decoded_labels)))

    print(f"Preds: {pred_keys}")
    print(f"Labels: {label_keys}")

    return label_keys

In [None]:
"""
This function decodes model predictions and labels, extracts structured key-value pairs, compares
predicted and ground truth values field-wise, and calculates precision, recall, and F1-score for each
field as well as macro-averaged metrics across all fields.

Args:
    - eval_preds (Tuple[np.ndarray, np.ndarray]):
        A tuple where the first element is the raw model predictions (logits or token IDs),
        and the second element is the label IDs (with -100 representing masked tokens).

Returns:
    - Tuple[Dict[str, float], List[str]]:
        - A dictionary mapping each field to its precision, recall, and F1-score, including macro-averages.
        - A list of decoded prediction strings.

"""

def compute_metrics(eval_preds):
    """Decode predictions and labels, then score them field by field.

    Relies on the notebook-level ``tokenizer`` plus ``extract_entity_keys`` and
    ``evaluate_t5_structured_fieldwise``.

    Args:
        eval_preds: ``(predictions, labels)``. ``predictions`` holds token IDs
            (or a tuple whose first element does); ``labels`` holds label IDs
            in which -100 marks positions to ignore.

    Returns:
        Tuple ``(results, decoded_preds)``: the metrics dict produced by
        ``evaluate_t5_structured_fieldwise`` and the list of decoded,
        stripped, lower-cased prediction strings. Note this is a tuple, not a
        plain metrics dict.
    """
    pred_ids, label_ids = eval_preds
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    raw_pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)

    # -100 is not a real token ID; swap it for the pad ID so decoding works
    # (pad tokens are then dropped by skip_special_tokens).
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)
    raw_label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # Normalise case and surrounding whitespace before comparing.
    decoded_preds = [str(text).strip().lower() for text in raw_pred_texts]
    decoded_labels = [str(text).strip().lower() for text in raw_label_texts]
    print(f"Preds: {decoded_preds}")
    print(f"Labels: {decoded_labels}")

    # Only fields that occur in the ground truth are evaluated.
    field_names = extract_entity_keys(decoded_preds, decoded_labels)
    field_metrics = evaluate_t5_structured_fieldwise(decoded_preds, decoded_labels, field_names)

    return field_metrics, decoded_preds

In [None]:
'''
Define sweep configurations and set up sweep agent
'''

sweep_config_t5 = {
    # Hyperparameters are sampled at random for every trial.
    'method': 'random',
    'metric': {
        # Must match a key that the training function actually logs.
        # train_t5 logs its test metrics under a "test_" prefix (for example
        # "test_macro_f1"); nothing in this notebook logs "eval_exact_match".
        'name': 'test_macro_f1',
        'goal': 'maximize'
    },
    'parameters': {
        'learning_rate': {'min': 1e-5, 'max': 5e-4},
        'per_device_train_batch_size': {'values': [4, 8, 16]},
        'num_train_epochs': {'values': [3, 5, 6]},
        'label_smoothing_factor': {'values': [0.0, 0.1]},
        'warmup_ratio': {'values': [0.0, 0.1]},
        'seed': {'values': [42, 123, 2025]}
    }
}
# Registers the sweep in the 't5_final' project; the returned ID is what the
# agent cell passes to wandb.agent.
sweep_id_t5 = wandb.sweep(sweep_config_t5, project='t5_final')


In [None]:
"""
Trains and evaluates a T5 model for structured text-to-text tenant profile extraction.

This function initializes a W&B run, configures training hyperparameters using Hugging Face's
`TrainingArguments`, and trains the T5 model on the provided training and validation datasets.
After training, it evaluates the model on a test set, computes structured field-wise metrics
(precision, recall, F1), logs them to W&B, and saves the parsed predictions as a JSON file.

Requires global variables:
    - model (PreTrainedModel): The initialized T5 model.
    - tokenizer (PreTrainedTokenizer): Tokenizer for the T5 model.
    - train_dataset, val_dataset, test_dataset: Hugging Face Datasets or compatible PyTorch datasets.
    - preprocess_logits_for_metrics, data_collator: Utility functions for training.
    - output_dir (str): Path to save final JSON output (should exist or be created before calling).

"""

def train_t5():
    """Run one W&B trial: fine-tune T5, evaluate on the test split, log results.

    Reads these notebook-level globals: ``model``, ``tokenizer``,
    ``train_dataset``, ``val_dataset``, ``test_dataset``,
    ``preprocess_logits_for_metrics``, ``data_collator``, ``compute_metrics``
    and ``output_dir``.

    Steps:
        1. Start a W&B run and read the sampled hyperparameters from its config.
        2. Fine-tune a fresh copy of the global ``model`` with the Trainer.
        3. Predict on ``test_dataset`` and score the result with ``compute_metrics``.
        4. Log each metric as "test_<name>", log the parsed predictions as a
           W&B table, and save them to "<output_dir>/<run name>_predictions.json".

    Any exception raised during training/evaluation is printed and the
    function returns without re-raising.
    """
    # Function-scope imports: only this function needs them.
    import copy
    import gc

    with wandb.init() as run:
        config = run.config

        # Train a copy so that every trial starts from the same weights as the
        # global `model`. Training the global object directly would make each
        # trial continue from the previous trial's result.
        run_model = copy.deepcopy(model)

        training_args = TrainingArguments(
            output_dir="./t5_output",
            learning_rate=config.learning_rate,
            # Use the swept batch size; fall back to 4 when the sweep does not
            # define one.
            per_device_train_batch_size=config.get("per_device_train_batch_size", 4),
            per_device_eval_batch_size=4,
            num_train_epochs=config.num_train_epochs,
            label_smoothing_factor=config.label_smoothing_factor,
            warmup_ratio=config.warmup_ratio,
            weight_decay=0.01,
            logging_dir="./logs",
            report_to="wandb",
            seed=config.seed,
            # Mixed precision is only enabled when a CUDA device is present.
            fp16=torch.cuda.is_available(),
        )

        trainer = Trainer(
            model=run_model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            preprocess_logits_for_metrics=preprocess_logits_for_metrics,
            data_collator=data_collator,
        )

        try:
            trainer.train()
            outputs = trainer.predict(test_dataset)
            # preprocess_logits_for_metrics returns (pred_ids, labels), so the
            # token IDs are the first element of `predictions`.
            metrics = compute_metrics((outputs.predictions[0], outputs.label_ids))
            print(metrics)
            wandb.log({f"test_{k}": float(v) for k, v in metrics[0].items()})

            def parse_profile_string(profile):
                # Use regex to find key-value pairs
                pattern = r'(\w+):\s*([^;]+)'
                matches = re.findall(pattern, profile)
                return {key.lower(): value.strip() for key, value in matches}

            # Parse each string into a dictionary
            parsed_data = [parse_profile_string(profile) for profile in metrics[1]]
            print(f"Parsed: {parsed_data}")

            # Convert to DataFrame
            df = pd.DataFrame(parsed_data)

            # Display the DataFrame (optional)
            print(df.head())

            # Send to wandb
            wandb_table = wandb.Table(dataframe=df)
            wandb.log({f"{run.name} Predictions": wandb_table})

            # Save all predictions to Google Drive
            pred_path = os.path.join(output_dir, f"{wandb.run.name}_predictions.json")
            df.to_json(pred_path, orient="records", indent=2)
            print(f"Predictions saved to: {pred_path}")

        except Exception as e:
            print(f"Training failed: {e}")
            return
        finally:
            # Release this trial's model and optimizer state so that memory
            # does not pile up across the trials of a sweep.
            del trainer, run_model
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()


In [None]:
'''
Run sweep agent
'''
# Start an agent for the sweep registered as `sweep_id_t5`. It calls
# `train_t5` once per trial, for up to 25 trials (`count=25`).
wandb.agent(sweep_id_t5, function=train_t5, count=25)



---



In [None]:
'''
Predict and Evaluate results on T5 model without training
'''

def run_base_t5():
    """Generate and score predictions with the global ``model`` as it currently is.

    Reads these notebook-level globals: ``model``, ``tokenizer``, ``test``
    (the formatted test examples), ``compute_metrics`` and ``output_dir``.

    Steps:
        1. Tokenize every test prompt and generate up to 128 new tokens each.
        2. Score the generations against the target texts with ``compute_metrics``.
        3. Log each metric as "test_<name>" to a W&B run named "t5_base".
        4. Save the parsed predictions to "<output_dir>/base_predictions.json"
           and log them as a W&B table.

    Any exception is printed and swallowed; nothing is returned.
    """
    print("Running base T5 model without training...")

    with wandb.init(project="t5_final", name="t5_base", reinit=True):
        try:
            # `test` is the list produced by format_for_t5(test_data).
            input_texts = [ex["input_text"] for ex in test]
            inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
            input_ids = inputs["input_ids"].to(model.device)
            attention_mask = inputs["attention_mask"].to(model.device)

            # Make sure dropout is disabled for inference.
            model.eval()
            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=128)
            print(f"outputs: {outputs}")
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_preds = [p.strip().lower() for p in decoded_preds]
            decoded_labels = [ex["target_text"].strip().lower() for ex in test]

            # Compute metrics. compute_metrics expects token IDs, so the
            # target texts are tokenized again here.
            metrics = compute_metrics((outputs, tokenizer(decoded_labels, return_tensors="np", padding=True, truncation=True, max_length=128)["input_ids"]))
            wandb.log({f"test_{k}": v.item() if hasattr(v, "item") else float(v) for k, v in metrics[0].items()})
            print(metrics)

            def parse_profile_string(profile):
                # Use regex to find key-value pairs
                pattern = r'(\w+):\s*([^;]+)'
                matches = re.findall(pattern, profile)
                return {key.lower(): value.strip() for key, value in matches}

            # Parse each string into a dictionary
            parsed_data = [parse_profile_string(profile) for profile in decoded_preds]
            print(f"Parsed: {parsed_data}")

            # Convert to DataFrame
            df = pd.DataFrame(parsed_data)

            # Display the DataFrame (optional)
            print(df.head())

            # Save all predictions to Google Drive
            pred_path = os.path.join(output_dir, "base_predictions.json")
            df.to_json(pred_path, orient="records", indent=2)
            print(f"Predictions saved to: {pred_path}")

            # Send to wandb
            wandb_table = wandb.Table(dataframe=df)
            wandb.log({"Base Predictions": wandb_table})

        except Exception as e:
            print(f"Base inference failed: {e}")

In [None]:
'''
Run base model
'''
# Runs inference with the global `model` in whatever state it is in when this
# cell executes. run_base_t5 prints any error itself instead of raising.
run_base_t5()