### T5 TRAINING SCRIPT

In [None]:
import pandas as pd

# Load the raw training CSV for a quick visual check before running the
# training cell. The file is read with the 'latin' (Latin-1) codec, the same
# encoding the training script below uses for this file.
df = pd.read_csv("master_data.csv", encoding='latin')
# Expression statement: in a notebook cell this displays the first rows.
df.head()

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from datasets import Dataset
from sklearn.model_selection import train_test_split
import random

def main():
    """Fine-tune T5 to predict placement attributes and score a held-out test set.

    Steps:
      1. Read ``./master_data.csv`` and normalise its column names.
      2. Build one source string (campaign + placement name) and one target
         string (placement group, publisher, tactic, audience, ad type) per row.
      3. Split 80/10/10 into train/validation/test sets.
      4. Fine-tune ``t5-base`` with ``Seq2SeqTrainer`` and save the model and
         tokenizer to the output directory.
      5. Generate predictions for the test split and write them, parsed into
         one column per attribute, to ``test_predictions.csv``.

    Raises:
        ValueError: if a required column is missing from the CSV, or if no
            rows remain after dropping rows with missing required values.
    """
    # Check for device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    # Model parameters
    model_params = {
        "MODEL": "t5-base",
        "TRAIN_BATCH_SIZE": 32,
        "VALID_BATCH_SIZE": 32,
        "TRAIN_EPOCHS": 5,
        "LEARNING_RATE": 5e-5,
        "MAX_SOURCE_TEXT_LENGTH": 200,
        "MAX_TARGET_TEXT_LENGTH": 128,
        "SEED": 42,
        "OUTPUT_DIR": "./model_outputs_2025_train5"
    }

    # Set seeds for reproducibility
    torch.manual_seed(model_params["SEED"])
    np.random.seed(model_params["SEED"])
    random.seed(model_params["SEED"])
    torch.backends.cudnn.deterministic = True

    # Load and prepare data
    data_path = './master_data.csv'
    df = pd.read_csv(data_path, encoding='latin')
    df.columns = df.columns.str.strip().str.upper().str.replace(r'\s+', '_', regex=True)

    required_columns = ['CAMPAIGN', 'PUBLISHER', 'PLACEMENT_NAME', 'PLACEMENT_GROUP', 'TACTIC', 'AUDIENCE', 'AD_TYPE']

    # Fail early with a readable message instead of a KeyError from dropna.
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Clean missing values
    df = df.dropna(subset=required_columns).reset_index(drop=True)
    if df.empty:
        raise ValueError("No rows left after dropping rows with missing required values.")

    # Include necessary columns
    df_rel = df[required_columns]

    # Source text fed to the encoder.
    placement_info = [
        f"Campaign: {campaign}, Placement Name: {placement_name}"
        for campaign, placement_name in zip(df_rel['CAMPAIGN'], df_rel['PLACEMENT_NAME'])
    ]

    # Target text: every attribute to predict, joined as "Key: value; ..." so
    # that parse_output() below can split it back into columns.
    target_text = [
        f"Placement Group: {group}; "
        f"Publisher: {publisher}; "
        f"Tactic: {tactic}; "
        f"Audience: {audience}; "
        f"Ad Type: {ad_type}"
        for group, publisher, tactic, audience, ad_type in zip(
            df_rel['PLACEMENT_GROUP'],
            df_rel['PUBLISHER'],
            df_rel['TACTIC'],
            df_rel['AUDIENCE'],
            df_rel['AD_TYPE'],
        )
    ]

    # Prepare the dataset. Both columns are built from f-strings, so every
    # entry is guaranteed to be a str and needs no separate type check.
    df_pg = pd.DataFrame({'target_text': target_text, 'placement_info': placement_info})

    # Split data into train (80%), validation (10%), and test (10%) sets
    train_df, temp_df = train_test_split(df_pg, test_size=0.2, random_state=model_params["SEED"])
    val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=model_params["SEED"])
    train_df = train_df.reset_index(drop=True)
    val_df = val_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)

    # Initialize tokenizer and model
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model.to(device)

    # Create datasets using 'datasets' library
    train_dataset = Dataset.from_pandas(train_df)
    val_dataset = Dataset.from_pandas(val_df)
    test_dataset = Dataset.from_pandas(test_df)

    def preprocess_function(examples):
        """Tokenize a batch of source/target strings into model inputs and labels."""
        model_inputs = tokenizer(
            examples['placement_info'],
            max_length=model_params["MAX_SOURCE_TEXT_LENGTH"],
            truncation=True,
        )
        # `text_target=` is the supported replacement for the deprecated
        # `as_target_tokenizer()` context manager.
        labels = tokenizer(
            text_target=examples['target_text'],
            max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
            truncation=True,
        )
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    # Map the preprocessing function and remove original columns
    tokenized_train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=train_dataset.column_names
    )
    tokenized_val_dataset = val_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=val_dataset.column_names
    )
    tokenized_test_dataset = test_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=test_dataset.column_names
    )

    # Initialize DataCollatorForSeq2Seq (pads labels with -100 so the padded
    # positions are ignored by the loss)
    data_collator = DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        model=model,
        padding=True,
        label_pad_token_id=-100,
    )

    def compute_metrics(eval_pred):
        """Decode generated ids, print a few samples, and return exact-match accuracy."""
        predictions, labels = eval_pred

        # If predictions are in tuple format, get the first element
        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # Move tensors to CPU and convert to NumPy arrays
        if isinstance(predictions, torch.Tensor):
            predictions = predictions.cpu().numpy()
        if isinstance(labels, torch.Tensor):
            labels = labels.cpu().numpy()

        # -100 is a padding marker, not a vocabulary id, and cannot be decoded.
        # It appears in the labels (collator padding) and can also appear in
        # the predictions when batches of different lengths are concatenated.
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

        # Decode predictions and labels
        decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Print sample predictions
        print("\nSample Predictions:")
        for i in range(min(3, len(decoded_preds))):
            print(f"\nSample {i+1}:")
            input_ids = tokenized_val_dataset[i]['input_ids']
            input_text = tokenizer.decode(input_ids, skip_special_tokens=True)
            print(f"Input: {input_text}")
            print(f"Prediction: {decoded_preds[i]}")
            print(f"Reference: {decoded_labels[i]}")

        # Share of samples whose full decoded output equals the reference.
        matches = [
            pred.strip() == ref.strip()
            for pred, ref in zip(decoded_preds, decoded_labels)
        ]
        exact_match = float(np.mean(matches)) if matches else 0.0
        return {"exact_match": exact_match}

    # Training args
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; switch if the installed version rejects the old name.
    # NOTE(review): T5 checkpoints are reported to be numerically unstable
    # under fp16; consider bf16 if the loss becomes NaN. Confirm on your setup.
    training_args = Seq2SeqTrainingArguments(
        output_dir=model_params["OUTPUT_DIR"],
        num_train_epochs=model_params["TRAIN_EPOCHS"],
        per_device_train_batch_size=model_params["TRAIN_BATCH_SIZE"],
        per_device_eval_batch_size=model_params["VALID_BATCH_SIZE"],
        learning_rate=model_params["LEARNING_RATE"],
        evaluation_strategy='epoch',
        save_strategy='epoch',
        logging_dir='./logs',
        predict_with_generate=True,
        # Without this, evaluation-time generation falls back to the model's
        # default max length and the decoded predictions come out truncated.
        generation_max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
        fp16=torch.cuda.is_available(),
        logging_steps=10,
        save_total_limit=2,
        push_to_hub=False,
        report_to="none",
        gradient_accumulation_steps=2,
        gradient_checkpointing=False  # Set to False to avoid potential issues
    )

    # Initialize trainer
    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train_dataset,
        eval_dataset=tokenized_val_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train model
    trainer.train()

    # Save model and tokenizer
    trainer.save_model(model_params["OUTPUT_DIR"])
    tokenizer.save_pretrained(model_params["OUTPUT_DIR"])

    # Evaluate model
    eval_results = trainer.evaluate()
    print(f"Evaluation results: {eval_results}")

    # **Prediction Step**
    print("Generating predictions on test set...")
    test_loader = torch.utils.data.DataLoader(
        tokenized_test_dataset,
        batch_size=model_params["VALID_BATCH_SIZE"],
        collate_fn=data_collator
    )

    model.eval()
    predictions = []
    actuals = []
    inputs_list = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch.get('labels')
            if labels is not None:
                labels = labels.cpu().numpy()

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=model_params["MAX_TARGET_TEXT_LENGTH"],
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=0.8,
                early_stopping=False
            )

            preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            inputs = tokenizer.batch_decode(input_ids, skip_special_tokens=True)

            predictions.extend(preds)
            inputs_list.extend(inputs)

            if labels is not None:
                # Replace -100 in labels as we can't decode them
                labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
                targets = tokenizer.batch_decode(labels, skip_special_tokens=True)
                actuals.extend(targets)
            else:
                actuals.extend([''] * len(preds))

    def parse_output(text):
        """Split a "Key: value; Key: value" string into a dict.

        Each ';'-separated segment is parsed on its own, so one malformed
        segment (common in generated text) is reported and skipped without
        discarding the fields that follow it.
        """
        parsed = {}
        for item in text.split(';'):
            item = item.strip()
            if not item:
                continue
            key, separator, value = item.partition(':')
            if not separator:
                print(f"Parsing error: no ':' in segment {item!r}\nText: {text}")
                continue
            parsed[key.strip()] = value.strip()
        return parsed

    parsed_predictions = [parse_output(pred) for pred in predictions]
    parsed_actuals = [parse_output(act) for act in actuals]

    # Create a DataFrame for outputs: actual/predicted column pair per attribute
    output_columns = {'Input Text': inputs_list}
    for field in ('Placement Group', 'Publisher', 'Tactic', 'Audience', 'Ad Type'):
        output_columns[f'Actual {field}'] = [pa.get(field, '') for pa in parsed_actuals]
        output_columns[f'Predicted {field}'] = [pp.get(field, '') for pp in parsed_predictions]
    output_df = pd.DataFrame(output_columns)

    os.makedirs(model_params["OUTPUT_DIR"], exist_ok=True)
    predictions_path = os.path.join(model_params["OUTPUT_DIR"], 'test_predictions.csv')
    output_df.to_csv(predictions_path, index=False)
    print("Predictions saved to:", predictions_path)

if __name__ == "__main__":
    main()


### INFERENCE SCRIPT

In [None]:
import torch
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import os

class PlacementPredictor:
    """Generate placement attributes from campaign/placement names with a fine-tuned T5 model.

    The model is expected to emit text of the form
    ``"Placement Group: ...; Publisher: ...; Tactic: ...; Audience: ...; Ad Type: ..."``,
    which :meth:`parse_output` turns into a dictionary.
    """

    def __init__(self, model_path, device=None, max_source_length=200, max_target_length=128):
        """Load the tokenizer and model from ``model_path``.

        Args:
            model_path: Directory (or model id) holding the saved model and tokenizer.
            device: Device to run on; defaults to CUDA when available, else CPU.
            max_source_length: Token limit applied to each input text.
            max_target_length: Token limit for each generated output.
        """
        self.device = device if device else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model_path = model_path
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length

        print(f"Using device: {self.device}")
        print(f"Loading model from: {self.model_path}")

        # Load tokenizer and model
        self.tokenizer = T5Tokenizer.from_pretrained(self.model_path)
        self.model = T5ForConditionalGeneration.from_pretrained(self.model_path).to(self.device)
        self.model.eval()

    def prepare_input(self, campaign, placement_name):
        """Prepare input text in the same format as training data"""
        return f"Campaign: {campaign}, Placement Name: {placement_name}"

    def parse_output(self, text):
        """Parse a ``"Key: value; Key: value"`` string into a dictionary.

        Each ';'-separated segment is handled independently: a segment
        without a ':' is reported and skipped, and the remaining segments are
        still parsed. Empty segments (e.g. from a trailing ';') are ignored.
        Only the first ':' in a segment separates key from value.

        Args:
            text: Decoded model output.

        Returns:
            dict mapping each stripped key to its stripped value; empty when
            nothing could be parsed.
        """
        parsed = {}
        if not isinstance(text, str):
            print(f"Parsing error: expected str, got {type(text).__name__}\nText: {text}")
            return parsed

        for item in text.split(';'):
            item = item.strip()
            if not item:
                continue
            key, separator, value = item.partition(':')
            if not separator:
                print(f"Parsing error: no ':' in segment {item!r}\nText: {text}")
                continue
            parsed[key.strip()] = value.strip()
        return parsed

    def predict(self, input_texts, batch_size=32):
        """Generate and parse predictions for one or more input texts.

        Args:
            input_texts: A single string or an iterable of strings, each
                formatted as produced by :meth:`prepare_input`.
            batch_size: Number of texts sent through the model at once.

        Returns:
            list of dicts (one per input, in order) as returned by
            :meth:`parse_output`.

        Raises:
            ValueError: if ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        if isinstance(input_texts, str):
            input_texts = [input_texts]
        else:
            # The tokenizer expects a plain list; this also accepts tuples,
            # generators, and pandas Series.
            input_texts = list(input_texts)

        predictions = []

        # Process in batches
        for start in range(0, len(input_texts), batch_size):
            batch_texts = input_texts[start:start + batch_size]

            # Tokenize inputs
            inputs = self.tokenizer(
                batch_texts,
                max_length=self.max_source_length,
                truncation=True,
                padding=True,
                return_tensors="pt"
            ).to(self.device)

            # Generate predictions (no gradients needed at inference time)
            with torch.no_grad():
                outputs = self.model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=self.max_target_length,
                    num_beams=2,
                    repetition_penalty=2.5,
                    length_penalty=0.8,
                    early_stopping=False
                )

            # Decode predictions
            predictions.extend(
                self.tokenizer.batch_decode(outputs, skip_special_tokens=True)
            )

        # Parse predictions
        return [self.parse_output(pred) for pred in predictions]

def load_and_prepare_data(file_path, encoding=None):
    """Load the CSV to predict on and normalise its column names.

    Column names are stripped, upper-cased, and have whitespace runs replaced
    by ``_`` so they match the format used at training time.

    Args:
        file_path: Path of the CSV file to read.
        encoding: Text encoding of the file. When ``None`` the pandas default
            (UTF-8) is tried first and, if the file cannot be decoded, the
            read is retried with ``'latin'`` — the encoding the training
            script uses for its data.

    Returns:
        pandas.DataFrame with normalised column names.

    Raises:
        ValueError: if ``CAMPAIGN`` or ``PLACEMENT_NAME`` is missing.
        Exception: any error raised while reading the file is printed and
            re-raised unchanged.
    """
    try:
        # Read the CSV file
        if encoding is not None:
            df = pd.read_csv(file_path, encoding=encoding)
        else:
            try:
                df = pd.read_csv(file_path)
            except UnicodeDecodeError:
                # Latin-1 can decode any byte sequence, so this retry only
                # fails for non-encoding reasons.
                df = pd.read_csv(file_path, encoding='latin')

        # Convert column names to match training format
        df.columns = df.columns.str.strip().str.upper().str.replace(r'\s+', '_', regex=True)

        # Ensure required columns exist
        required_columns = ['CAMPAIGN', 'PLACEMENT_NAME']
        missing_columns = [col for col in required_columns if col not in df.columns]

        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        return df

    except Exception as e:
        print(f"Error loading data: {e}")
        raise

def main():
    """Run batch inference on a CSV file and save the predicted attributes.

    Loads the model from ``model_path``, builds one input text per row from
    the ``CAMPAIGN`` and ``PLACEMENT_NAME`` columns, writes the predicted
    attributes into the ``PLACEMENT_GROUP``, ``PUBLISHER``, ``TACTIC``,
    ``AUDIENCE`` and ``AD_TYPE`` columns (created or overwritten), and saves
    the frame to ``<output_dir>/predictions.csv``.

    Raises:
        Exception: any failure is printed and then re-raised so the caller
            (or the notebook/shell) sees that the run did not succeed.
    """
    # Configuration
    model_path = "./folder_with_model"  # update model path
    input_file_path = "./file_to_predict.csv"  # update input file path
    output_dir = "./output"

    # Output column -> key emitted by the model for that attribute.
    output_fields = {
        'PLACEMENT_GROUP': 'Placement Group',
        'PUBLISHER': 'Publisher',
        'TACTIC': 'Tactic',
        'AUDIENCE': 'Audience',
        'AD_TYPE': 'Ad Type',
    }

    try:
        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Initialize predictor
        predictor = PlacementPredictor(model_path)

        # Load and prepare data
        df = load_and_prepare_data(input_file_path)

        # Prepare input texts
        input_texts = [
            predictor.prepare_input(campaign, placement_name)
            for campaign, placement_name in zip(df['CAMPAIGN'], df['PLACEMENT_NAME'])
        ]

        # Get predictions
        predictions = predictor.predict(input_texts)

        # Assign whole columns at once: this does not depend on the frame's
        # index labels, always creates the columns (even for an empty input),
        # and avoids writing strings cell-by-cell into an existing column of
        # another dtype.
        for column, key in output_fields.items():
            df[column] = [pred.get(key, '') for pred in predictions]

        # Save results
        output_path = os.path.join(output_dir, 'predictions.csv')
        df.to_csv(output_path, index=False)
        print(f"Predictions saved to: {output_path}")

        # Print sample predictions
        print("\nSample Predictions (first 3 rows):")
        for input_text, pred in zip(input_texts[:3], predictions[:3]):
            print(f"\nInput: {input_text}")
            print(f"Predictions:")
            print(f"Placement Group: {pred.get('Placement Group', '')}")
            print(f"Publisher: {pred.get('Publisher', '')}")
            print(f"Tactic: {pred.get('Tactic', '')}")
            print(f"Audience: {pred.get('Audience', '')}")
            print(f"Ad Type: {pred.get('Ad Type', '')}")

    except Exception as e:
        print(f"Error during prediction process: {e}")
        # Re-raise so a failed run is not mistaken for a successful one.
        raise

if __name__ == "__main__":
    main()