In [1]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertForMaskedLM, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import Trainer, TrainingArguments

import xgboost as xgb
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error


In [2]:
def _read_catalog_csv(csv_path):
    """Read one CSV file and normalise its ``catalog_content`` column.

    Missing descriptions become empty strings and surrounding whitespace is
    stripped, so later cells can treat the column as plain text.
    """
    frame = pd.read_csv(csv_path)
    frame["catalog_content"] = frame["catalog_content"].fillna("").str.strip()
    return frame


train_df = _read_catalog_csv("/content/train.csv")
test_df = _read_catalog_csv("/content/test.csv")

# Quick visual sanity check of both frames.
print(train_df.head())
print(test_df.head())


   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  
   sample_id                                    catalog_content  \
0     100179  Item Name: Rani 14-Spice Eshamaya's Mango Chut...   
1     245611  Item Name: Natural MILK TEA Flavoring extract ...   
2     146263  Item Name:

In [3]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, returned as a fraction.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred|) / 2))``.
    The result is NOT multiplied by 100; multiply it yourself if you need a
    percentage.

    Args:
        y_true: Array-like of ground-truth values (list, ndarray, Series, ...).
        y_pred: Array-like of predictions, same length as ``y_true``.

    Returns:
        The mean of the per-element symmetric errors as a NumPy float.
    """
    # Coerce to float ndarrays first: this lets plain lists work, prevents
    # unsigned-integer wrap-around in the subtraction, and stops pandas from
    # aligning two Series by index label (which would silently yield NaN).
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_true - y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # The clip keeps 0/0 (both values zero) from producing NaN; it yields 0.
    return np.mean(numerator / np.clip(denominator, 1e-8, None))


In [10]:
class LoraMLMTrainer:
    """Adapts a BERT masked-language model to a text corpus with LoRA adapters.

    The constructor loads the tokenizer and ``BertForMaskedLM`` weights for
    ``model_name`` and wraps the model with LoRA adapters on the attention
    ``query`` and ``value`` projections. ``train`` then runs one epoch of
    masked-language-model training and saves the result to disk.
    """

    def __init__(self, model_name="bert-base-uncased", lora_r=16, lora_alpha=32, lora_dropout=0.05,
                 max_length=128):
        """Load the base model and attach LoRA adapters.

        Args:
            model_name: Name or path passed to ``from_pretrained`` for both
                the tokenizer and the model.
            lora_r: LoRA rank.
            lora_alpha: LoRA scaling factor.
            lora_dropout: Dropout applied inside the LoRA layers.
            max_length: Maximum number of tokens kept per text; longer texts
                are truncated during tokenization.
        """
        self.max_length = max_length
        self.tokenizer = BertTokenizerFast.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)

        # NOTE(review): PEFT's task types do not appear to include a dedicated
        # masked-LM entry. "CAUSAL_LM" is kept from the original configuration;
        # confirm the wrapper it selects is appropriate for BertForMaskedLM, or
        # consider omitting task_type to get PEFT's generic wrapper.
        peft_config = LoraConfig(
            r=lora_r,
            lora_alpha=lora_alpha,
            lora_dropout=lora_dropout,
            target_modules=["query", "value"],
            bias="none",
            task_type="CAUSAL_LM",
        )

        # Attach the LoRA adapters to the base model.
        self.model = get_peft_model(self.model, peft_config)

    def tokenize_function(self, examples):
        """Tokenize ``examples["text"]`` with truncation to ``self.max_length``.

        No padding is applied here; padding is left to the data collator used
        in ``train``.
        """
        return self.tokenizer(
            examples["text"],
            truncation=True,
            max_length=self.max_length,
        )

    def train(self, texts, output_dir="lora_mlm_model"):
        """Run masked-LM training on ``texts`` and save the model and tokenizer.

        Args:
            texts: Iterable of strings forming the training corpus.
            output_dir: Directory used for trainer checkpoints and for the
                final ``save_pretrained`` calls.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        texts = list(texts)
        if not texts:
            raise ValueError("texts must contain at least one string")

        # Tokenize the whole corpus once, up front.
        tokenized = self.tokenize_function({"text": texts})

        class SimpleDS(Dataset):
            """Map-style dataset exposing one tokenized example per index."""

            def __init__(self, encodings):
                self.encodings = encodings

            def __len__(self):
                return len(self.encodings["input_ids"])

            def __getitem__(self, idx):
                return {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}

        train_dataset = SimpleDS(tokenized)

        # Random masking for the MLM objective happens in the collator.
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=True,
            mlm_probability=0.15,
        )

        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=16,
            gradient_accumulation_steps=2,
            num_train_epochs=1,
            logging_steps=100,
            save_steps=5000,
            remove_unused_columns=False,
            # Mixed precision needs a CUDA device; enabling it unconditionally
            # makes the notebook fail on a CPU-only runtime.
            fp16=torch.cuda.is_available(),
            report_to="none",
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            data_collator=data_collator,
        )

        trainer.train()

        # NOTE(review): self.model is the PEFT-wrapped model, so this
        # presumably writes adapter files rather than a full checkpoint --
        # confirm that whatever loads output_dir later can handle that.
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)


In [None]:
# Adapt the language model to the training-set product descriptions.
mlm_trainer = LoraMLMTrainer()
texts = train_df["catalog_content"].tolist()
mlm_trainer.train(texts, output_dir="lora_mlm_model")


Loading weights:   0%|          | 0/202 [00:00<?, ?it/s]

BertForMaskedLM LOAD REPORT from: bert-base-uncased
Key                         | Status     |  | 
----------------------------+------------+--+-
bert.pooler.dense.weight    | UNEXPECTED |  | 
cls.seq_relationship.bias   | UNEXPECTED |  | 
cls.seq_relationship.weight | UNEXPECTED |  | 
bert.pooler.dense.bias      | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Step,Training Loss


In [None]:
class TextEmbedder:
    """Turns text into fixed-size vectors using a saved BERT MLM checkpoint.

    Each text is encoded by the model's BERT encoder and the final hidden
    states of its real (non-padding) tokens are averaged into one vector.
    """

    def __init__(self, model_path="lora_mlm_model"):
        """Load tokenizer and model from ``model_path`` and move the model to GPU if present.

        NOTE(review): if ``model_path`` contains only LoRA adapter files, this
        load presumably relies on transformers' PEFT integration to resolve
        the base model -- confirm for the installed library versions.
        """
        self.tokenizer = BertTokenizerFast.from_pretrained(model_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)
        self.model.eval()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

    def embed_batch(self, texts):
        """Embed a list of strings.

        Args:
            texts: List of strings; they are padded to the longest one in the
                batch and truncated to 128 tokens.

        Returns:
            NumPy array with one row per input text.
        """
        tokens = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt",
        ).to(self.device)

        with torch.no_grad():
            outputs = self.model.bert(**tokens)
            hidden = outputs.last_hidden_state

            # Average only over real tokens. A plain mean over the sequence
            # axis would also average padding positions, so a text's embedding
            # would change with the amount of padding its batch required.
            mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            summed = (hidden * mask).sum(dim=1)
            # Guard against division by zero for an all-padding row.
            counts = mask.sum(dim=1).clamp(min=1.0)
            embeddings = summed / counts

        return embeddings.cpu().numpy()

    def generate_embeddings(self, df, batch_size=64):
        """Embed ``df["catalog_content"]`` in batches and stack the results.

        Args:
            df: DataFrame with a ``catalog_content`` text column.
            batch_size: Number of rows embedded per forward pass.

        Returns:
            NumPy array with one row per row of ``df``; for an empty ``df``
            an array with zero rows and ``hidden_size`` columns.
        """
        all_embeddings = []
        for start in range(0, len(df), batch_size):
            batch_text = df["catalog_content"].iloc[start:start + batch_size].tolist()
            all_embeddings.append(self.embed_batch(batch_text))

        if not all_embeddings:
            # np.vstack raises on an empty list; return a well-shaped empty array.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        return np.vstack(all_embeddings)


In [None]:
# Encode both splits with the adapted model.
embedder = TextEmbedder(model_path="lora_mlm_model")
train_embeddings = embedder.generate_embeddings(train_df)
test_embeddings = embedder.generate_embeddings(test_df)

# Persist the matrices to disk as .npy files.
for npy_path, matrix in (
    ("train_embeds.npy", train_embeddings),
    ("test_embeds.npy", test_embeddings),
):
    np.save(npy_path, matrix)


In [None]:
class EnsembleRegressor:
    """Weighted blend of an XGBoost regressor and a LightGBM regressor.

    Both members must be fitted (``train_xgb`` and ``train_lgbm``) before
    ``predict`` is called.
    """

    def __init__(self):
        # Fitted models; None until the corresponding train_* method has run.
        self.xgb_model = None
        self.lgbm_model = None

    def train_xgb(self, X_train, y_train):
        """Fit the XGBoost member and store it on ``self.xgb_model``."""
        model = xgb.XGBRegressor(
            n_estimators=500,
            learning_rate=0.03,
            max_depth=8,
            subsample=0.8,
            colsample_bytree=0.8,
            objective="reg:squarederror",
            # tree_method="gpu_hist" is deprecated since XGBoost 2.0; the
            # supported spelling is the "hist" method plus a device selector.
            tree_method="hist",
            device="cuda" if torch.cuda.is_available() else "cpu",
        )
        model.fit(X_train, y_train)
        self.xgb_model = model

    def train_lgbm(self, X_train, y_train):
        """Fit the LightGBM member and store it on ``self.lgbm_model``."""
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.03,
            "num_leaves": 64,
            "feature_fraction": 0.8,
            "bagging_fraction": 0.8,
            "bagging_freq": 5,
        }
        dataset = lgb.Dataset(X_train, y_train)
        self.lgbm_model = lgb.train(params, dataset, num_boost_round=500)

    def predict(self, X, w1=0.5, w2=0.5):
        """Return ``w1 * xgb_prediction + w2 * lgbm_prediction`` for ``X``.

        Args:
            X: Feature matrix accepted by both fitted models.
            w1: Weight of the XGBoost prediction.
            w2: Weight of the LightGBM prediction.

        Raises:
            RuntimeError: If either member has not been trained yet.
        """
        if self.xgb_model is None or self.lgbm_model is None:
            raise RuntimeError(
                "EnsembleRegressor.predict called before train_xgb and train_lgbm"
            )
        p1 = self.xgb_model.predict(X)
        p2 = self.lgbm_model.predict(X)
        return w1 * p1 + w2 * p2


In [None]:
# Features are the text embeddings; the target is the listed price.
X, y = train_embeddings, train_df["price"].values

# Hold out 20% of the rows for validation (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit both ensemble members on the same training split.
ensemble = EnsembleRegressor()
ensemble.train_xgb(X_train, y_train)
ensemble.train_lgbm(X_train, y_train)

# Score the blended prediction on the held-out rows.
val_pred = ensemble.predict(X_val)
val_mae = mean_absolute_error(y_val, val_pred)
val_smape = smape(y_val, val_pred)
print("MAE:", val_mae)
print("SMAPE:", val_smape)


In [None]:
# Predict test prices and floor them at zero so no negative price is written.
final_pred = np.clip(ensemble.predict(test_embeddings), 0.0, None)

submission = pd.DataFrame(
    {"sample_id": test_df["sample_id"], "price": final_pred}
)
submission.to_csv("test_out.csv", index=False)
print("Saved test_out.csv")
