In [None]:
# Environment setup: install the `lightning` package that the import cell loads as `L`.
!pip install lightning

In [3]:
import os
import torch
import torch.nn.functional as F
import lightning as L
import wandb
import kagglehub
import numpy as np
import pandas as pd
import random
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import AutoTokenizer, AutoModel
from pytorch_lightning.loggers import WandbLogger

In [10]:
# Authenticate with Weights & Biases using the token stored under the
# 'WB_TOKEN' Colab secret. Logging in through the Python API avoids
# interpolating the API key into a shell command line.
from google.colab import userdata
wb_token = userdata.get('WB_TOKEN')
wandb.login(key=wb_token)

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Download latest version
path = kagglehub.dataset_download("kunalbhar/house-md-transcripts")

print("Path to dataset files:", path)
# List the directory that was actually returned rather than a hard-coded
# ".../versions/2" path, which breaks as soon as the dataset version changes.
print(os.listdir(path))

Downloading from https://www.kaggle.com/api/v1/datasets/download/kunalbhar/house-md-transcripts?dataset_version_number=2...


100%|██████████| 2.15M/2.15M [00:01<00:00, 1.94MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/kunalbhar/house-md-transcripts/versions/2
['season8.csv', 'season3.csv', 'season7.csv', 'season1.csv', 'season2.csv', 'season5.csv', 'season6.csv', 'season4.csv']


In [12]:
class HouseTranscriptDataset(Dataset):
    """Triplet dataset built from per-season transcript CSV files.

    Every item is a dict with three tokenized texts:
      * "query":    a line whose "name" column equals "House",
      * "positive": the line immediately preceding it in the same file
                    (or the line itself for the first row of a file),
      * "negative": a random House line from a different season (falls back
                    to the query when no other season is loaded).
    """

    def __init__(self, csv_files, tokenizer, max_length=384):
        """
        Args:
            csv_files: List of file paths (each representing a season). Every
                file must provide "name" and "line" columns.
            tokenizer: Tokenizer for encoding text
            max_length: Max token length for padding
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        # season name -> lines of every other season; filled lazily so the
        # O(N) scan happens once per season instead of once per sample.
        self._negative_pools = {}

        # Load transcripts from all seasons
        for file in csv_files:
            df = pd.read_csv(file, encoding="latin-1")
            season_name = os.path.splitext(os.path.basename(file))[0]
            df["season"] = season_name
            # Remove bracketed `[...]` spans and surrounding whitespace.
            df['line'] = df['line'].str.replace(r'\[.*?\]', '', regex=True).str.strip()
            # Computed before the speaker filter so the previous line may
            # belong to any speaker; the first row falls back to itself.
            df['prev'] = df['line'].shift(1).fillna(df['line'])
            # Drop House rows that have no usable text: a NaN would crash the
            # tokenizer and an empty query carries no training signal.
            has_text = df['line'].notna() & (df['line'] != '')
            df = df[(df["name"] == "House") & has_text]

            self.data.extend(df.to_dict("records"))

    def __len__(self):
        return len(self.data)

    def _negative_pool(self, season):
        """Return (and cache) all stored lines whose season differs from `season`."""
        pool = self._negative_pools.get(season)
        if pool is None:
            pool = [entry["line"] for entry in self.data if entry["season"] != season]
            self._negative_pools[season] = pool
        return pool

    def __getitem__(self, idx):
        query_entry = self.data[idx]
        query = query_entry["line"]
        season = query_entry["season"]
        positive = query_entry["prev"]

        # Pick a random negative sample from a different season
        different_season_phrases = self._negative_pool(season)
        negative = random.choice(different_season_phrases) if different_season_phrases else query  # Fallback to query

        # Tokenize inputs
        encode = lambda text: self.tokenizer(text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
        query_enc = encode(query)
        pos_enc = encode(positive)
        neg_enc = encode(negative)

        return {
            "query": query_enc, "positive": pos_enc, "negative": neg_enc
        }

In [13]:
class BiEncoder(L.LightningModule):
    """Bi-encoder fine-tuned with a triplet margin loss.

    A single pretrained transformer embeds the query, positive and negative
    texts; embeddings are mean-pooled over non-padding tokens and L2-normalized.

    NOTE(review): the captured training log lists the `model` submodule in
    eval mode, so dropout appears to be inactive during training — confirm
    that this is intended.
    """

    def __init__(self, model_name='sentence-transformers/all-MiniLM-L6-v2', lr=2e-5):
        """
        Args:
            model_name: Hugging Face identifier of the pretrained encoder.
            lr: Learning rate passed to AdamW.
        """
        super().__init__()
        self.save_hyperparameters()  # Logs model hyperparameters
        self.model = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.lr = lr
        self.loss_fn = torch.nn.TripletMarginLoss(margin=1.0)

    def mean_pooling(self, model_output, attention_mask):
        """
        Applies mean pooling over token embeddings using the attention mask.

        Returns the L2-normalized pooled embeddings.
        """
        token_embeddings = model_output.last_hidden_state
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.shape).float()
        # The clamp guards against division by zero for an all-padding row.
        pooled_embeddings = (token_embeddings * input_mask_expanded).sum(1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return F.normalize(pooled_embeddings, p=2, dim=1)

    def forward(self, inputs):
        """Embed a tokenized batch holding "input_ids" and "attention_mask"."""
        outputs = self.model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
        return self.mean_pooling(outputs, inputs["attention_mask"])

    def _embed_field(self, batch, field):
        """Embed one of the "query" / "positive" / "negative" entries of a batch."""
        # squeeze(1) drops a singleton dimension at position 1; presumably the
        # tensors arrive as (batch, 1, seq_len) because each sample is
        # tokenized on its own with return_tensors="pt" — verify against the dataset.
        return self({key: val.squeeze(1) for key, val in batch[field].items()})

    def _triplet_loss(self, batch):
        """Compute the triplet margin loss for one batch of triplets."""
        query_emb = self._embed_field(batch, 'query')
        pos_emb = self._embed_field(batch, 'positive')
        neg_emb = self._embed_field(batch, 'negative')
        return self.loss_fn(query_emb, pos_emb, neg_emb)

    def training_step(self, batch, batch_idx):
        loss = self._triplet_loss(batch)
        self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.95)  # Decays LR over time
        # Lightning steps schedulers once per *epoch* unless told otherwise.
        # step_size=1000 is meant in optimizer steps, so request per-step
        # scheduling; otherwise the LR would never decay in a short run.
        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
        }

    def validation_step(self, batch, batch_idx):
        loss = self._triplet_loss(batch)
        self.log("val_loss", loss, on_epoch=True, prog_bar=True)
        return loss

In [14]:
def train_biencoder(dataset, batch_size=16, epochs=3, val_split=0.2):
    """Split `dataset` into train/validation parts and fit a fresh BiEncoder.

    Args:
        dataset: Dataset yielding triplet dicts with "query", "positive" and
            "negative" entries.
        batch_size: Batch size used by both data loaders.
        epochs: Maximum number of training epochs.
        val_split: Fraction of the dataset held out for validation; must be
            in the range [0, 1).

    Returns:
        The trained BiEncoder instance.

    Raises:
        ValueError: If `val_split` is outside [0, 1).
    """
    if not 0 <= val_split < 1:
        raise ValueError(f"val_split must be in [0, 1), got {val_split}")

    dataset_size = len(dataset)
    val_size = int(val_split * dataset_size)
    train_size = dataset_size - val_size

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    # Skip validation entirely when the split is empty (tiny dataset or
    # val_split=0) instead of handing the trainer an empty loader.
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False) if val_size > 0 else None

    model = BiEncoder()
    wandb_logger = WandbLogger(project="House-Transcript-Vector-Search")
    trainer = L.Trainer(max_epochs=epochs, logger=wandb_logger, accelerator="gpu" if torch.cuda.is_available() else "cpu")
    trainer.fit(model, train_loader, val_loader)
    return model

In [23]:
# Example usage:
# Reuse the directory returned by the download cell instead of a hard-coded
# ".../versions/2" path, keep only CSV files, and sort them so the dataset
# order does not depend on the arbitrary order of os.listdir().
base_path = path
csv_files = sorted(
    os.path.join(base_path, f) for f in os.listdir(base_path) if f.endswith(".csv")
)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
dataset = HouseTranscriptDataset(csv_files, tokenizer)
model = train_biencoder(dataset)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/loggers/wandb.py:397: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/callbacks/model_checkpoint.py:654: Checkpoint directory ./House-Transcript-Vector-Search/j9mom01y/checkpoints exists and is not empty.
INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name    | Type              | Params | Mode 
------------------------------------------------------
0 | model   | BertModel         | 22.7 M | eval 
1 

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.
/usr/local/lib/python3.11/dist-packages/lightning/pytorch/trainer/connectors/data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=11` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=3` reached.


In [19]:
# Fall back to CPU when no GPU is present, and map the serialized weights onto
# the chosen device so a CUDA-saved archive still loads on a CPU runtime.
# NOTE(review): no visible cell writes 'model.pt'; it must already exist on disk.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = torch.jit.load('model.pt', map_location=device)

In [23]:
# Reuse the directory returned by the download cell instead of a hard-coded
# ".../versions/2" path; keep only CSV files and sort them so the order of the
# encoded corpus is reproducible across runs.
base_path = path
csv_files = sorted(
    os.path.join(base_path, f) for f in os.listdir(base_path) if f.endswith(".csv")
)
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model.to(device)

def encode_texts(texts, model, batch_size=32, text_tokenizer=None, target_device=None, max_length=384):
    """Encodes a list of texts into embeddings using the trained bi-encoder.

    Args:
        texts: List of strings to embed.
        model: Callable module that maps a tokenized batch to a 2-D tensor of
            embeddings (one row per text).
        batch_size: Number of texts tokenized and embedded per forward pass.
        text_tokenizer: Tokenizer to use; defaults to the notebook-level
            `tokenizer`.
        target_device: Device the tokenized batch is moved to; defaults to
            the notebook-level `device`.
        max_length: Truncation length in tokens. Defaults to 384, the same
            value the training dataset uses.

    Returns:
        A numpy array with one embedding row per input text. For empty input
        an array of shape (0, 0) is returned.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer
    dev = device if target_device is None else target_device

    model.eval()
    if len(texts) == 0:
        # np.vstack raises on an empty sequence; return an empty result instead.
        return np.empty((0, 0), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            batch_texts = texts[i : i + batch_size]
            inputs = tok(batch_texts, padding=True, truncation=True, max_length=max_length, return_tensors="pt")
            inputs = inputs.to(dev)
            output = model(inputs)
            # Convert explicitly rather than relying on numpy to coerce tensors.
            embeddings.append(output.detach().cpu().numpy())

    return np.vstack(embeddings)

# Load transcript data
transcripts = []

for file in csv_files:
    df = pd.read_csv(file, encoding="latin-1")
    # Keep only House's lines and drop missing values, which the tokenizer
    # cannot encode.
    # NOTE(review): the training dataset strips bracketed `[...]` spans from
    # lines before tokenizing, while the raw lines are embedded here — confirm
    # this difference is intended.
    house_lines = df.loc[df["name"] == "House", "line"].dropna()
    transcripts.extend(house_lines.tolist())

# Encode all transcript phrases
transcript_embeddings = encode_texts(transcripts, model)
# Row i of the embeddings must correspond to text i in the saved CSV.
assert len(transcript_embeddings) == len(transcripts), "embeddings/text row mismatch"

# Save embeddings and transcripts
np.save("transcript_embeddings.npy", transcript_embeddings)
# NOTE(review): the column is named "list" (possibly meant to be "line"); it is
# kept as-is because it defines the schema of the written file.
pd.DataFrame(transcripts, columns=["list"]).to_csv("transcript_texts.csv", index=False)