In [None]:
#Importing all required packages
import pickle
import json
from tqdm import tqdm
import time
import os
from huggingface_hub import login
from langchain.schema import Document
import torch

# Use a pipeline as a high-level helper
from transformers import pipeline

# CUDA memory allocation will fail without this
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Read the Hugging Face token saved in Kaggle Secrets (used below to log in to Hugging Face)
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")

# Log in to Hugging Face programmatically, since the CLI login is not working for us in Kaggle
login(token=secret_value_0)

#Importing packages required to download model
import requests
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

#For offloading model to GPU
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig

In [2]:
# Load the pre-built collection of marketing documents from the attached Kaggle dataset.
# NOTE: pickle.load can execute arbitrary code while deserializing; only unpickle files you trust.
# Presumably a list of langchain Document objects (it is iterated and read via
# .metadata / .page_content further down) -- TODO confirm against the dataset.
with open("/kaggle/input/all-marketing-material/all_marketing_material.pkl", "rb") as f:
    all_marketing_pages = pickle.load(f)

In [4]:
# Hugging Face Hub model ID passed to both from_pretrained calls below
model_id = "google/gemma-3-4b-it"

# Download the model (see the progress output below) and load it so it can be queried locally
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",  # Offloads intelligently between GPU & CPU
    offload_folder="offload_dir"  # Offload excess weights to disk (temporary)
)
# Processor for the same checkpoint; used later to apply the chat template and decode outputs
processor = AutoProcessor.from_pretrained(model_id)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [14]:
# Let's test if this works on GPU
def local_llm(prompt: str, processor, model, max_new_tokens: int = 1024, temperature: float = 0.7) -> str:
    """
    Send a prompt to a HuggingFace Gemma model and return only the generated reply.

    The prompt is wrapped in a two-message chat (a fixed system message plus the
    user prompt), tokenized with the processor's chat template, moved to the
    model's device and passed to ``model.generate`` with sampling enabled.

    Args:
        prompt: User text to send to the model.
        processor: Object providing ``apply_chat_template`` and ``batch_decode``.
        model: Object providing ``device`` and ``generate``.
        max_new_tokens: Upper bound on the number of generated tokens (default 1024).
        temperature: Sampling temperature (default 0.7).

    Returns:
        The decoded completion with surrounding whitespace stripped. The prompt
        itself is NOT included. On any exception the error is printed and an
        empty string is returned (best-effort behavior callers must check for).
    """
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": prompt}]
        }
    ]

    try:
        # Step 1: Tokenize the messages
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            add_generation_prompt=True
        )

        # Step 2: Move inputs to the model's device (GPU or CPU)
        device = model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Remember how long the prompt is so it can be cut off the output below.
        prompt_len = inputs["input_ids"].shape[-1]

        # Step 3: Generate output with inference mode
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=temperature
            )

        # Step 4: Decode ONLY the newly generated tokens. generate() returns the
        # prompt tokens followed by the completion; decoding the full sequence
        # would echo the system/user prompt back into the "response".
        completion_ids = outputs[:, prompt_len:]
        decoded_output = processor.batch_decode(
            completion_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )[0]

        return decoded_output.strip()

    except Exception as e:
        print(f"Error in HuggingFace LLM call: {e}")
        return ""

In [15]:
def process_chunk_to_alpaca(doc: Document, processor, model) -> dict:
    """
    Turn one document into an Alpaca-style record by querying the local LLM.

    The instruction text embeds the document's ``source`` metadata (falling back
    to "Unknown Name"); the prompt sent to ``local_llm`` is that instruction,
    a blank line, and the stripped page content.

    Returns:
        dict with keys ``instruction``, ``input`` (stripped page content),
        ``output`` (whatever ``local_llm`` returned) and ``metadata``
        (the document's metadata, passed through as-is).
    """
    source_name = doc.metadata.get("source", "Unknown Name")

    instruction_with_metadata = f"""
You are a business assistant analyzing raw business content from the following source:
SOURCE NAME: {source_name}

Your task is to extract the following from the provided transcript:
1. Frameworks (e.g., naming, advertising, validation models).
2. Bullet points for key ideas or steps.
3. Q&A (any implied or stated questions with answers).
4. Case Examples or stories.
5. Copywriting formulas (AIDA, PAS, etc.)
6. Classify this content into high-level topics: e.g., Naming, Ads, Psychology, Copywriting.
7. Convert suitable content into a step-by-step guide.

Return your output in clearly labeled sections, and only include sections with relevant content. Do not include a preamble.
""".strip()

    # Strip once and reuse for both the prompt and the stored "input" field.
    chunk_text = doc.page_content.strip()

    llm_output = local_llm(
        "\n\n".join((instruction_with_metadata, chunk_text)),
        processor,
        model,
    )

    return dict(
        instruction=instruction_with_metadata,
        input=chunk_text,
        output=llm_output,
        metadata=doc.metadata,
    )


In [None]:
# Let's write state-managed (resumable) code that processes all docs with tqdm

In [32]:
PROCESSED_FILE = "alpaca_processed.jsonl"
FAILED_FILE = "alpaca_failed.jsonl"
MAX_RETRIES = 3  # total attempts per document
RETRY_DELAY = 2  # seconds between retries


def _build_doc_id(metadata, content):
    """
    Build the resume key for a document from its metadata and text.

    Single source of truth for ID construction, shared by ``get_doc_id`` (live
    documents) and ``load_jsonl_ids`` (records read back from disk) so the two
    can never drift apart.

    Returns ``"{source}_page_{page}"`` when the metadata has a ``page`` value,
    otherwise ``"{source}_hash_{sha1-of-stripped-content}"``.
    """
    import hashlib  # local import: the notebook's import cell does not provide it

    metadata = metadata or {}
    source = metadata.get("source", "")
    page = metadata.get("page")

    if page is None:
        # Built-in hash() is randomized per process for str, so it cannot be
        # used for IDs that must survive a kernel restart; use a real digest.
        # The content is stripped so the key can be rebuilt from the stripped
        # "input" stored in processed records.
        digest = hashlib.sha1((content or "").strip().encode("utf-8")).hexdigest()
        return f"{source}_hash_{digest}"

    # NOTE(review): assumes at most one document per (source, page) -- confirm.
    return f"{source}_page_{page}"


def load_jsonl_ids(filename):
    """
    Return the set of document IDs recorded in a JSONL state file.

    A missing file yields an empty set. Each record's stored ``doc_id`` is used
    when present; older records without one get their ID rebuilt from
    ``metadata`` and ``input``. Blank lines and lines that are not valid JSON
    (e.g. a line truncated by an interrupted write) are skipped.
    """
    if not os.path.exists(filename):
        return set()

    ids = set()
    with open(filename, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            doc_id = record.get("doc_id")
            if doc_id is None:
                doc_id = _build_doc_id(record.get("metadata"), record.get("input", ""))
            ids.add(doc_id)
    return ids


def save_jsonl(filename, data):
    """
    Append ``data`` as one JSON line to ``filename``.

    The file is written as UTF-8 because ``ensure_ascii=False`` emits raw
    non-ASCII characters. Values that are not JSON-serializable are stored via
    ``str()`` so unusual metadata cannot abort the run.
    """
    with open(filename, "a", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False, default=str) + "\n")


def get_doc_id(doc: "Document"):
    """
    Return a stable identifier for ``doc`` used to skip already-processed work.

    Uses ``source`` and ``page`` from the metadata, falling back to a digest of
    the page content when no page is present.
    """
    return _build_doc_id(doc.metadata, doc.page_content)


def process_documents_with_retries(pages, processor, model):
    """
    Convert every document in ``pages`` to an Alpaca record, resumably.

    Documents whose ID is already present in ``PROCESSED_FILE`` are skipped.
    Each remaining document gets up to ``MAX_RETRIES`` attempts, waiting
    ``RETRY_DELAY`` seconds between them. An attempt counts as failed when it
    raises OR returns an empty ``output`` (``local_llm`` reports errors by
    returning an empty string). Successful records are appended to
    ``PROCESSED_FILE``; documents that exhaust their attempts are appended to
    ``FAILED_FILE`` and will be tried again on the next run.
    """
    processed_ids = load_jsonl_ids(PROCESSED_FILE)

    for doc in tqdm(pages, desc="Processing documents"):
        doc_id = get_doc_id(doc)

        if doc_id in processed_ids:
            continue

        last_error = None
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                alpaca_entry = process_chunk_to_alpaca(doc, processor, model)
                if not alpaca_entry.get("output"):
                    # Route silent LLM failures into the retry/failed path
                    # instead of recording an empty answer as a success.
                    raise RuntimeError("LLM returned an empty response")
                # Store the ID with the record so resuming never has to guess it.
                save_jsonl(PROCESSED_FILE, {**alpaca_entry, "doc_id": doc_id})
                last_error = None
                break
            except Exception as e:
                last_error = e
                if attempt < MAX_RETRIES:
                    time.sleep(RETRY_DELAY)

        if last_error is not None:
            error_entry = {
                "doc_id": doc_id,
                "error": str(last_error),
                "metadata": doc.metadata,
                "input": doc.page_content[:500]  # preview of failed input
            }
            save_jsonl(FAILED_FILE, error_entry)
        time.sleep(1)

In [None]:
# Run the batch over every loaded document. The recorded progress output below
# shows roughly 59 s per document (about 108 h estimated for all 6608 documents).
process_documents_with_retries(all_marketing_pages, processor, model)

Processing documents:   0%|          | 7/6608 [07:03<108:25:02, 59.13s/it]