In [None]:
# Smoke test: prints a marker so it is obvious the kernel is up and executing cells.
print("Ready!")

In [None]:
# uninstall all modules if they exist

!pip uninstall -y torch torchvision transformers accelerate huggingface_hub requests tqdm protobuf \
    langchain sentencepiece safetensors xformers

In [None]:
# Install all compatible versions
# PyTorch + TorchVision (CUDA 11.8 compatible with A40)
!pip install --no-deps torch==2.1.0+cu118 torchvision==0.16.0 --index-url https://download.pytorch.org/whl/cu118

# Transformers, Accelerate, HuggingFace Hub
!pip install transformers==4.51.3
!pip install accelerate==1.7.0
!pip install huggingface_hub==0.32.4

# Utility libraries
!pip install requests==2.31.0
!pip install tqdm==4.67.1
!pip install protobuf==3.20.3

# LangChain (optional, only if you need it)
!pip install langchain==0.1.14

# SentencePiece and Safetensors
!pip install sentencepiece==0.2.0
!pip install safetensors==0.5.3

# xFormers (must match CUDA version & torch)
!pip install xformers==0.0.29.post3

In [2]:
#Importing all required packages
import pickle
import json
from tqdm import tqdm
import time
import os
from huggingface_hub import login
from langchain.schema import Document
import torch

# Use a pipeline as a high-level helper
from transformers import pipeline

# Importing packages required to download model
import requests
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

#For offloading model to GPU
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig

In [4]:
ls

Untitled.ipynb  all_marketing_material.pkl


In [5]:
# Read the pickled collection of marketing docs from the working directory.
# NOTE: unpickling executes whatever code the file embeds, so only load
# pickle files that come from a trusted source.
pickle_file_path = 'all_marketing_material.pkl'

with open(pickle_file_path, mode='rb') as pickle_fh:
    all_marketing_pages = pickle.load(pickle_fh)

In [7]:
# Log into Hugging Face: the Gemma weights are gated, so an authenticated
# session is required before they can be downloaded.
#
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook. When the variable is unset, `token` is None and
# `login` falls back to its interactive prompt.
#
# SECURITY: any token that was ever hardcoded in this cell must be treated as
# leaked and revoked in the Hugging Face account settings.
secret_value_0 = os.environ.get("HF_TOKEN")
login(token=secret_value_0)

In [8]:
# Load the gemma model and its processor from huggingface
model_id = "google/gemma-3-4b-it"

# Download (or reuse the locally cached) weights so the model can be queried.
model = Gemma3ForConditionalGeneration.from_pretrained(
    model_id,
    device_map="auto",             # let accelerate place layers across GPU & CPU
    offload_folder="offload_dir",  # weights that fit nowhere else spill to disk (temporary)
    # Without an explicit dtype the weights are materialised in float32, which
    # doubles memory use compared with the 16-bit checkpoint shards and slows
    # generation. bfloat16 keeps the footprint at the checkpoint size.
    torch_dtype=torch.bfloat16,
)
processor = AutoProcessor.from_pretrained(model_id)

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

In [9]:
# Configuring getting response from llm
def local_llm(prompt: str, processor, model) -> str:
    """
    Send a single-turn prompt to a HuggingFace Gemma model and return its reply.

    The prompt is wrapped in a fixed system message plus one user message,
    rendered through the processor's chat template, moved to the model's
    device and passed to ``model.generate`` with sampling enabled.

    Args:
        prompt: Text placed in the user turn of the conversation.
        processor: Object providing ``apply_chat_template`` and ``batch_decode``.
        model: Object providing ``device`` and ``generate``.

    Returns:
        Only the newly generated text, stripped of surrounding whitespace.
        If any step raises, the error is printed and an empty string is
        returned (best-effort behaviour, so a long batch run is not aborted).
    """
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}]
        },
        {
            "role": "user",
            "content": [{"type": "text", "text": prompt}]
        }
    ]

    try:
        # Step 1: Tokenize the messages
        inputs = processor.apply_chat_template(
            messages,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            add_generation_prompt=True
        )

        # Step 2: Move inputs to the model's device (GPU or CPU)
        device = model.device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Remember how many tokens belong to the prompt: generate() returns the
        # prompt tokens followed by the completion, and only the completion
        # should be handed back to the caller.
        prompt_length = inputs["input_ids"].shape[-1]

        # Step 3: Generate output with inference mode
        with torch.inference_mode():
            outputs = model.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=True,
                temperature=0.7
            )

        # Step 4: Drop the echoed prompt and decode only the new tokens
        generated_tokens = outputs[:, prompt_length:]
        decoded_output = processor.batch_decode(
            generated_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )[0]

        return decoded_output.strip()

    except Exception as e:
        print(f"Error in HuggingFace LLM call: {e}")
        return ""

In [10]:
def process_chunk_to_alpaca(doc: Document, processor, model) -> dict:
    """
    Build one Alpaca-style record (instruction / input / output) for a document.

    The instruction embeds the document's ``source`` metadata value (or
    "Unknown Name" when that key is absent). The instruction and the stripped
    page content are joined with a blank line and sent to ``local_llm``.

    Args:
        doc: Document exposing ``page_content`` and a ``metadata`` mapping.
        processor: Forwarded unchanged to ``local_llm``.
        model: Forwarded unchanged to ``local_llm``.

    Returns:
        A dict with the keys "instruction", "input", "output" and "metadata".
    """
    source_name = doc.metadata.get("source", "Unknown Name")
    page_text = doc.page_content.strip()

    # Task description shared by the prompt and the stored "instruction" field.
    instruction = f"""
You are a business assistant analyzing raw business content from the following source:
SOURCE NAME: {source_name}

Your task is to extract the following from the provided transcript:
1. Frameworks (e.g., naming, advertising, validation models).
2. Bullet points for key ideas or steps.
3. Q&A (any implied or stated questions with answers).
4. Case Examples or stories.
5. Copywriting formulas (AIDA, PAS, etc.)
6. Classify this content into high-level topics: e.g., Naming, Ads, Psychology, Copywriting.
7. Convert suitable content into a step-by-step guide.

Return your output in clearly labeled sections, and only include sections with relevant content. Do not include a preamble.
""".strip()

    # The model sees the instruction followed by the document text.
    full_prompt = f"{instruction}\n\n{page_text}"
    generated = local_llm(full_prompt, processor, model)

    record = {
        "instruction": instruction,
        "input": page_text,
        "output": generated,
        "metadata": doc.metadata,
    }
    return record

In [None]:
# Timing trial on a small slice: the four documents at indices 27-30.
# Results are appended one by one so that anything already processed is kept
# if the cell is interrupted.
sample_docs = all_marketing_pages[27:31]

alpaca_data = []
for doc in tqdm(sample_docs, desc="Processing documents", leave=True):
    alpaca_data.append(process_chunk_to_alpaca(doc, processor, model))

Processing documents:  50%|█████     | 2/4 [01:10<01:12, 36.02s/it]

In [None]:
# Full run: build one Alpaca-style record for every marketing document.
# The list is filled incrementally so partial results survive an interrupt.
alpaca_data = []

progress = tqdm(all_marketing_pages, desc="Processing documents", leave=True)
for doc in progress:
    alpaca_data.append(process_chunk_to_alpaca(doc, processor, model))