In [3]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='meetingbank_final_llama4.csv')['train']


In [4]:
system_message = """You are an expert meeting assistant. Given the transcript of a meeting, your task is to read and analyze it, then generate a structured summary that captures the essence of the discussion.

Your summary must be correct, clear,thorough, and categorized into the following sections:

Discussion Points/Agenda : A brief overview of the main topics discussed during the meeting.

Decisions Made: Any clear conclusions, approvals, or agreed-upon actions.

Action Items: A list of tasks assigned to individuals or teams, with deadlines if mentioned. 

"""

def create_conversation(sample):
    return {
        "messages": [
            {"role": "system", "content": system_message},
            {"role": "user", "content": sample["transcript"]},
            {"role": "assistant", "content": sample["model_answer"]}
        ]
    }


In [5]:
dataset = dataset.map(create_conversation, remove_columns=dataset.column_names)
dataset = dataset.train_test_split(test_size=0.2, seed=43)  # 80% train, 20% test


In [4]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, BitsAndBytesConfig

# Hugging Face model id
model_id = "google/gemma-3-1b-it" # or `google/gemma-3-4b-pt`, `google/gemma-3-12b-pt`, `google/gemma-3-27b-pt`

# Select model class based on id
if model_id == "google/gemma-3-1b-it":
    model_class = AutoModelForCausalLM
else:
    model_class = AutoModelForImageTextToText

# Check if GPU benefits from bfloat16
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16

# Define model init arguments
model_kwargs = dict(
    attn_implementation="eager", # Use "flash_attention_2" when running on Ampere or newer GPU
    torch_dtype=torch_dtype, # What torch dtype to use, defaults to auto
    device_map="auto", # Let torch decide how to load the model
)

# BitsAndBytesConfig: Enables 4-bit quantization to reduce model size/memory usage
# model_kwargs["quantization_config"] = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type='nf4',
#     bnb_4bit_compute_dtype=model_kwargs['torch_dtype'],
#     bnb_4bit_quant_storage=model_kwargs['torch_dtype'],
# )

# Load model and tokenizer
model = model_class.from_pretrained(model_id, **model_kwargs)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it") # Load the Instruction Tokenizer to use the official Gemma template

In [5]:
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM",
    modules_to_save=["lm_head", "embed_tokens"] # make sure to save the lm_head and embed_tokens as you train the special tokens
)

In [6]:
from tqdm.notebook import tqdm

def count_total_tokens(messages, tokenizer):
    total = 0
    for msg in messages:
        content = msg["content"]
        # Tokenize the content (text only — no special handling for role)
        tokens = tokenizer.encode(content, add_special_tokens=False)
        total += len(tokens)
    return total

# Get all token counts
# all_token_lengths = [count_total_tokens(sample["messages"], tokenizer) for sample in tqdm(dataset["train"])]

# # Stats
# max_tokens = max(all_token_lengths)
# avg_tokens = sum(all_token_lengths) / len(all_token_lengths)
# percentile_95 = sorted(all_token_lengths)[int(0.95 * len(all_token_lengths))]

# print(f"Max tokens (input + output): {max_tokens}")
# print(f"Average tokens: {avg_tokens:.2f}")
# print(f"95th percentile tokens: {percentile_95}")


In [7]:
def filter_by_token_length(example):
    total_tokens = count_total_tokens(example["messages"], tokenizer)
    return total_tokens <= 16000

filtered_dataset = dataset.filter(filter_by_token_length)
print(f"Original train size: {len(dataset['train'])}")
print(f"Filtered train size: {len(filtered_dataset['train'])}")
print(f"Original test size: {len(dataset['test'])}")
print(f"Filtered test size: {len(filtered_dataset['test'])}")


Original train size: 4135
Filtered train size: 3933
Original test size: 1034
Filtered test size: 988


In [8]:
from trl import SFTConfig

args = SFTConfig(
    output_dir="gemma-meetingbank-it",         # directory to save and repository id
    max_seq_length=16312,                     # max sequence length for model and packing of the dataset
    packing=True,                           # Groups multiple samples in the dataset into a single sequence
    num_train_epochs=2,                     # number of training epochs
    per_device_train_batch_size=1,          # batch size per device during training
    gradient_accumulation_steps=4,          # number of steps before performing a backward/update pass
    gradient_checkpointing=True,            # use gradient checkpointing to save memory
    optim="adamw_torch_fused",              # use fused adamw optimizer
    logging_steps=10,                       # log every 10 steps
    save_strategy="epoch",                  # save checkpoint every epoch
    learning_rate=5e-5,                     # learning rate, based on QLoRA paper
    fp16=True if torch_dtype == torch.float16 else False,   # use float16 precision
    bf16=True if torch_dtype == torch.bfloat16 else False,   # use bfloat16 precision
    max_grad_norm=0.3,                      # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,                      # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",           # use constant learning rate scheduler
    push_to_hub=True,                       # push model to hub
    report_to="tensorboard",                # report metrics to tensorboard
    dataset_kwargs={
        "add_special_tokens": False, # We template with special tokens
        "append_concat_token": True, # Add EOS token as separator token between examples
    }
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [9]:
from trl import SFTTrainer

# Create Trainer object
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=filtered_dataset["train"],
    peft_config=peft_config,
    processing_class=tokenizer
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Packing train dataset:   0%|          | 0/3933 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [10]:
# Start training, the model will be automatically saved to the Hub and the output directory
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,3.0322
20,2.8746
30,2.8082
40,2.7624
50,2.6981
60,2.7037
70,2.6634
80,2.6698
90,2.6473
100,2.6323


TrainOutput(global_step=340, training_loss=2.6073169652153463, metrics={'train_runtime': 5945.1701, 'train_samples_per_second': 0.229, 'train_steps_per_second': 0.057, 'total_flos': 1.3417373276170906e+17, 'train_loss': 2.6073169652153463})

In [15]:
# free the memory again
del model
del trainer
torch.cuda.empty_cache()

In [None]:
from huggingface_hub import login

# Replace this with your Hugging Face token
hf_token = ""

# Log in
login(token=hf_token)

In [2]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForImageTextToText, BitsAndBytesConfig

model_id = "gemma-meetingbank-it"
# Check if GPU benefits from bfloat16
if torch.cuda.get_device_capability()[0] >= 8:
    torch_dtype = torch.bfloat16
else:
    torch_dtype = torch.float16
# Load Model with PEFT adapter
model = AutoModelForCausalLM.from_pretrained(
  model_id,
  device_map="auto",
  torch_dtype=torch_dtype,
  attn_implementation="eager",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [3]:
model.push_to_hub("gemma-meetingbank-it", use_auth_token=True)

# Optionally, you can also push the tokenizer if required
tokenizer.push_to_hub("gemma-meetingbank-it", use_auth_token=True)



adapter_model.safetensors:   0%|          | 0.00/1.23G [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/SleepyGorilla/gemma-meetingbank-it/commit/a39333da143093998a97223a131834d43fc4ed3a', commit_message='Upload tokenizer', commit_description='', oid='a39333da143093998a97223a131834d43fc4ed3a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/SleepyGorilla/gemma-meetingbank-it', endpoint='https://huggingface.co', repo_type='model', repo_id='SleepyGorilla/gemma-meetingbank-it'), pr_revision=None, pr_num=None)

In [None]:
from tqdm.notebook import tqdm
# Load inference pipeline
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Stop tokens
stop_token_ids = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<end_of_turn>")]

# Storage
results = []

# Loop through test dataset
for sample in tqdm(filtered_dataset["test"], desc="Generating predictions"):
    transcript = sample["messages"][1]["content"]
    ground_truth = sample["messages"][2]["content"]

    # Prepare prompt using chat template
    prompt = pipe.tokenizer.apply_chat_template(sample["messages"][:2], tokenize=False, add_generation_prompt=True)

    # Generate summary
    output = pipe(
        prompt,
        max_new_tokens=16384,
        do_sample=False,
        temperature=0.1,
        top_k=50,
        top_p=0.1,
        eos_token_id=stop_token_ids,
        disable_compile=True
    )

    # output = pipe(
    #     prompt,
    #     max_new_tokens=16384,
    #     do_sample=False,
    #     temperature=0.3,
    #     top_k=50,
    #     top_p=0.2,
    #     eos_token_id=stop_token_ids,
    #     disable_compile=True
    # )


    # Extract only the new generated part
    predicted_summary = output[0]["generated_text"][len(prompt):].strip()
    print(predicted_summary)
    print("Llama 4",ground_truth)
    # Append result
    results.append({
        "transcript": transcript,
        "ground_truth": ground_truth,
        "predicted_summary": predicted_summary
    })
    break

# Save to CSV
df = pd.DataFrame(results)
df.to_csv("gemma3_meeting_summary_predictions.csv", index=False)


Device set to use cuda:0


Generating predictions:   0%|          | 0/988 [00:00<?, ?it/s]

