In [None]:
# --- Install Required Libraries ---
!pip install -q bitsandbytes accelerate transformers datasets peft pymupdf



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m45.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m117.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m125.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# --- Mount Google Drive ---
from google.colab import drive

# Mount Drive at the standard Colab location. The relative "drive/MyDrive/..."
# paths used later in this notebook resolve against it when the working
# directory is /content (presumably the Colab default — confirm if changed),
# and the absolute "/content/drive/MyDrive/..." save paths point at it directly.
drive.mount("/content/drive")

# --- Imports ---
import glob
import os
import re

import fitz  # PyMuPDF
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
from transformers.trainer_utils import get_last_checkpoint



In [None]:
# --- Custom Metadata Extractors ---
def extract_pdf_with_metadata(pdf_path):
    """Extract the full text and basic metadata from a PDF file.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        A dict with the keys "text", "source", "author", "title" and "date",
        or None when the file cannot be read (the error is printed).
    """
    try:
        # The context manager closes the document even if text extraction raises.
        with fitz.open(pdf_path) as doc:
            # metadata may be None (e.g. for documents that still need a password).
            metadata = doc.metadata or {}
            text = "".join(page.get_text() for page in doc)

        source = os.path.basename(pdf_path)
        # Metadata keys can be present with an empty-string or None value, in
        # which case `dict.get(key, default)` would never use the default.
        # `or` falls back for both missing and empty values.
        return {
            "text": text.strip(),
            "source": source,
            "author": metadata.get("author") or "Unknown",
            "title": metadata.get("title") or source,
            "date": metadata.get("creationDate") or "Unknown",
        }
    except Exception as e:
        # Best-effort loader: report the failure and let the caller skip the file.
        print(f"Error reading {pdf_path}: {e}")
        return None

def extract_txt_with_custom_metadata(txt_path, marker="2:3"):
    """Read a UTF-8 text file and pull title/author from lines after a marker line.

    The first line containing ``marker`` is treated as a header. The line after
    it is the title (surrounding straight/curly double quotes removed), the next
    one is the author byline (a leading "By " removed), and everything after
    that is the body. If no marker line is found, or fewer than two lines follow
    it, the whole file is used as the body with placeholder title/author.

    Args:
        txt_path: Path of the text file to read.
        marker: Substring that identifies the header line. Defaults to "2:3".

    Returns:
        A dict with the keys "text", "source", "author", "title" and "date"
        ("date" is always "Unknown"), or None when the file cannot be read
        (the error is printed).
    """
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            lines = f.readlines()

        idx = next((i for i, line in enumerate(lines) if marker in line), None)
        if idx is not None and idx + 2 < len(lines):
            title = lines[idx + 1].strip().strip("\"“”")
            author = lines[idx + 2].strip()
            # Remove only the leading byline prefix. Replacing every "By " would
            # also delete the sequence from inside the name/title text itself.
            if author.startswith("By "):
                author = author[len("By "):].strip()
            text = "".join(lines[idx + 3:])
        else:
            title, author = "Untitled", "Unknown"
            text = "".join(lines)

        return {
            "text": text.strip(),
            "source": os.path.basename(txt_path),
            "author": author,
            "title": title,
            "date": "Unknown"
        }

    except Exception as e:
        # Best-effort loader: report the failure and let the caller skip the file.
        print(f"Error reading {txt_path}: {e}")
        return None

# --- Load Documents with Metadata ---
# Directory scanned for *.pdf and *.txt source files. The path is relative, so
# it is resolved against the current working directory.
folder_path = "drive/MyDrive/gc_talks/"
# Collects one metadata dict per successfully parsed file.
documents = []



In [None]:
# Start from an empty list so re-running this cell does not append the same
# files a second time.
documents = []

# One (glob pattern, extractor) pair per supported file type.
document_loaders = (
    ("*.pdf", extract_pdf_with_metadata),
    ("*.txt", extract_txt_with_custom_metadata),
)

for pattern, extractor in document_loaders:
    # glob returns matches in arbitrary order; sorting makes the document order
    # (and therefore the chunk order downstream) reproducible between runs.
    for filepath in sorted(glob.glob(os.path.join(folder_path, pattern))):
        doc = extractor(filepath)
        # Extractors return None for unreadable files; skip those.
        if doc:
            documents.append(doc)

print(f"Loaded {len(documents)} documents.")

# --- Clean, Chunk, and Attach Metadata ---
def clean_text(text):
    """Collapse each run of whitespace into a single space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def chunk_text(text, max_chunk_size=2000):
    """Split ``text`` into consecutive slices of at most ``max_chunk_size`` characters.

    Slices are cut purely by character offset; the last one may be shorter.
    An empty ``text`` yields an empty list.
    """
    pieces = []
    for start in range(0, len(text), max_chunk_size):
        stop = start + max_chunk_size
        pieces.append(text[start:stop])
    return pieces

# Build one instruction-style training example per text chunk; each example
# carries the author, title and date of the document the chunk came from.
all_chunks = []
for record in documents:
    author, title, date = record["author"], record["title"], record["date"]
    for piece in chunk_text(clean_text(record["text"])):
        prompt = f"""Instruction:
Write in the literary style of {author}, based on a document titled \"{title}\" written on {date}.

Response:
{piece}
"""
        all_chunks.append({"text": prompt})

# --- Create Dataset and Tokenize ---
# A fixed seed keeps the train/test partition identical across runtime
# restarts. Without it, every session that resumes training from a checkpoint
# would draw a different split, mixing held-out examples into training.
SPLIT_SEED = 42
dataset = Dataset.from_list(all_chunks).train_test_split(test_size=0.1, seed=SPLIT_SEED)

model_name = "google/gemma-7b"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

MAX_SEQ_LENGTH = 512
# Label value that the causal-LM cross-entropy loss skips.
IGNORE_INDEX = -100

def tokenize_function(examples):
    """Tokenize a batch of prompts and build loss labels that skip padding.

    Args:
        examples: Batch dict whose "text" entry is a list of prompt strings.

    Returns:
        The tokenizer output (fixed length MAX_SEQ_LENGTH) with an added
        "labels" entry: a copy of "input_ids" in which every padding position
        (attention_mask == 0) is replaced by IGNORE_INDEX.
    """
    # Append an explicit EOS so the end of each example is still a real,
    # attended token once padding is excluded from the loss. It may be cut off
    # when the example is truncated to MAX_SEQ_LENGTH.
    texts = [text + tokenizer.eos_token for text in examples["text"]]
    outputs = tokenizer(texts, truncation=True, padding="max_length", max_length=MAX_SEQ_LENGTH)
    # Mask by attention mask rather than by token id: the pad token is the EOS
    # token here, so masking by id would also hide the genuine EOS.
    outputs["labels"] = [
        [token_id if attended else IGNORE_INDEX for token_id, attended in zip(ids, mask)]
        for ids, mask in zip(outputs["input_ids"], outputs["attention_mask"])
    ]
    return outputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# --- Load Model and Apply LoRA ---
# Passing `load_in_4bit=True` straight to `from_pretrained` is deprecated; the
# 4-bit request goes through an explicit BitsAndBytesConfig instead. All other
# quantization settings are left at the library defaults.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    trust_remote_code=True
)

# LoRA adapters are attached only to the attention query and value projections.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()



Loaded 6065 documents.


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Map:   0%|          | 0/22378 [00:00<?, ? examples/s]

Map:   0%|          | 0/2487 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

trainable params: 6,422,528 || all params: 8,544,103,424 || trainable%: 0.0752


In [None]:
# --- Training Configuration ---
# Effective batch size is 1 * 16 = 16 sequences per optimizer step.
training_args = TrainingArguments(
    output_dir="drive/MyDrive/gemma_lora_outputs/",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=3,
    learning_rate=2e-4,
    save_steps=50,
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_steps=50,
    fp16=True,
    optim="paged_adamw_8bit",
    report_to="none",
    # The PEFT wrapper hides the base model's forward signature, so Trainer
    # cannot infer the label column and warns that it falls back to an empty
    # list. Name it explicitly.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
)



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# --- Begin Training ---
# Resume from the newest checkpoint in the output directory when there is one,
# otherwise start a fresh run. `resume_from_checkpoint=True` raises when the
# directory holds no checkpoint (e.g. on the very first run), which forced
# manual toggling between two train() calls.
checkpoint_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

trainer.train(resume_from_checkpoint=last_checkpoint)



Step,Training Loss
3200,1.3091
3250,1.3536
3300,1.2851
3350,1.3106
3400,1.2895
3450,1.3343
3500,1.302
3550,1.3072
3600,1.3347
3650,1.3147


TrainOutput(global_step=4194, training_loss=0.3256199881754208, metrics={'train_runtime': 17893.0258, 'train_samples_per_second': 3.752, 'train_steps_per_second': 0.234, 'total_flos': 1.5996700774335775e+18, 'train_loss': 0.3256199881754208, 'epoch': 2.999553132540888})

In [None]:
# --- Save Final Adapter and Tokenizer ---
# Persist the trained LoRA adapter weights and the tokenizer files to Drive.
adapter_dir = "/content/drive/MyDrive/gemma_lora_outputs/final_lora_adapter"
tokenizer_dir = "/content/drive/MyDrive/gemma_lora_outputs/final_tokenizer"

model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(tokenizer_dir)


('/content/drive/MyDrive/gemma_lora_outputs/final_tokenizer/tokenizer_config.json',
 '/content/drive/MyDrive/gemma_lora_outputs/final_tokenizer/special_tokens_map.json',
 '/content/drive/MyDrive/gemma_lora_outputs/final_tokenizer/tokenizer.model',
 '/content/drive/MyDrive/gemma_lora_outputs/final_tokenizer/added_tokens.json',
 '/content/drive/MyDrive/gemma_lora_outputs/final_tokenizer/tokenizer.json')