In [1]:
!pip install pytesseract
!pip install pdf2image
!pip install wandb

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [2]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mjacob-lan[0m ([33mjacob-lan-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
import os
import pytesseract
import random
from pdf2image import convert_from_path
from PIL import Image
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import Dataset

In [4]:
# Paths
REPORTS_DIR = "reports"
TEXTS_DIR = "extracted_texts"
# TEXTS_DIR = "extracted_texts1"
MODEL_DIR = "CAMeL-Lab/bert-base-arabic-camelbert-mix"

In [5]:
# Ensure text output folder exists
os.makedirs(TEXTS_DIR, exist_ok=True)

In [6]:
# 1. OCR Arabic PDFs
def ocr_pdf(pdf_path):
    images = convert_from_path(pdf_path)
    text = ""
    for img in images:
        text += pytesseract.image_to_string(img, lang='ara') + "\n"
    return text

In [7]:
# 2.1 Extract and clean all PDFs
def extract_all_pdfs():
    texts = []
    for root, _, files in os.walk(REPORTS_DIR):
        for filename in files:
            if filename.endswith(".pdf"):
                pdf_path = os.path.join(root, filename)
                print(f"🔍 Processing {pdf_path}...")
                raw_text = ocr_pdf(pdf_path)
                clean_text = raw_text.replace("\n", " ").strip()

                if len(clean_text) > 100:
                    texts.append({"text": clean_text})
                    print(f"Extracted {len(clean_text)} characters.")
                else:
                    print(f"Skipped {filename} (too short after OCR)")

                # Optional: save cleaned text
                relative_path = os.path.relpath(pdf_path, REPORTS_DIR)
                txt_filename = relative_path.replace(".pdf", ".txt").replace("/", "_")
                txt_path = os.path.join(TEXTS_DIR, txt_filename)
                with open(txt_path, "w", encoding="utf-8") as f:
                    f.write(clean_text)
    print(f"Total documents extracted: {len(texts)}")
    return texts

In [8]:
# 2.2 Load all saved text
def load_all_texts():
    texts = []
    for filename in os.listdir(TEXTS_DIR):
        if filename.endswith(".txt"):
            txt_path = os.path.join(TEXTS_DIR, filename)
            with open(txt_path, "r", encoding="utf-8") as f:
                text = f.read().strip()
                texts.append({"text": text})
    print(f"Loaded {len(texts)} documents from text files.")
    return texts

In [9]:
# Training set and test set
def split_dataset(texts, train_ratio=0.8, seed=42):
    random.seed(seed)
    random.shuffle(texts)
    split_idx = int(len(texts) * train_ratio)
    train_texts = texts[:split_idx]
    test_texts = texts[split_idx:]
    print(f"Training set size: {len(train_texts)}")
    print(f"Test set size: {len(test_texts)}")
    return train_texts, test_texts

In [10]:
# 3. Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForMaskedLM.from_pretrained(MODEL_DIR)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/86.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/468 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/305k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-mix were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# 4. Prepare dataset
def prepare_dataset(texts):
    dataset = Dataset.from_list(texts)

    def tokenize_function(example):
        return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])
    return tokenized_dataset

In [12]:
# 5. Fine-tune
def train_model(dataset):
    training_args = TrainingArguments(
        output_dir="./finetuned_model",
        per_device_train_batch_size=4,
        num_train_epochs=3,
        logging_dir="./logs",
        save_strategy="epoch",
        logging_steps=10,
        overwrite_output_dir=True,
    )

    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()
    model.save_pretrained("./finetuned_model")
    tokenizer.save_pretrained("./finetuned_model")

In [None]:
if __name__ == "__main__":
    # texts = extract_all_pdfs()
    texts = load_all_texts()
    train_texts, test_texts = split_dataset(texts)
    dataset = prepare_dataset(train_texts)
    train_model(dataset)
    print("Fine-tuning complete. Model saved to ./finetuned_model")

Loaded 709 documents from text files.
Training set size: 567
Test set size: 142


Map:   0%|          | 0/567 [00:00<?, ? examples/s]

  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mjacob-lan[0m ([33mjacob-lan-university-of-california-berkeley[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,2.368
20,1.9156
30,1.8399
40,1.5579
50,1.5458
60,1.3637
70,1.2315
80,1.624
90,1.187
100,1.2247
