In [20]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("ai-forever/mGPT-1.3B-georgian")
model = AutoModelForCausalLM.from_pretrained("ai-forever/mGPT-1.3B-georgian")




tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/582 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

  torch.utils._pytree._register_pytree_node(


pytorch_model.bin:   0%|          | 0.00/5.77G [00:00<?, ?B/s]

In [15]:
import pandas as pd
import json

# Adjust filename, column names, and output file as needed
CSV_FILE = "conference_talks.csv"
GE_COL = "GEO"  # Name of the column holding Georgian text
EN_COL = "ENG"   # Name of the column holding English text
OUTPUT_JSON = "parallel_conf_data.json"


In [16]:

def load_and_clean_data():
    """
    Reads the CSV, drops rows that have missing or empty text
    in either GeorgianText or EnglishText, and returns a clean DataFrame.
    """
    # Load the CSV (assumes UTF-8 encoding)
    df = pd.read_csv(CSV_FILE, encoding="utf-8")

    # Check for missing or NaN in either column
    missing_df = df[df[GE_COL].isna() | df[EN_COL].isna()]
    if not missing_df.empty:
        print("Rows dropped due to missing data:")
        print(missing_df)
        print("-" * 50)

    # Drop those rows from the main DataFrame
    df.dropna(subset=[GE_COL, EN_COL], inplace=True)

    # Optionally, drop rows that have text but are blank (e.g., "")
    blank_df = df[(df[GE_COL].str.strip() == "") | (df[EN_COL].str.strip() == "")]
    if not blank_df.empty:
        print("Rows dropped due to blank (empty) text:")
        print(blank_df)
        print("-" * 50)
        df = df.drop(blank_df.index)

    # Reset index to keep DataFrame tidy
    df.reset_index(drop=True, inplace=True)
    
    return df



In [17]:
def convert_df_to_translation_format(df):
    """
    Converts each row to a dict with a 'translation' field
    conforming to a typical Hugging Face seq2seq structure:
    {
       'translation': {
          'src': '...',
          'tgt': '...'
       }
    }
    """
    data = []
    for idx, row in df.iterrows():
        ge_text = row[GE_COL]
        en_text = row[EN_COL]

        # Build the dictionary in a typical huggingface 'translation' format
        data.append({
            "translation": {
                "ge": ge_text,
                "en": en_text
            }
        })
    return data



In [18]:
{
  "translation": { "src": "some text", "tgt": "some text" }
}

def save_to_json(data_list, output_path):
    """
    Saves the list of translation dictionaries to a JSON file with
    UTF-8 encoding and no ASCII escaping (so Georgian characters remain legible).
    """
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data_list, f, ensure_ascii=False, indent=2)
    print(f"Saved parallel data to {output_path}")


In [None]:
def main():
    # Step 1: Load and clean CSV
    df = load_and_clean_data()
    print(f"After cleaning, {len(df)} rows remain.\n")

    # Step 2: Convert to translation format
    parallel_conf_data = convert_df_to_translation_format(df)

    # Step 3: Save to JSON
    save_to_json(parallel_conf_data, OUTPUT_JSON)


if __name__ == "__main__":
    main()



Rows dropped due to missing data:
                                                 title  \
188                                        უსმინე მას!   
225  დიდი სიყვარული ჩვენი მამაზეციერის შვილების მიმართ   
246                     მოემზადეთ ღმერთთან შესახვედრად   
248  გამოცხადება ეკლესიისთვის, გამოცხადება ჩვენი ცხ...   
258                                          უფლის ხმა   
264                     არ შეგეშინდეთ სიკეთის კეთებისა   
292                                             მამები   
298                      სულიწმინდა თანამგზავრის როლში   
304     „არჩეულნი იმისთვის, რომ დაამოწმონ ჩემი სახელი“   
336                                უწყვეტი გამოცხადება   
363                    იერემიას გოდება: ერიდეთ მონობას   
372                 მორჩილებას ლოცვა-კურთხევები მოაქვს   
375                                         გამოსყიდვა   
385                                სად არის საბურველი?   
386            სინანულისა და გადაწყვეტილებების შესახებ   
394                შენიშნეთ დალოცვები 

In [21]:
import json
import random
from datasets import Dataset

def load_and_prepare_data(json_file):
    """
    Loads parallel data in JSON, converts each record
    into a prompt-completion pair suitable for causal LM.
    """
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Each item has item["translation"]["en"] and item["translation"]["ge"]
    # We'll create a list of dicts with fields 'prompt' and 'completion'
    records = []
    for item in data:
        en_text = item["translation"]["en"]
        ge_text = item["translation"]["ge"]

        # Construct the prompt
        prompt = (f"Translate from English to Georgian:\n"
                  f"English: {en_text}\n"
                  f"Georgian:")

        # The model should continue with the correct Georgian text
        completion = f" {ge_text}"

        records.append({
            "prompt": prompt,
            "completion": completion
        })

    # Shuffle if desired
    random.shuffle(records)

    # Convert to a huggingface Dataset
    return Dataset.from_list(records)


In [29]:
from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import UnigramTrainer

def train_unigram_tokenizer(json_file_path, vocab_size=32000, limit_alphabet=6000):
    """
    Trains a custom Unigram SentencePiece tokenizer using the tokenizers library.
    Returns a trained `tokenizers.Tokenizer` object.
    """
    # 1) Initialize a tokenizer with the Unigram model
    tokenizer = Tokenizer(Unigram())
    tokenizer.pre_tokenizer = Whitespace()

    # 2) Create a trainer for the Unigram model
    trainer = UnigramTrainer(
        vocab_size=vocab_size,
        show_progress=True,
        special_tokens=["<unk>", "<s>", "</s>", "<pad>", "<mask>"],
        # limit_alphabet can help control how many unique characters are included
        limit_alphabet=limit_alphabet
    )

    # 3) Train on your corpus
    tokenizer.train(
        files=[json_file_path],
        trainer=trainer
    )

    # 4) Set the tokenizer's post-processing or normalizer if needed
    # (For example, you could configure a BertNormalizer or NFD normalization.)

    return tokenizer



In [32]:
from transformers import PreTrainedTokenizerFast

def create_fast_tokenizer(tokenizer, model_name="custom-sentencepiece-unigram"):
    """
    Wraps a `tokenizers.Tokenizer` (Unigram SentencePiece) into a
    Hugging Face PreTrainedTokenizerFast for downstream usage.
    """
    fast_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        # Provide additional arguments to help HF understand special tokens
        # or naming conventions
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>"
    )
    fast_tokenizer.name_or_path = model_name
    return fast_tokenizer


In [33]:
import os

def main():
    # 1) Train or load existing tokenizer
    if not os.path.exists("my_unigram_tokenizer.json"):
        # Train a new tokenizer
        tokenizer_sp = train_unigram_tokenizer("parallel_conf_data.json", vocab_size=32000)
        # Save the raw tokenizer object
        tokenizer_sp.save("my_unigram_tokenizer.json")
    else:
        # Load existing tokenizer from file
        from tokenizers import Tokenizer
        tokenizer_sp = Tokenizer.from_file("my_unigram_tokenizer.json")

    # 2) Wrap in a PreTrainedTokenizerFast
    fast_tokenizer = create_fast_tokenizer(tokenizer_sp, model_name="my-unigram-tokenizer")

    # Optionally, save so you can load it like any HF tokenizer
    fast_tokenizer.save_pretrained("my_unigram_tokenizer")

    # 3) Use it in a standard Hugging Face pipeline, e.g. training your model
    # Example: We'll create a trivial dataset to illustrate a tokenize_function
    from datasets import Dataset

    # Suppose we want each row to have 'prompt' + 'completion' for a causal LM
    data = [
        {"prompt": "Translate: English: Hello world\nGeorgian:", "completion": " გამარჯობა მსოფლიო"},
        {"prompt": "Translate: English: How are you?\nGeorgian:", "completion": " როგორ ხარ?"},
    ]
    ds = Dataset.from_list(data)

    def tokenize_function(examples):
        # We'll combine the prompt and completion
        joined = [p + c for p, c in zip(examples["prompt"], examples["completion"])]
        out = fast_tokenizer(joined, padding="max_length", truncation=True, max_length=128)
        out["labels"] = out["input_ids"].copy()  # causal LM style
        return out

    tokenized_ds = ds.map(tokenize_function, batched=True)

    # At this point, tokenized_ds is ready for a Hugging Face Trainer
    # using any model (like ai-forever/mGPT-1.3B-georgian).
    print(tokenized_ds[0])

if __name__ == "__main__":
    main()


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

{'prompt': 'Translate: English: Hello world\nGeorgian:', 'completion': ' გამარჯობა მსოფლიო', 'input_ids': [8675, 2615, 30, 8108, 30, 74, 2796, 51, 179, 24297, 124, 30, 23390, 1854, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,

In [36]:
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained("ai-forever/mGPT-1.3B-georgian")

training_args = TrainingArguments(
    output_dir="my_mgpt_ft",
    num_train_epochs=1,
    per_device_train_batch_size=1,
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="epoch",
    fp16=True  # if your GPU supports float16
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds,  # from the snippet above
    data_collator=None, # If you want dynamic padding or something, see DataCollatorForLanguageModeling
)

trainer.train()



ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`