In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk everything under /kaggle/input and print the full path of each file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, entry) for entry in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient
# Client used to read secrets from the Kaggle secrets store.
user_secrets = UserSecretsClient()

# Fetch the Hugging Face token stored under the secret name "hf_key" and log in with it,
# so the token itself never appears in the notebook source.
hf_token = user_secrets.get_secret("hf_key")
login(token = hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:
%%capture
%pip install -U transformers 
%pip install -U datasets 
%pip install -U accelerate 
%pip install -U peft 
%pip install -U trl 
%pip install -U bitsandbytes 
%pip install -U mlflow

In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, mlflow
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

In [5]:
base_model = "google/gemma-2-2b"  # checkpoint id passed to from_pretrained when loading model and tokenizer
dataset_name = "premio-ai/TheArabicPile_Articles"  # dataset id passed to load_dataset
new_model = "Gemma-2-2b-ar"  # used as the training output_dir and as the name for save_pretrained / push_to_hub

In [6]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    torch_dtype = torch.bfloat16
    attn_implementation = "flash_attention_2"
else:
    torch_dtype = torch.float16
    attn_implementation = "eager"

In [7]:
# QLoRA config: quantization settings later passed to from_pretrained as `quantization_config`.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the base weights in 4-bit
    bnb_4bit_quant_type="nf4",  # 4-bit quantization data type
    bnb_4bit_compute_dtype=torch_dtype,  # dtype picked by the GPU capability check
    bnb_4bit_use_double_quant=True,  # turn on the "double quant" option
)


In [10]:
# Load model
# The base checkpoint is loaded with the 4-bit quantization config and the attention
# backend selected earlier; device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
# NOTE(review): trust_remote_code=True permits executing code shipped in the model
# repository — confirm it is actually required for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    """Collect the leaf names of every 4-bit linear layer in ``model``.

    Walks ``model.named_modules()`` and, for each module that is an instance of
    ``bnb.nn.Linear4bit``, keeps the last dot-separated component of its
    qualified name (e.g. ``"model.layers.0.self_attn.q_proj"`` -> ``"q_proj"``).

    Args:
        model: Any object exposing ``named_modules()`` that yields
            ``(qualified_name, module)`` pairs.

    Returns:
        A sorted list of unique leaf names, with ``"lm_head"`` excluded.
        Sorting makes the result deterministic from run to run (iteration
        order of a set of strings is not).
    """
    linear_cls = bnb.nn.Linear4bit
    leaf_names = {
        qualified_name.rsplit(".", 1)[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, linear_cls)
    }
    # The output head is never a LoRA target (needed for 16 bit).
    leaf_names.discard("lm_head")
    return sorted(leaf_names)

# Names of the quantized linear layers in the loaded model; used as the LoRA target modules.
modules = find_all_linear_names(model)

In [12]:
# LoRA config
# Adapters are attached to the module names collected by find_all_linear_names.
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
# Wrap the quantized base model with the adapter config; `model` is rebound to the wrapped model.
model = get_peft_model(model, peft_config)

In [15]:
# Load the train split of the "dedup" configuration, then keep a fixed-seed
# shuffled sample of 12,000 rows.
dataset = load_dataset(dataset_name,"dedup", split="train")
dataset = dataset.shuffle(seed=65).select(range(12000))
dataset  # bare expression so the notebook displays the dataset summary

Dataset({
    features: ['text'],
    num_rows: 12000
})

In [16]:
# Hold out 10% of the sampled rows for evaluation. The split is seeded (same value
# as the shuffle seed) so the train/test partition is reproducible across runs.
dataset = dataset.train_test_split(test_size=0.1, seed=65)


In [17]:
from datetime import datetime

# Setting hyperparameters
mlflow.set_experiment("MLflow PEFT model")

training_arguments = TrainingArguments(
    report_to="mlflow",
    # Name the MLflow run with a timestamp. %S (uppercase) is zero-padded seconds;
    # lowercase %s is not a portable strftime directive (epoch seconds on glibc only).
    run_name=f"gemma-2B-ar-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    max_steps=2500,
    eval_strategy="steps",
    eval_steps=0.2,  # a value below 1 is a ratio of total steps: evaluation every 500 of the 2500 steps
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
)

# Setting SFT parameters
# NOTE(review): `model` has already been wrapped by get_peft_model and the same
# peft_config is passed again here — confirm the trainer does not wrap it twice.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length=512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=False,
)

# Turn off the model's use_cache flag before training starts.
model.config.use_cache = False
trainer.train()


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/10800 [00:00<?, ? examples/s]

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss,Validation Loss
500,3.6687,2.690394
1000,3.7628,2.618979
1500,2.8252,2.570303
2000,3.6483,2.521795
2500,3.5708,2.502804


TrainOutput(global_step=2500, training_loss=2.615820879983902, metrics={'train_runtime': 4025.9427, 'train_samples_per_second': 1.242, 'train_steps_per_second': 0.621, 'total_flos': 9326411910526464.0, 'train_loss': 2.615820879983902, 'epoch': 0.46296296296296297})

In [18]:
# Write the trained model's files to the local `new_model` directory, then upload
# them to the Hugging Face Hub under the same name.
trainer.model.save_pretrained(new_model)
trainer.model.push_to_hub(new_model, use_temp_dir=False)

adapter_model.safetensors:   0%|          | 0.00/83.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/hassan123mohamed/Gemma-2-2b-ar/commit/6e4063ae344cd1f52f132326064cdc451117cd52', commit_message='Upload model', commit_description='', oid='6e4063ae344cd1f52f132326064cdc451117cd52', pr_url=None, repo_url=RepoUrl('https://huggingface.co/hassan123mohamed/Gemma-2-2b-ar', endpoint='https://huggingface.co', repo_type='model', repo_id='hassan123mohamed/Gemma-2-2b-ar'), pr_revision=None, pr_num=None)

In [28]:
!pip install pyngrok


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [24]:
# Prompt template that is stored with the MLflow model when it is logged;
# `{prompt}` is the placeholder for the user's input.
prompt_template = """You are a writting assistant to help writing essays and articles in arabic

{prompt}

### Response:
"""



In [25]:
from mlflow.models import infer_signature

# Example input used only for schema inference.
sample = "اكتب عن الحضارة المصرية"

# MLflow infers schema from the provided sample input/output/params
signature = infer_signature(
    model_input=sample,
    model_output="",
    # Parameters are saved with default values if specified
    params={"max_new_tokens": 1024, "repetition_penalty": 1.15, "return_full_text": False},
)
signature  # bare expression so the notebook displays the inferred schema


inputs: 
  [string (required)]
outputs: 
  [string (required)]
params: 
  ['max_new_tokens': long (default: 1024), 'repetition_penalty': double (default: 1.15), 'return_full_text': boolean (default: False)]

In [26]:
# Id of the most recently active MLflow run (presumably the training run — verify),
# so the model is logged into that same run.
last_run_id = mlflow.last_active_run().info.run_id
# Save a tokenizer without padding because it is only needed for training
tokenizer_no_pad = AutoTokenizer.from_pretrained(base_model, add_bos_token=True)

# If you interrupt the training, uncomment the following line to stop the MLflow run
# mlflow.end_run()

with mlflow.start_run(run_id=last_run_id):
    # Record the LoRA configuration as run parameters.
    mlflow.log_params(peft_config.to_dict())
    # Log the trained model together with the tokenizer, prompt template and signature.
    mlflow.transformers.log_model(
        transformers_model={"model": trainer.model, "tokenizer": tokenizer_no_pad},
        prompt_template=prompt_template,
        signature=signature,
        artifact_path="model",  # This is a relative path to save model files within MLflow run
    )

2024/10/12 22:47:03 INFO mlflow.transformers: Overriding save_pretrained to False for PEFT models, following the Transformers behavior. The PEFT adaptor and config will be saved, but the base model weights will not and reference to the HuggingFace Hub repository will be logged instead.
2024/10/12 22:47:04 INFO mlflow.transformers: Skipping saving pretrained model weights to disk as the save_pretrained argumentis set to False. The reference to the HuggingFace Hub repository google/gemma-2-2b will be logged instead.


README.md:   0%|          | 0.00/25.8k [00:00<?, ?B/s]

2024/10/12 22:47:04 INFO mlflow.transformers: text-generation pipelines saved with prompt templates have the `return_full_text` pipeline kwarg set to False by default. To override this behavior, provide a `model_config` dict with `return_full_text` set to `True` when saving the model.
2024/10/12 22:47:04 INFO mlflow.transformers: A local checkpoint path or PEFT model is given as the `transformers_model`. To avoid loading the full model into memory, we don't infer the pip requirement for the model. Instead, we will use the default requirements, but it may not capture all required pip libraries for the model. Consider providing the pip requirements explicitly.


In [30]:
!ngrok config add-authtoken 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [46]:
# NOTE: running this cell again while the server is still up fails with
# "Address already in use" on port 5000.
get_ipython().system_raw("mlflow ui --port 5000 &") # run tracking UI in the background


[2024-10-13 00:29:11 +0000] [968] [INFO] Starting gunicorn 23.0.0
[2024-10-13 00:29:11 +0000] [968] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-10-13 00:29:11 +0000] [968] [ERROR] connection to ('127.0.0.1', 5000) failed: [Errno 98] Address already in use
[2024-10-13 00:29:12 +0000] [968] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-10-13 00:29:12 +0000] [968] [ERROR] connection to ('127.0.0.1', 5000) failed: [Errno 98] Address already in use
[2024-10-13 00:29:13 +0000] [968] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-10-13 00:29:13 +0000] [968] [ERROR] connection to ('127.0.0.1', 5000) failed: [Errno 98] Address already in use
[2024-10-13 00:29:14 +0000] [968] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-10-13 00:29:14 +0000] [968] [ERROR] connection to ('127.0.0.1', 5000) failed: [Errno 98] Address already in use
[2024-10-13 00:29:15 +0000] [968] [ERROR] Connection in use: ('127.0.0.1', 5000)
[2024-10-13 00:29:15 +0000] [968] [ERROR] connection to (

In [47]:
# `ngrok` is provided by pyngrok (installed by this notebook). It is imported here
# because no other cell imports it, so on a fresh kernel the name would be undefined.
from pyngrok import ngrok

# Stop any ngrok process left over from a previous attempt before opening a new tunnel.
ngrok.kill()


In [45]:
!pip install pyngrok --quiet


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [48]:
# Open an HTTP tunnel to local port 5000 (the port the MLflow UI is started on)
# and print the tunnel's public URL.
ngrok_tunnel = ngrok.connect(addr="5000", proto="http", bind_tls=True)
print("MLflow Tracking UI:", ngrok_tunnel.public_url)

MLflow Tracking UI: https://6c03-34-91-255-126.ngrok-free.app


In [49]:
%%capture
%pip install -U accelerate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [52]:
# Load the logged model back as a generic pyfunc model from the run it was logged to.
# The run id comes from `last_run_id` instead of a hard-coded literal, because every
# fresh training run gets a new id and a pasted id would not exist after a re-run.
mlflow_model = mlflow.pyfunc.load_model(f"runs:/{last_run_id}/model")




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [57]:
import pandas as pd
from IPython.display import HTML, display

def display_table(dataset_or_sample):
    """Render a dataset, or a single sample, as a left-aligned HTML table.

    Multi-line strings are shown with real line breaks instead of being squeezed
    onto one line, and pandas' display limits are lifted so long text is not
    truncated.

    Args:
        dataset_or_sample: Either a ``dict`` describing one sample (each value
            becomes one cell of a single-row table) or anything accepted by
            ``pd.DataFrame(...)``, such as a dataset.

    Returns:
        None. The table is shown through IPython's ``display``.
    """
    from html import escape

    # Lift pandas' truncation limits (set globally, as before).
    pd.set_option("display.max_colwidth", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_rows", None)

    if isinstance(dataset_or_sample, dict):
        # A single sample: one row, one column per key.
        frame = pd.DataFrame(dataset_or_sample, index=[0])
    else:
        frame = pd.DataFrame(dataset_or_sample)

    def _as_html_cell(value):
        # Escape the text ourselves, then turn newlines into <br>, so that
        # to_html(escape=False) keeps the line breaks without letting raw markup through.
        return escape(str(value)).replace("\n", "<br>")

    frame = frame.apply(lambda column: column.map(_as_html_cell))
    table_html = frame.to_html(escape=False)
    style = (
        "<style> .dataframe th, .dataframe tbody td "
        "{ text-align: left; padding-right: 30px; } </style>"
    )
    display(HTML(f"{style} {table_html}"))


# Prompt sent to the reloaded MLflow model: an Arabic title plus an English question.
test_prompt = """
### title:
الحضارة المصرية القديمة

### Question:
write about pharaohs and how they were connected other civilisation in arabic
"""

# predict() is indexed with [0] to take the first generated result.
generated_query = mlflow_model.predict(test_prompt)[0]
print(generated_query)
display_table({"prompt": test_prompt, "generated_query": generated_query})



The ancient Egyptian civilization was one of the most important civilizations that existed during the first millennium BC. It is considered as an example for all civilized societies, because it has left behind many monuments such as pyramids, temples and statues. The Egyptians have been able to build these great works thanks to their advanced technology at that time which allowed them to use stone tools made from granite or limestone with precision so much so that some people believe there may be more than 10 different types of stones used by Ancient Egypt!


In [59]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline on the GPU from a Hub checkpoint.
# NOTE(review): this repo id ends in "-final" and differs from the `new_model` name
# pushed earlier in the notebook — confirm which checkpoint is intended.
pipe = pipeline("text-generation", model="hassan123mohamed/Gemma-2-2b-ar-final",
               device="cuda",)
text = "اكتب عن الحضارة المصرية القديمة"
# Generate up to 512 new tokens and print the text of the first result.
outputs = pipe(text, max_new_tokens=512)
response = outputs[0]["generated_text"]
print(response)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

اكتب عن الحضارة المصرية القديمة المقال : 1 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 2 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 3 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 4 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 5 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 6 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 7 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 8 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 9 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 10 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 11 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مصر منذ 5000 سنة قبل الميلاد. 12 - الحضارة المصرية القديمة هي الحضارة التي نشأت في مص

In [60]:
!pip install gradio

  pid, fd = os.forkpty()


Collecting gradio
  Downloading gradio-5.0.2-py3-none-any.whl.metadata (15 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.6.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting semantic-version~=2.0 (from gradio)
  Downloading semantic_version-2.10.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting tomlkit==0.12.0 (from gradio)
  Downloading tomlkit-0.12.0-py3-none-any.whl.metadata (2.7 kB)
Downloading gradio-5.0.2-py3-none-any.whl (42.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0mm
[?25hDownloading gradio_client-1.4.0-py3-none-any.whl (319 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.8/319.8 kB[0m [31m19.3 MB/s

In [61]:
def predict(prompt):
    """Generate text for ``prompt`` with the notebook's text-generation pipeline.

    Uses ``pipe`` (the ``pipeline("text-generation", ...)`` object built earlier in
    the notebook), whose result is a list of dicts carrying ``"generated_text"``.
    The training-time ``model`` object is not a pipeline and cannot be called with
    a plain string this way.

    Args:
        prompt: The input text.

    Returns:
        The ``"generated_text"`` string of the first pipeline result.
    """
    # Same generation budget as the direct pipeline call made earlier in the notebook.
    outputs = pipe(prompt, max_new_tokens=512)
    return outputs[0]["generated_text"]

import gradio as gr

# Serve `predict` behind a simple text-in / text-out web UI and start it.
gr.Interface(fn=predict, inputs="text", outputs="text").launch()

* Running on local URL:  http://127.0.0.1:7860
Kaggle notebooks require sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

* Running on public URL: https://754c934ce9c9fd0201.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


