In [10]:
!pip install -U peft bitsandbytes transformers accelerate




In [11]:
!pip install -U trl




In [12]:
!pip install PyMuPDF





**Prebuilt data from the Hugging Face Hub**

In [13]:
from datasets import Dataset ,load_dataset


In [14]:
# Load the "train" split of the TinyStories dataset from the Hugging Face Hub.
dataset=load_dataset("roneneldan/TinyStories", split="train")

In [15]:
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 2119719
})


In [16]:
print(dataset[0])

{'text': 'One day, a little girl named Lily found a needle in her room. She knew it was difficult to play with it because it was sharp. Lily wanted to share the needle with her mom, so she could sew a button on her shirt.\n\nLily went to her mom and said, "Mom, I found this needle. Can you share it with me and sew my shirt?" Her mom smiled and said, "Yes, Lily, we can share the needle and fix your shirt."\n\nTogether, they shared the needle and sewed the button on Lily\'s shirt. It was not difficult for them because they were sharing and helping each other. After they finished, Lily thanked her mom for sharing the needle and fixing her shirt. They both felt happy because they had shared and worked together.'}


In [17]:
print(dataset[1])

{'text': 'Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong.\n\nOne day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn.\n\nBeep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after.'}


In [18]:
print(dataset[2])

{'text': 'One day, a little fish named Fin was swimming near the shore. He saw a big crab and wanted to be friends. "Hi, I am Fin. Do you want to play?" asked the little fish. The crab looked at Fin and said, "No, I don\'t want to play. I am cold and I don\'t feel fine."\n\nFin felt sad but wanted to help the crab feel better. He swam away and thought of a plan. He remembered that the sun could make things warm. So, Fin swam to the top of the water and called to the sun, "Please, sun, help my new friend feel fine and not freeze!"\n\nThe sun heard Fin\'s call and shone its warm light on the shore. The crab started to feel better and not so cold. He saw Fin and said, "Thank you, little fish, for making me feel fine. I don\'t feel like I will freeze now. Let\'s play together!" And so, Fin and the crab played and became good friends.'}



**Our own custom data (non-instruction data) for domain-specific fine-tuning**

In [19]:
import fitz

In [20]:
def extract_text_from_pdf(pdf_path):
    """Return the non-empty plain text of each page of a PDF.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        A list of strings in page order. Each entry is the page's
        ``get_text("text")`` output with surrounding whitespace stripped;
        pages whose stripped text is empty are left out.
    """
    with fitz.open(pdf_path) as pdf:
        stripped_pages = [page.get_text("text").strip() for page in pdf]
    return [page_text for page_text in stripped_pages if page_text]

In [21]:
pdf_texts = extract_text_from_pdf("/content/Metformin.pdf")

In [22]:
pdf_texts

['Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis. \n \nClinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits hepatic HMG-CoA red

In [23]:
import re

In [24]:
def split_paragraphs(pages, min_length=30):
    """Split page texts into paragraph-sized chunks.

    Every page is cut wherever a blank line occurs (a newline, optional
    whitespace, then another newline). Each chunk is stripped of surrounding
    whitespace and kept only if it is longer than ``min_length`` characters.

    Args:
        pages: Iterable of page strings.
        min_length: Chunks whose stripped length is not strictly greater than
            this value are discarded. The default of 30 reproduces the
            previously hard-coded cutoff.

    Returns:
        A flat list of the retained chunks, in page order and then in order
        of appearance within each page.
    """
    paragraphs = []
    for page_text in pages:
        # Split on blank lines, including "blank" lines that hold only whitespace.
        for chunk in re.split(r'\n\s*\n', page_text):
            clean = chunk.strip()
            if len(clean) > min_length:  # drop fragments that are too short
                paragraphs.append(clean)
    return paragraphs


In [25]:

paragraphs = split_paragraphs(pdf_texts)

In [26]:
data = [{"text": p} for p in paragraphs]

In [27]:
print(data)

[{'text': 'Metformin is one of the most widely prescribed oral antihyperglycemic agents.\u200b\n Its primary mechanism of action involves the activation of AMP-activated protein kinase \n(AMPK), a central metabolic regulator that promotes glucose uptake and fatty acid oxidation \nwhile inhibiting hepatic gluconeogenesis.\u200b\n Beyond its glycemic control, Metformin has been shown to improve cardiovascular outcomes \nand display anti-inflammatory properties.\u200b\n Recent studies also suggest potential anticancer effects through inhibition of the mTOR \nsignaling pathway and suppression of tumor angiogenesis.'}, {'text': 'Clinical trials have demonstrated that combining Atorvastatin with Ezetimibe results in \nsignificant reductions in low-density lipoprotein cholesterol (LDL-C) levels compared to \nmonotherapy.\u200b\n Ezetimibe acts by inhibiting the Niemann–Pick C1-like 1 (NPC1L1) transporter in the intestinal \nwall, reducing cholesterol absorption, while Atorvastatin inhibits he

In [28]:
# Wrap the paragraph records in a Dataset. This rebinds `dataset`, which
# until now referred to the TinyStories split loaded earlier.
dataset=Dataset.from_list(data)

In [29]:
dataset

Dataset({
    features: ['text'],
    num_rows: 4
})

In [30]:
model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

In [31]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling


In [32]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [33]:
if tokenizer.pad_token is None:
  tokenizer.pad_token=tokenizer.eos_token

In [34]:
def tokenize_fn(examples):
  """Tokenize the "text" field and build causal-LM labels.

  Text is truncated or padded to exactly 512 tokens. Labels are a copy of
  ``input_ids`` in which every padded position (attention mask 0) is replaced
  by -100, so padding does not contribute to the training loss. Previously
  the labels were a plain copy and padding tokens were trained on as targets.

  Args:
    examples: Mapping with a "text" entry: a list of strings when called by
      ``Dataset.map(..., batched=True)``, or a single string.

  Returns:
    The tokenizer output with an added "labels" entry shaped like
    ``input_ids``.
  """
  tokens=tokenizer(examples["text"],truncation=True,padding="max_length",max_length=512)

  def mask_padding(ids, mask):
    # -100 is the label value the loss ignores.
    return [tok if keep else -100 for tok, keep in zip(ids, mask)]

  input_ids=tokens["input_ids"]
  attention=tokens["attention_mask"]
  if input_ids and isinstance(input_ids[0], list):
    # Batched call: one row of ids per example.
    tokens["labels"]=[mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)]
  else:
    # Single example: a flat list of ids.
    tokens["labels"]=mask_padding(input_ids, attention)
  return tokens

In [35]:
tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [36]:
tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4
})

In [37]:
model = AutoModelForCausalLM.from_pretrained(model_name)


In [38]:

# Hyperparameters for the full fine-tuning run.
training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="./llama-pharma-domain",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=2,
    per_device_train_batch_size=2,
    learning_rate=2e-5,
    fp16=True,
    # Log every 50 steps; no external experiment tracker.
    logging_steps=50,
    report_to="none",
)

In [39]:
from transformers import TrainingArguments
help(TrainingArguments)

Help on class TrainingArguments in module transformers.training_args:

class TrainingArguments(builtins.object)
 |
 |  TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop
 |  itself**.
 |
 |  Using [`HfArgumentParser`] we can turn this class into
 |  [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
 |  command line.
 |
 |  Parameters:
 |      output_dir (`str`, *optional*, defaults to `"trainer_output"`):
 |          The output directory where the model predictions and checkpoints will be written.
 |      overwrite_output_dir (`bool`, *optional*, defaults to `False`):
 |          If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir`
 |          points to a checkpoint directory.
 |      do_train (`bool`, *optional*, defaults to `False`):
 |          Whether to run training or not. This argument is not directly used 

In [40]:
# Trainer for the full fine-tuning run; no data collator or evaluation set is supplied.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized)

In [None]:
trainer.train()



In [None]:
# Nothing above restricts which weights are trained, so that run was full fine-tuning.
# Next, the LoRA-based method.

# Run the garbage collector and release PyTorch's cached CUDA memory before
# the model is loaded again.
# NOTE(review): `model` and `trainer` are still bound when this cell runs, so
# presumably the memory they hold is not released here — confirm.
import torch, gc
gc.collect()
torch.cuda.empty_cache()


In [None]:
!pip install -U peft bitsandbytes transformers accelarate

In [None]:


from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset

In [None]:
model = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model)


In [None]:
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_fn(examples):
    """Tokenize the "text" field and build causal-LM labels.

    Text is truncated or padded to exactly 512 tokens. Labels are a copy of
    ``input_ids`` in which every padded position (attention mask 0) is
    replaced by -100, so padding does not contribute to the training loss.
    Previously the labels were a plain copy and padding tokens were trained
    on as targets.

    Args:
        examples: Mapping with a "text" entry: a list of strings when called
            by ``Dataset.map(..., batched=True)``, or a single string.

    Returns:
        The tokenizer output with an added "labels" entry shaped like
        ``input_ids``.
    """
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

    def mask_padding(ids, mask):
        # -100 is the label value the loss ignores.
        return [tok if keep else -100 for tok, keep in zip(ids, mask)]

    input_ids = tokens["input_ids"]
    attention = tokens["attention_mask"]
    if input_ids and isinstance(input_ids[0], list):
        # Batched call: one row of ids per example.
        tokens["labels"] = [
            mask_padding(ids, mask) for ids, mask in zip(input_ids, attention)
        ]
    else:
        # Single example: a flat list of ids.
        tokens["labels"] = mask_padding(input_ids, attention)
    return tokens


In [None]:

# Tokenize the custom paragraph Dataset held in the `dataset` variable.
# The former `import dataset` line was removed: it imports an unrelated module
# named "dataset" and would replace the Dataset object bound to that name.
# The raw "text" column is dropped, matching the full fine-tuning section.
tokenized = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])


In [None]:
tokenized

In [None]:
# Load the base model with 8-bit weights. The quantization setting is passed
# through BitsAndBytesConfig because the bare `load_in_8bit=True` keyword on
# from_pretrained is deprecated.
# On entry `model` holds the checkpoint id string; afterwards it is rebound to
# the loaded model, so this cell cannot simply be re-run on its own.
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLM.from_pretrained(
    model,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",
)

In [None]:
# LoRA settings: rank 8, alpha 16, applied to the modules named
# "q_proj" and "v_proj"; bias terms are not adapted.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

In [None]:
model=get_peft_model(model,lora_config)

In [None]:
# Hyperparameters for the LoRA run.
args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="./tinyllama-lora",
    save_total_limit=1,
    # Optimisation settings: batches of 1 accumulated over 8 steps.
    num_train_epochs=5,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    fp16=True,
    # Log every 20 steps; no external experiment tracker.
    logging_steps=20,
    report_to="none",
)

In [None]:
# Trainer for the LoRA run; no data collator or evaluation set is supplied.
trainer = Trainer(model=model, args=args, train_dataset=tokenized)

In [None]:
trainer.train()

In [None]:
model_path = "/content/tinyllama-lora/checkpoint-5"

In [None]:
model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")


In [None]:
prompt = "Clinical trials demonstrated that combining Atorvastatin with Ezetimibe"


In [None]:


# Tokenize the prompt and move the tensors to the device the model was placed on.
# The model is loaded with device_map="auto", so the device is read from the
# model rather than hard-coded as "cuda".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

In [None]:
# Sample up to 100 new tokens as a continuation of the prompt.
# Fixes the misspelled keyword "temparature": it is not a generation argument,
# so the intended sampling temperature of 0.8 was never applied.
outputs = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.8,
    top_p=0.9,
    repetition_penalty=1.1,
)

In [None]:
print("\nModel Output:\n")
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

In [None]:


# Other corpora on the Hugging Face Hub, loaded here by repository id.
# NOTE(review): `dataset` is assigned three times in this cell, so only the
# last of those loads stays bound to that name.
# NOTE(review): no split, config, or streaming arguments are given; presumably
# each call fetches the complete dataset — confirm sizes before running.
dataset = load_dataset("HuggingFaceFW/fineweb")
pubmed = load_dataset("ncbi/pubmed")
dataset = load_dataset("datajuicer/the-pile-pubmed-abstracts-refined-by-data-juicer")
dataset = load_dataset("open-llm-leaderboard/open_llm_corpus")
owt = load_dataset("Skylion007/openwebtext")
ds = load_dataset("armanc/scientific_papers")
