In [None]:
!pip install bitsandbytes
!pip install peft
!pip install accelerate

In [None]:
file_path = "new_file.txt"
text_content = """
This is the first document.
Here is another example of text data.
Unlabeled data is used for unsupervised learning.
Fine-tuning a model on new data can improve its performance.
Machine learning models require a lot of data to train effectively.
Natural language processing is a field of artificial intelligence.
Deep learning models can learn complex patterns from data.
Transformer models have revolutionized NLP tasks.
Language models can generate human-like text.
Pre-trained models can be fine-tuned on specific tasks.
Fine-tuning helps adapt models to new domains and tasks.
Text data can be sourced from various domains and contexts.
Large-scale datasets are crucial for training robust models.
Unsupervised learning does not require labeled data.
Self-supervised learning is a powerful approach in NLP.
The quality of text data affects the performance of language models.
Tokenization is an important step in preprocessing text data.
Sequence models can handle variable-length text inputs.
Attention mechanisms enable models to focus on relevant parts of the input.
Language models can be evaluated on various benchmarks.
Text generation is a common application of language models.
Sentiment analysis is a popular task in NLP.
Contextual embeddings capture the meaning of words in context.
Text classification is used to categorize text into predefined classes.
Entity recognition identifies named entities in text.
Language models can be used for text summarization.
Data augmentation techniques can enhance the training process.
Fine-tuning requires careful selection of hyperparameters.
Preprocessing text data involves cleaning and normalizing it.
Language models can be adapted to different languages.
"""
# Open the file in write mode
with open(file_path, 'w') as file:
    # Write content to the file
    file.write(text_content)
     
print(f"File '{file_path}' created successfully.")

In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
model_name='TinyLlama/TinyLlama-1.1B-Chat-v1.0'
tokenizer = AutoTokenizer.from_pretrained(model_name)#
#model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T")
 
#Quantisation
from transformers import BitsAndBytesConfig, Trainer, TrainingArguments
# Configure the model for 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # For 8-bit quantization
    load_in_4bit=False  # Set to True if you want 4-bit quantization
)
# Configure the model for 8-bit quantization
#bnb_config = BitsAndBytesConfig.from_pretrained(model_name, load_in_8bit=True)
 
# Load the quantized model
model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=bnb_config)
##Adding trainable quantised peft adapter which will help in transfer learning through layer freezing
from peft import LoraConfig
 
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)
model.add_adapter(peft_config)
# # Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# model.to(device)
 
import torch
from transformers import TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments
 
# Function to load dataset
def load_dataset(file_path, tokenizer, block_size=128):
    dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size
    )
    return dataset
 
# Function to create data collator
def create_data_collator(tokenizer):
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,  # Masked Language Modeling (MLM) is false for autoregressive models like GPT-2
    )
    return data_collator
 
# Load dataset
train_dataset = load_dataset("/kaggle/working/new_file.txt", tokenizer)
 
# Create data collator
data_collator = create_data_collator(tokenizer)
 
# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)
 
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)
 

2024-06-05 12:47:37.575511: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-05 12:47:37.575574: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-05 12:47:37.577007: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



In [2]:
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss


TrainOutput(global_step=20, training_loss=1.7415674209594727, metrics={'train_runtime': 68.0911, 'train_samples_per_second': 0.587, 'train_steps_per_second': 0.294, 'total_flos': 32057044500480.0, 'train_loss': 1.7415674209594727, 'epoch': 20.0})

In [3]:
model.save_pretrained("./fine-tuned-llama")
tokenizer.save_pretrained("./fine-tuned-llama")



('./fine-tuned-llama/tokenizer_config.json',
 './fine-tuned-llama/special_tokens_map.json',
 './fine-tuned-llama/tokenizer.model',
 './fine-tuned-llama/added_tokens.json',
 './fine-tuned-llama/tokenizer.json')

In [5]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained("./fine-tuned-llama")
tokenizer = AutoTokenizer.from_pretrained("./fine-tuned-llama")

# Check if the tokenizer is using padding side left or right
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Generate a response
prompt = "why text classfication is used"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate the model's response
output = model.generate(**inputs, max_length=100)

# Decode the output
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response,".........")


why text classfication is used in the context of image recognition. .........


In [6]:
prompt = "what is mean by machine learning"
inputs = tokenizer(prompt, return_tensors="pt")

# Generate the model's response
output = model.generate(**inputs, max_length=100)

# Decode the output
response = tokenizer.decode(output[0], skip_special_tokens=True)
print(response,".........")


what is mean by machine learning and how it can be used in the field of finance. .........
