# Let's Start by Installing Dependencies

In [None]:
!pip install transformers accelerate bitsandbytes>=0.39.0  -qU

## Create a text-generation pipeline with the open-source GPT-Neo model

In [None]:
from transformers import pipeline
# The first argument is the pipeline task name ('text-generation').
# `model=` is the model identifier taken from its Hugging Face model card:
# https://huggingface.co/EleutherAI/gpt-neo-125m
generator = pipeline('text-generation', model ='EleutherAI/gpt-neo-125M')


Downloading (…)lve/main/config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/560 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

## Now that the pipeline is created, let's test the text-generation feature!

In [None]:
# Prompt the pipeline will continue.
post = "This is my first blog post, I'm really excited!"
# do_sample=True turns on sampling, so the continuation can differ between runs.
result = generator(post, max_length=50, do_sample=True, temperature=0.9)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Inspect the raw pipeline output (a list of dicts with a 'generated_text' key).
result

[{'generated_text': "This is my first blog post, I'm really excited! I've been writing this blog for a long time, and just started a new one for my new family and friends, so I'm taking this time off when I think about it!\n"}]

In [None]:
# A harder, instruction-style prompt to probe the limits of the small model.
# NOTE(review): "Star with code" looks like a typo for "Start with code"; it is
# left as-is because the recorded output echoes this exact prompt.
post = "Write a python function that takes two numbers as function params and returns their convolution equation. Star with code:\n"
result = generator(post, max_length=512, do_sample=True, temperature=0.9)



Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Show only the generated string of the first returned item.
result[0]['generated_text']

"Write a python function that takes two numbers as function params and returns their convolution equation. Star with code:\n>>> myInt = [1,2]\n>>> my = np.array(myVar(int) for int in range((4,'f','n'))))\n>>> my = np.array(myVar(2))\n>>> my\narray([[1,                                                                         \n              [2],                                                                      \n              [3,                                                                     \n               [4,                                                                     \n               [5,                                                                      \n           "

# Finetuning

Finetuning is the process of further training a pretrained model on the specific data we want it to understand and base its answers on.

To finetune the model, we first need to load its pretrained checkpoint from Hugging Face.

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model_name = "gpt2" # identifier of the pretrained checkpoint to load

# Two objects are needed: the language model itself and the tokenizer that
# converts between text and the token ids the model works with.

model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)


Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Time to check the model performance before finetuning

In [None]:
# Now that the model is loaded, generating text is a three-step round trip:
#   1. encode (tokenize) the input text into token ids,
#   2. let the model extend the sequence of ids,
#   3. decode (de-tokenize) the output ids back into text.
def generate_text(prompt, max_length=100, temperature=0.7):
    """Generate a continuation of ``prompt`` with the module-level ``model``.

    Uses the module-level ``tokenizer`` for encoding and decoding.

    Args:
        prompt: Text the model should continue.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        temperature: Sampling temperature. A truthy value enables sampling
            (``do_sample=True``) so the temperature actually takes effect;
            ``0`` or ``None`` falls back to deterministic decoding.

    Returns:
        The decoded text of the first generated sequence, with special
        tokens stripped.
    """
    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which generate() asks for in order to give reliable results.
    encoded = tokenizer(prompt, return_tensors="pt")

    generation_kwargs = {
        "max_length": max_length,
        # Set explicitly so generate() does not have to guess a pad token.
        "pad_token_id": tokenizer.eos_token_id,
    }
    if temperature:
        # Temperature is only honoured when sampling is enabled.
        generation_kwargs.update(do_sample=True, temperature=temperature)

    output_ids = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        **generation_kwargs,
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


and here's an example:

In [None]:
# Baseline generation from the pretrained model, before any finetuning.
prompt = "In a futuristic world, "
generated_text = generate_text(prompt, max_length=150, temperature=0.8)
print(generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In a futuristic world,  the world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos.  The world is a place of chaos and chaos. 


Whenever we want to finetune, we first need to preprocess our data.

The data we are dealing with is from Kaggle
https://www.kaggle.com/datasets/elvis23/mental-health-conversational-data/

In [None]:
import json
def preprocess_intents_json(intents_file):
    """Flatten an intents JSON file into a plain-text training transcript.

    The file must hold a top-level ``"intents"`` list whose items each carry
    a ``"patterns"`` list and a ``"responses"`` list. For every pattern, one
    ``User: <pattern>`` line is emitted, followed by one
    ``Assistant: <response>`` line per response of the same intent.

    Args:
        intents_file: Path of the JSON file to read.

    Returns:
        A single string with all lines concatenated, each ending in ``\\n``.

    Raises:
        KeyError: If ``"intents"``, ``"patterns"`` or ``"responses"`` is missing.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's default
    # encoding, which would break on non-ASCII text on some systems.
    with open(intents_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    lines = []
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            lines.append(f"User: {pattern}\n")
            lines.extend(
                f"Assistant: {response}\n" for response in intent["responses"]
            )

    return "".join(lines)

def save_preprocessed_data(preprocessed_data, output_file):
    """Write the preprocessed text to ``output_file``, replacing any old content.

    Args:
        preprocessed_data: The text to store.
        output_file: Destination path; the file is created or overwritten.
    """
    # Write as UTF-8 explicitly so the file does not depend on the platform's
    # default encoding.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(preprocessed_data)



Preprocess data:

In [None]:
# Input: the intents JSON file; output: the flattened plain-text training file.
intents_file = "intents.json"
output_file = "mental_health_data.txt"

# Convert the intents into "User:/Assistant:" lines and store them on disk.
preprocessed_data = preprocess_intents_json(intents_file)
save_preprocessed_data(preprocessed_data, output_file)

Time to finetune the model. As we mentioned before, finetuning requires loading the two parts separately: the model and the tokenizer.

In [None]:
#import required libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments



In Python, clean code is code that speaks for itself.

To keep our code clean, we split each "functional part" into a separate function:

each part that does one thing goes into its own function, which we then call.
Here's what we did for the finetuning function:

In [None]:


def fine_tune_gpt2(
    model_name,
    train_file,
    output_dir,
    num_train_epochs=5,
    per_device_train_batch_size=4,
    block_size=128,
    save_steps=10_000,
    save_total_limit=2,
):
    """Fine-tune a GPT-2 checkpoint on a plain-text file and save the result.

    Args:
        model_name: Name or path passed to ``from_pretrained`` for both the
            model and the tokenizer.
        train_file: Path of the text file used as training data.
        output_dir: Directory receiving the training output as well as the
            final model and tokenizer files. It is overwritten
            (``overwrite_output_dir=True``).
        num_train_epochs: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        block_size: Forwarded to ``TextDataset`` as ``block_size``.
        save_steps: Forwarded to ``TrainingArguments``.
        save_total_limit: Forwarded to ``TrainingArguments``.

    Returns:
        None. The fine-tuned model and tokenizer are written to ``output_dir``.
    """
    # Load GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Load training dataset
    # NOTE(review): TextDataset is reported as deprecated by recent
    # transformers releases; confirm against the installed version.
    train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path=train_file,
        block_size=block_size,
    )

    # Create data collator for language modeling
    # (mlm=False: masked-language-model masking is switched off).
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        save_steps=save_steps,
        save_total_limit=save_total_limit,
    )

    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()

    # Save the fine-tuned model together with its tokenizer so that both can
    # be reloaded from the same directory later.
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)






Time to finetune!

In [None]:
# Fine-tune the "gpt2" checkpoint on the preprocessed text file; the model and
# tokenizer are saved to the "output" directory.
fine_tune_gpt2("gpt2", "mental_health_data.txt", "output")




Step,Training Loss


In [None]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer



def load_model(model_path):
    """Return the GPT-2 LM-head model loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the GPT-2 tokenizer loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)


def generate_text(sequence, max_length, model_path="output"):
    """Print a sampled continuation of ``sequence`` from a saved model.

    NOTE: this redefines the ``generate_text`` helper declared earlier in the
    notebook (which took ``prompt, max_length, temperature`` and returned the
    text); after this cell runs, the name refers to this version.

    Args:
        sequence: Prompt text; it is converted to ``str`` before tokenizing.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        model_path: Directory holding the saved model and tokenizer.
            Defaults to ``"output"``.

    Returns:
        None. The decoded text is printed.
    """
    # The model and tokenizer are reloaded from disk on every call.
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    # Calling the tokenizer (instead of .encode) also yields the attention
    # mask, which is passed on to generate() below.
    encoded = tokenizer(str(sequence), return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=True))




In [None]:

# To try your own prompt interactively, replace the literal below with input().
# The "User: " prefix matches the line format of the finetuning data.
sequence = "User: I feel overwhelmed today"
max_len = 30
generate_text(sequence, max_len) # prints the decoded generation