In [2]:
from datasets import Dataset, load_dataset

In [53]:
import json
def preprocess_intents_json(intents_file):
    with open(intents_file, "r") as f:
        data = json.load(f)
    
    preprocessed_data = []
    
    for intent in data["intents"]:
        for pattern in intent["patterns"]:
            preprocessed_data.append(f"User: {pattern}\n")
            for response in intent["responses"]:
                preprocessed_data.append(f"Assistant: {response}\n")
    
    return "".join(preprocessed_data)

def save_preprocessed_data(preprocessed_data, output_file):
    with open(output_file, "w") as f:
        f.write(preprocessed_data)

intents_file = "intents.json"
output_file = "mental_health_data.txt"

preprocessed_data = preprocess_intents_json(intents_file)
save_preprocessed_data(preprocessed_data, output_file)

In [3]:
from transformers import TextDataset, GPT2Tokenizer

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

train_dataset = TextDataset(
        tokenizer=tokenizer,
        file_path="mental_health_data.txt",
        block_size=128)
len(train_dataset.examples[0])
# for example in train_dataset.examples:
#         print(tokenizer.convert_ids_to_tokens(example, skip_special_tokens=True))

2023-10-05 00:07:39.002666: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9231] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-10-05 00:07:39.002690: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-10-05 00:07:39.002718: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1516] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-10-05 00:07:39.009567: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


128

In [4]:
def preprocess_data(dataset: Dataset)-> str:
    preprocessed_data = [f"[function]: {i['function']} \n[docstring]: {i['docstring']}" for i in dataset]
    return " ".join(preprocessed_data)
    
def tokenize_data(text: str) -> list[int]:
    block_size = 128
    tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    examples = []
    for i in range(0, len(tokenized_text) - block_size + 1, block_size):  # Truncate in block of block_size
        examples.append(
            tokenizer.build_inputs_with_special_tokens(tokenized_text[i : i + block_size])
        )
    return examples

In [5]:
dataset = load_dataset("juraj-juraj/doc_gen")

In [7]:
sentences = preprocess_data(dataset["test"])
tokenized_sentences = tokenize_data(sentences)

dataset["test"][0]["function"]


'def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo=\'ball_tree\', verbose=False):\n    \n    X = []\n    y = []\n\n    # Loop through each person in the training set\n    for class_dir in os.listdir(train_dir):\n        if not os.path.isdir(os.path.join(train_dir, class_dir)):\n            continue\n\n        # Loop through each training image for the current person\n        for img_path in image_files_in_folder(os.path.join(train_dir, class_dir)):\n            image = face_recognition.load_image_file(img_path)\n            face_bounding_boxes = face_recognition.face_locations(image)\n\n            if len(face_bounding_boxes) != 1:\n                # If there are no people (or too many people) in a training image, skip the image.\n                if verbose:\n                    print("Image {} not suitable for training: {}".format(img_path, "Didn\'t find a face" if len(face_bounding_boxes) < 1 else "Found more than one face"))\n            else:\n                # 

In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [9]:
def fine_tune_gpt2(model_name, train_dataset, output_dir):
    # Load GPT-2 model and tokenizer
    model = GPT2LMHeadModel.from_pretrained(model_name)
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load training dataset
    # train_dataset = TextDataset(
    #     tokenizer=tokenizer,
    #     file_path=train_file,
    #     block_size=128)
    # Create data collator for language modeling
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False)
    # Set training arguments
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=5,
        per_device_train_batch_size=4,
        save_steps=10_000,
        save_total_limit=2,
    )
    # Train the model
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    trainer.train()
    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

In [15]:
fine_tune_gpt2("gpt2", tokenized_sentences, "code_gpt")

OutOfMemoryError: CUDA out of memory. Tried to allocate 148.00 MiB (GPU 0; 5.79 GiB total capacity; 4.99 GiB already allocated; 52.62 MiB free; 5.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 

In [11]:
model_name = "./code_gpt/"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name) 

prompt = f"[function]: {dataset['test'][0]['function']}"

inputs = tokenizer(prompt, return_tensors="pt")
output_sequences = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=50,
)
print(tokenizer.decode(output_sequences[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[function]: def train(train_dir, model_save_path=None, n_neighbors=None, knn_algo='ball_tree', verbose=False):
    
    X = []
    y = []

    # Loop through each person in the training set
    for class_dir in os.listdir(train_dir):
        if not os.path.isdir(os.path.join(train_dir, class_dir)):
            continue

        # Loop through each training image for the current person
        for img_path in image_files_in_folder(os.path.join(train_dir, class_dir)):
            image = face_recognition.load_image_file(img_path)
            face_bounding_boxes = face_recognition.face_locations(image)

            if len(face_bounding_boxes)!= 1:
                # If there are no people (or too many people) in a training image, skip the image.
                if verbose:
                    print("Image {} not suitable for training: {}".format(img_path, "Didn't find a face" if len(face_bounding_boxes) < 1 else "Found more than one face"))
            else:
                # Add face enco