In [11]:
!pip install wandb
!pip install transformers[torch] accelerate -U

import warnings

import matplotlib.pyplot as plt
import pandas as pd
import wandb
from google.colab import files
from sklearn.model_selection import train_test_split
from transformers import (TextDataset, DataCollatorForLanguageModeling,GPT2Tokenizer,
                          GPT2LMHeadModel,Trainer, TrainingArguments)

wandb.init(mode="disabled")
warnings.filterwarnings('ignore')



In [12]:
# Read the uploaded question/answer CSV into a DataFrame.
file_name = '/content/peaceandme.csv'
dataset = pd.read_csv(filepath_or_buffer=file_name)

# Preview the first five rows as plain text.
print(dataset.head(5))

                                             Context  \
0  I'm going through some things with my feelings...   
1  I'm going through some things with my feelings...   
2  I'm going through some things with my feelings...   
3  I'm facing severe depression and anxiety and I...   
4  I'm facing severe depression and anxiety and I...   

                                            Response  
0  It sounds like you're going through a tough ti...  
1  Feeling worthless can be overwhelming, but it'...  
2  It's courageous to recognize the need for chan...  
3  I'm sorry to hear that you're going through se...  
4  Dealing with overwhelming feelings like severe...  


In [13]:
# Print the column names, non-null counts and dtypes of the loaded DataFrame.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203 entries, 0 to 202
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Context   203 non-null    object
 1   Response  203 non-null    object
dtypes: object(2)
memory usage: 3.3+ KB


In [14]:
# Hold out 20% of the rows for testing, then carve 10% of the remainder off
# for validation (roughly 72% / 8% / 20% of the full dataset overall).
# train_test_split is imported in the notebook's top import cell.
train_df, test_df = train_test_split(dataset, test_size=0.2, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)

# Every row must land in exactly one split.
assert len(train_df) + len(val_df) + len(test_df) == len(dataset)

splits = (("Training", train_df), ("Validation", val_df), ("Test", test_df))

# Display the sizes of each set
for split_name, split_df in splits:
    print(f'{split_name} set size: {len(split_df)}')

# Display the first few rows of each set
for split_name, split_df in splits:
    print(f'{split_name} set:')
    print(split_df.head())


Training set size: 145
Validation set size: 17
Test set size: 41
Training set:
                                               Context  \
49   I need help knowing how to deal with stress. W...   
105  Now I have anger and trust issues. How can I t...   
142  I'm struggling with time management. How can I...   
125  How can I find a mental health professional fo...   
175                                             Thanks   

                                              Response  
49   Stress management involves adopting healthy ha...  
105  Understanding the source of your anger and tru...  
142  Time management can be improved with tools lik...  
125  If you prefer face-to-face sessions or are see...  
175                                        My pleasure  
Validation set:
                                               Context  \
14   I just don't know what I want in life anymore....   
91   I need help dealing with stress. How can I han...   
58   I've been having horrible anxiety f

In [15]:
def load_dataset(file_path, tokenizer, block_size = 1024):
    """Wrap a text file in a ``TextDataset``.

    Args:
        file_path: Path of the file handed to ``TextDataset``.
        tokenizer: Tokenizer handed to ``TextDataset``.
        block_size: Block size forwarded to ``TextDataset`` (default 1024).

    Returns:
        The constructed ``TextDataset`` instance.
    """
    return TextDataset(tokenizer=tokenizer, file_path=file_path, block_size=block_size)

In [16]:
def load_data_collator(tokenizer, mlm = False):
    """Create the language-modeling data collator used for training.

    Args:
        tokenizer: Tokenizer handed to ``DataCollatorForLanguageModeling``.
        mlm: Forwarded as the collator's ``mlm`` flag (default ``False``).

    Returns:
        The constructed ``DataCollatorForLanguageModeling`` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

In [17]:
def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs,
          learning_rate=1e-4, block_size=1024):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file and save it.

    Args:
        train_file_path: Path of the text file used to build the training set.
        model_name: Name or path passed to ``from_pretrained`` for both the
            tokenizer and the model.
        output_dir: Directory receiving the tokenizer, checkpoints and the
            final model.
        overwrite_output_dir: Forwarded to ``TrainingArguments``.
        per_device_train_batch_size: Forwarded to ``TrainingArguments``.
        num_train_epochs: Forwarded to ``TrainingArguments``.
        learning_rate: Forwarded to ``TrainingArguments`` (default 1e-4, the
            value that used to be hard-coded).
        block_size: Block size used when building the dataset (default 1024,
            matching ``load_dataset``'s own default).

    Returns:
        A ``(trainer, train_output)`` tuple, where ``train_output`` is the
        value returned by ``trainer.train()``.

    Raises:
        ValueError: If no training examples could be built from the file.
    """
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)

    # Build the dataset first and fail fast: an empty dataset would otherwise
    # only surface as an obscure error (or a no-op run) inside the Trainer.
    train_dataset = load_dataset(train_file_path, tokenizer, block_size=block_size)
    if len(train_dataset) == 0:
        raise ValueError(
            f"No training examples were built from {train_file_path!r}; the file "
            f"is probably shorter than one block of {block_size} tokens."
        )

    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir can be reloaded
    # on its own later.
    tokenizer.save_pretrained(output_dir)

    model = GPT2LMHeadModel.from_pretrained(model_name)

    # NOTE(review): this writes the *untuned* weights to output_dir; they are
    # replaced by trainer.save_model() once training completes.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        num_train_epochs=num_train_epochs,
        per_device_train_batch_size=per_device_train_batch_size,
        logging_dir="./logs",
        logging_steps=100,  # Log every 100 steps
        save_steps=500,  # Save checkpoint every 500 steps
        logging_first_step=True,
        save_total_limit=2,
        learning_rate=learning_rate,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
    )
    hist = trainer.train()
    trainer.save_model()
    return trainer, hist

In [18]:
# Fine-tune on the training split only, so the validation and test rows created
# by the split cell stay unseen by the model. Previously this pointed at the
# full uploaded CSV, which made the train/validation/test split a no-op.
train_file_path = '/content/peaceandme_train.csv'
train_df.to_csv(train_file_path, index=False)  # header + Context/Response columns, no index

model_name = 'gpt2'
output_dir = '/content/custom_model'  # Specify your desired output directory
overwrite_output_dir = True
per_device_train_batch_size = 2
num_train_epochs = 100

In [19]:
# Load the pretrained tokenizer for inspection only: train() builds its own
# tokenizer from model_name, so this global is not the one used for training.
# The bare last expression makes the notebook display the tokenizer's repr.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [20]:
# Load the pretrained model for inspection only: train() loads its own copy
# from model_name. The bare last expression displays the module structure.
model = GPT2LMHeadModel.from_pretrained(model_name)
model

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [21]:
# Display the underlying GPT2Model (the network without the lm_head layer).
model.base_model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0-11): 12 x GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (act): NewGELUActivation()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

In [None]:
# Train
hist=train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
)

In [None]:
# hist[0] is the Trainer returned by train(); its train_dataset holds the
# examples actually used. (len(train_file_path) only measured the path string.)
print(f"Number of training examples: {len(hist[0].train_dataset)}")

In [None]:
# Summarise the finished run. hist[1] is the value returned by trainer.train();
# the cell reads its global_step attribute and entries of its metrics dict.
print("Global Step:", hist[1].global_step)
for label, metric_key in (
    ("Epoch:", 'epoch'),
    ("Train Runtime:", 'train_runtime'),
    ("Train Samples Per Second:", 'train_samples_per_second'),
    ("Train Steps Per Second:", 'train_steps_per_second'),
    ("Total FLOPS:", 'total_flos'),
    ("Train Loss:", 'train_loss'),
):
    print(label, hist[1].metrics[metric_key])

In [None]:
def load_model(model_path):
    """Return the ``GPT2LMHeadModel`` loaded from ``model_path``."""
    return GPT2LMHeadModel.from_pretrained(model_path)


def load_tokenizer(tokenizer_path):
    """Return the ``GPT2Tokenizer`` loaded from ``tokenizer_path``."""
    return GPT2Tokenizer.from_pretrained(tokenizer_path)

def generate_text(model_path, sequence, max_length):
    """Sample a continuation of ``sequence`` from the model saved at ``model_path``.

    Args:
        model_path: Directory holding both the saved model and its tokenizer.
        sequence: Prompt; it is converted to ``str`` before tokenizing. An
            empty prompt falls back to the tokenizer's BOS token so that
            generation still has a token to start from.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (in the transformers API this counts the prompt tokens too).

    Returns:
        The decoded text of the first generated sequence, with special tokens
        skipped.
    """
    # NOTE: model and tokenizer are reloaded from disk on every call.
    model = load_model(model_path)
    tokenizer = load_tokenizer(model_path)

    prompt = f'{sequence}'
    if not prompt:
        # An empty string encodes to zero tokens, which generate() cannot extend.
        prompt = tokenizer.bos_token

    encoded = tokenizer(prompt, return_tensors='pt')
    final_outputs = model.generate(
        encoded['input_ids'],
        # Passed explicitly: the pad token is set to the EOS token below, so
        # padding cannot be inferred from the ids alone.
        attention_mask=encoded['attention_mask'],
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=50,
        top_p=0.95,
    )
    return tokenizer.decode(final_outputs[0], skip_special_tokens=True)

In [None]:
# Spot check on dataset row 100: print the question, the reference answer and
# one sampled generation from the fine-tuned model.
model_path = "/content/custom_model"
max_len = 100
sequence = dataset.iloc[100]['Context']
print("Q : ", sequence, end="\n\n")
print("A : ", dataset.iloc[100]['Response'], end="\n\n")
print("G : ", generate_text(model_path, sequence, max_len))

In [None]:
# Spot check on dataset row 50 with a longer generation budget.
max_len = 200
sequence = dataset.iloc[50]['Context']
print("Q : ", sequence, end="\n\n")
print("A : ", dataset.iloc[50]['Response'], end="\n\n")
print("G : ", generate_text(model_path, sequence, max_len))

In [None]:
# Spot check on dataset row 150 with a longer generation budget.
max_len = 200
sequence = dataset.iloc[150]['Context']
print("Q : ", sequence, end="\n\n")
print("A : ", dataset.iloc[150]['Response'], end="\n\n")
print("G : ", generate_text(model_path, sequence, max_len))

In [None]:
# Free-form prompt that is not taken from the dataset.
max_len = 500
sequence = "I'm sad"
print("Q : ", sequence, end="\n\n")
print("G : ", generate_text(model_path, sequence, max_len))

In [None]:
# Interactive prompt: read a sequence from the user and generate a reply.
max_len = 500
sequence = input("Enter the sequence: ")
print("Q : ", sequence, end="\n\n")
print("G : ", generate_text(model_path, sequence, max_len))

In [None]:
# Second interactive prompt (same flow as the previous cell), kept so another
# sequence can be tried without re-running the earlier one.
max_len = 500
sequence = input("Enter the sequence: ")
print("Q : ", sequence, end="\n\n")
print("G : ", generate_text(model_path, sequence, max_len))