In [None]:
!pip install -U accelerate
!pip install -U transformers
!pip install datasets

Collecting accelerate
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/314.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━[0m [32m235.5/314.1 kB[0m [31m6.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [None]:
import re
import time

import pandas as pd
import torch
from datasets import Dataset
from transformers import GPTNeoForCausalLM, GPT2Tokenizer, Trainer, TrainingArguments

# Function to parse WhatsApp chat data
def parse_whatsapp_chat(chat_file):
    """Parse an exported WhatsApp chat text file into a list of message dicts.

    A message starts on a line shaped like ``M/D/YY, H:MM AM - Sender: text``
    (2- to 4-digit years and a missing AM/PM marker are accepted as well).
    Lines that do not start with such a timestamp are treated as continuation
    lines of a multi-line message and are appended, newline-separated, to the
    message they follow instead of being dropped.

    Timestamped lines without a ``Sender: `` part are skipped, and so are any
    continuation lines that follow them.

    Args:
        chat_file: Path to the UTF-8 encoded chat export.

    Returns:
        A list of dicts with the keys 'date', 'time', 'sender' and 'message'
        (all strings), in file order.
    """
    # Compile once instead of once per line; both patterns share the prefix.
    stamp = r'(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}(?:\s?[APMapm]{2})?) - '
    stamp_re = re.compile(stamp)
    message_re = re.compile(stamp + r'(.*?): (.*)')

    messages = []
    current = None  # message dict that continuation lines get attached to
    with open(chat_file, 'r', encoding='utf-8') as file:
        for line in file:  # stream the file rather than loading every line
            match = message_re.match(line)
            if match:
                # `clock` rather than `time`, so the time module is not shadowed.
                date, clock, sender, message = match.groups()
                current = {
                    'date': date,
                    'time': clock,
                    'sender': sender,
                    'message': message,
                }
                messages.append(current)
            elif stamp_re.match(line):
                # Timestamped but sender-less line: not a user message, and
                # whatever follows it must not be glued to the previous one.
                current = None
            elif current is not None:
                current['message'] += '\n' + line.rstrip('\n')

    # Blank lines between messages would otherwise leave trailing newlines.
    for entry in messages:
        entry['message'] = entry['message'].rstrip('\n')
    return messages

# Load and parse chat file
chat_file = '/content/WhatsApp Chat with Dilluu.txt'
messages = parse_whatsapp_chat(chat_file)

# Keep only the friend's messages; they form the fine-tuning corpus.
friend_name = 'Dilluu'  # Replace with your friend's name as it appears in the chat
friend_messages = [msg['message'] for msg in messages if msg['sender'] == friend_name]

# Fail fast when the name does not match any sender: an empty dataset would
# otherwise only surface later as an obscure tokenization/training error.
if not friend_messages:
    senders_found = sorted({msg['sender'] for msg in messages})
    raise ValueError(
        f"No messages from {friend_name!r} found in {chat_file!r}. "
        f"Senders present in the export: {senders_found}"
    )

# One row per message, in a single 'text' column.
df = pd.DataFrame(friend_messages, columns=['text'])
dataset = Dataset.from_pandas(df)

# Load pre-trained model and tokenizer
model_name = "EleutherAI/gpt-neo-125M"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name)

# Reuse the EOS token for padding; the tokenizer is later called with
# padding="max_length", which needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of messages for causal-LM fine-tuning.

    Each message gets the EOS token appended (so that the end of a message is
    part of the training signal), is truncated/padded to 128 tokens, and
    receives a ``labels`` list that mirrors ``input_ids`` except that padding
    positions are set to -100, the index the model's loss ignores.

    Args:
        examples: Batch mapping whose 'text' entry is a list of strings.

    Returns:
        The tokenizer output for the batch with an added 'labels' entry
        (plain Python lists, which is what ``Dataset.map`` stores).
    """
    texts = [text + tokenizer.eos_token for text in examples['text']]
    inputs = tokenizer(texts, truncation=True, padding="max_length", max_length=128)

    # pad_token is the same token as EOS, so padding cannot be recognised by
    # token id; the attention mask is what separates real tokens (1) from
    # padding (0). Copying input_ids verbatim would train on the padding.
    inputs['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(inputs['input_ids'], inputs['attention_mask'])
    ]
    return inputs


tokenized_dataset = dataset.map(tokenize_function, batched=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/526M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

Map:   0%|          | 0/5924 [00:00<?, ? examples/s]

In [None]:
# Set training arguments
# Mixed precision is only enabled when a CUDA GPU is present, so the cell also
# runs (slowly, in full precision) on a CPU-only runtime.
use_fp16 = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    fp16=use_fp16,  # Enable mixed precision training when supported
    logging_dir='./logs',  # Directory for storing logs
    logging_steps=500,
)

# Create Trainer and train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training and measure time (perf_counter is monotonic, unlike time.time)
start_time = time.perf_counter()
trainer.train()
end_time = time.perf_counter()

training_time = end_time - start_time
print(f"Training completed in {training_time / 60:.2f} minutes")

# A short run can finish before the first `save_steps` checkpoint is due, so
# persist the fine-tuned weights and tokenizer explicitly; otherwise the
# result only lives in kernel memory and is lost on restart.
trainer.save_model()
tokenizer.save_pretrained(training_args.output_dir)


Step,Training Loss
500,0.3754
1000,0.1867
1500,0.178


Step,Training Loss
500,0.3754
1000,0.1867
1500,0.178
2000,0.1425
2500,0.1355
3000,0.1369
3500,0.1129
4000,0.1072


Training completed in 9.44 minutes


In [None]:
import torch

In [None]:

def generate_styled_response(prompt, model, tokenizer, max_length=150, temperature=0.7, top_k=50, top_p=0.9):
    """Continue `prompt` with `model` and return the decoded text.

    Args:
        prompt: Text to continue.
        model: Model exposing ``generate``; if it has a ``device`` attribute
            the encoded prompt is moved there before generation.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Forwarded to ``generate`` as ``max_length``.
        temperature: Sampling temperature. ``None`` or a value <= 0 selects
            greedy decoding, in which case ``top_k``/``top_p`` are not used.
        top_k: Top-k sampling cut-off (sampling mode only).
        top_p: Nucleus sampling threshold (sampling mode only).

    Returns:
        The first generated sequence decoded with special tokens skipped.
        The prompt is not stripped from the returned text.
    """
    encoded = tokenizer(prompt, return_tensors='pt')

    # Follow the model's device instead of assuming a GPU is present.
    device = getattr(model, 'device', None)
    if device is not None:
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    generation_kwargs = {
        'max_length': max_length,
        'pad_token_id': tokenizer.eos_token_id,
        'eos_token_id': tokenizer.eos_token_id,
    }
    # temperature/top_k/top_p only take effect when sampling is switched on;
    # without do_sample=True generation is greedy and they are ignored.
    use_sampling = temperature is not None and temperature > 0
    generation_kwargs['do_sample'] = use_sampling
    if use_sampling:
        generation_kwargs.update(temperature=temperature, top_k=top_k, top_p=top_p)

    # Passing the whole encoding also hands the attention mask to generate().
    outputs = model.generate(**encoded, **generation_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example prompt

# Define generation parameters once; they do not change between prompts.
max_length = 400
temperature = 0.1
top_k = 50
top_p = 0.3

# Keep answering prompts for as long as the user enters 0 afterwards.
x = 0
while x == 0:
    try:
        prompt = input("enter the prompttt: ")
        response = generate_styled_response(prompt, model, tokenizer, max_length=max_length, temperature=temperature, top_k=top_k, top_p=top_p)
        print(response)
        answer = input("enter 0 to coninue: ")
    except (KeyboardInterrupt, EOFError):
        # Interrupting the cell (or a closed stdin) ends the session cleanly
        # instead of leaving a traceback in the notebook output.
        break

    try:
        x = int(answer)
    except ValueError:
        # Anything that is not an integer (e.g. just pressing Enter) means stop.
        break


enter the prompttt: hi




hi chudhamm
enter 0 to coninue: 0
enter the prompttt: em chesthunnav
em chesthunnaviki vellam
enter 0 to coninue: 0
enter the prompttt: chaduthunnavaa sweetu
chaduthunnavaa sweetu
enter 0 to coninue: 0
enter the prompttt: good morningg
good morningg kaushuuu💗💗
enter 0 to coninue: 0
enter the prompttt: kaushuu 
kaushuu 
enter 0 to coninue: 0
enter the prompttt: sleeping i am
sleeping i amee
enter 0 to coninue: 0
enter the prompttt: byee ma
byee ma
enter 0 to coninue: 0
enter the prompttt: goood nightt
goood nightt😍😍


KeyboardInterrupt: Interrupted by user