In [None]:
!pip install pandas
!pip install pyarrow
!pip install fastparquet

Collecting fastparquet
  Downloading fastparquet-2023.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cramjam, fastparquet
Successfully installed cramjam-2.7.0 fastparquet-2023.10.1


In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Read a Parquet file
path = r'/content/drive/My Drive/Colab Notebooks/NLP/chatbot/train-00000-of-00006-f8cc7498b1ca040f.parquet'
df = pd.read_parquet(path)

In [None]:
df.shape

(249038, 3)

In [None]:
# To take 1/20th of the data randomly
train_df = df.sample(frac=0.20, random_state=42)

In [None]:
test_df = df.sample(frac=0.05, random_state=123)

In [None]:
test_df.shape

(12452, 3)

In [None]:
train_df.shape

(49808, 3)

In [None]:
!pip install nltk



In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import pos_tag, ne_chunk
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
def categorize_questions(questions):
    stop_words = set(stopwords.words('english'))

    # Extracting key nouns from the questions as potential categories
    categories = defaultdict(int)
    for question in questions:
        words = word_tokenize(question)
        words = [word for word in words if word.isalpha() and word not in stop_words]
        tags = pos_tag(words)

        for word, tag in tags:
            if tag in ['NN', 'NNS', 'NNP', 'NNPS']:  # Nouns and proper nouns
                categories[word.lower()] += 1

    # Selecting the most common noun as the category
    if categories:
        most_common_category = max(categories, key=categories.get)
    else:
        most_common_category = "General"

    return most_common_category

In [None]:
import pandas as pd
import json
import re
from collections import defaultdict

# Function to extract questions and answers from text
def extract_qa_pairs(text):
    pairs = text.split("\n\n")
    questions, answers = [], []
    for pair in pairs:
        if "####Human####:" in pair:
            question = pair.replace("####Human####:", "").strip()
            questions.append(question)
        elif "####Assistant####:" in pair:
            answer = pair.replace("####Assistant####:", "").strip()
            answers.append(answer)
    return questions, answers

In [None]:
import json
import pickle
import nltk
import numpy as np
from nltk.stem.lancaster import LancasterStemmer
from scipy.sparse import lil_matrix
from tqdm import tqdm

In [None]:
# Extract the data and save as json
intents = []

for _, row in tqdm(train_df.iterrows(), total=train_df.shape[0], desc="Processing Intents"):
    questions, answers = extract_qa_pairs(row['text'])
    category = categorize_questions(questions)
    intent = {
        "tag": category,
        "patterns": questions,
        "responses": answers,
        "context_set": ""
    }
    intents.append(intent)

# Save to JSON
filename = r'/content/drive/My Drive/Colab Notebooks/NLP/chatbot/train_intent_data.json'

output = {"intents": intents}
with open(filename, 'w') as f:
    json.dump(output, f, indent=4)

Processing Intents: 100%|██████████| 49808/49808 [01:35<00:00, 523.10it/s]


In [None]:
# Extract the test data and save as json
intents = []

for _, row in tqdm(test_df.iterrows(), total=test_df.shape[0], desc="Processing Intents"):
    questions, answers = extract_qa_pairs(row['text'])
    category = categorize_questions(questions)
    intent = {
        "tag": category,
        "patterns": questions,
        "responses": answers,
        "context_set": ""
    }
    intents.append(intent)

# Save to JSON
filename = r'/content/drive/My Drive/Colab Notebooks/NLP/chatbot/test_intent_data.json'

output = {"intents": intents}
with open(filename, 'w') as f:
    json.dump(output, f, indent=4)

Processing Intents: 100%|██████████| 12452/12452 [00:23<00:00, 536.20it/s]


In [None]:
def create_training_data(data, model_filename):

    Stemmer = LancasterStemmer()

    words = {Stemmer.stem(word.lower()) for intent in tqdm(data['intents'], desc="Processing words") for pattern in intent['patterns'] for word in nltk.word_tokenize(pattern) if word != '?'}
    labels = sorted({intent['tag'] for intent in data['intents']})

    for intent in tqdm(data['intents'], desc="Extending labels"):
        if "more" in intent:
            labels.extend(intent['tag'] + '_' + more + '_' + keys for more in intent['more'] for keys in intent['more'][more])

    training_size = sum(len(intent['patterns']) for intent in data['intents'])
    training = lil_matrix((training_size, len(words)), dtype=int)
    output = lil_matrix((training_size, len(labels)), dtype=int)

    row_index = 0
    for intent in tqdm(data['intents'], desc="Creating training data"):
        for pattern in intent['patterns']:
            words_in_pattern = [Stemmer.stem(word) for word in nltk.word_tokenize(pattern)]
            training[row_index, :] = [1 if word in words_in_pattern else 0 for word in words]
            output[row_index, labels.index(intent['tag'])] = 1
            row_index += 1

        if "more" in intent:
            for more in intent['more']:
                for keys in intent['more'][more]:
                    tag = intent['tag'] + '_' + more + '_' + keys
                    doc = ['what', 'is', intent['tag'], more, keys]
                    training[row_index, :] = [1 if Stemmer.stem(word) in doc else 0 for word in words]
                    output[row_index, labels.index(tag)] = 1
                    row_index += 1

    training = training.tocsr()
    output = output.tocsr()

    with open(model_filename, 'wb') as file:
        pickle.dump((sorted(words), labels, training, output), file)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
filename = r'/content/drive/My Drive/Colab Notebooks/NLP/chatbot/train_intent_data.json'

with open(filename) as file:
    data = json.load(file)

In [None]:
# filename = r'/content/drive/My Drive/Colab Notebooks/NLP/chatbot/sub_model_data.pickle'
# create_training_data(data, filename)

In [None]:
!pip install transformers



In [None]:
# Load and Preprocess data for Finetunning
import pickle
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np


In [None]:
# Load dataset
import json
filename = r'/content/drive/My Drive/Colab Notebooks/NLP/chatbot/train_intent_data.json'

with open(filename, 'r') as f:
    data = json.load(f)

In [None]:
len(data['intents'])

49808

In [None]:
# Filter out intents with empty patterns or responses
cleaned_intents = [intent for intent in data["intents"] if intent["patterns"] and intent["responses"]]
data["intents"] = cleaned_intents

# Then use this cleaned data to create your dataset


In [None]:
len(data['intents'])

49807

In [None]:
import random

class ChatDataset(Dataset):
    def __init__(self, tokenizer, data, max_length=512):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = max_length

    def __len__(self):
        return len(self.data["intents"])

    def __getitem__(self, idx):
        intent = self.data["intents"][idx]

        # Check if patterns and responses lists are not empty
        if not intent["patterns"] or not intent["responses"]:
            raise ValueError(f"Empty patterns or responses found in intent at index {idx}")

        pattern = random.choice(intent["patterns"])
        response = random.choice(intent["responses"])

        combined_text = pattern + " " + response
        encoding = self.tokenizer(combined_text, return_tensors='pt', max_length=self.max_length, padding="max_length", truncation=True)

        inputs = encoding.input_ids.squeeze()
        attention_mask = encoding.attention_mask.squeeze()

        return inputs, attention_mask


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
import torch

# Initialize tokenizer and model for a smaller variant of GPT-2
model_name = 'distilgpt2'  # You can choose 'gpt2-small' or 'gpt2-medium'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
model = GPT2LMHeadModel.from_pretrained(model_name)

# Assuming you have a dataset class 'ChatDataset'
dataset = ChatDataset(tokenizer, data)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=-1)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [None]:
# Fine-tuning loop
for epoch in range(5):
    model.train()
    for batch in dataloader:
        try:
            inputs, attention_mask = [x.to(device) for x in batch]
            outputs = model(inputs, labels=inputs, attention_mask=attention_mask)
            loss = outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

        except ValueError as e:
            print(f"Skipping batch due to error: {e}")
            continue

    print(f"Epoch {epoch}, Loss: {loss.item()}")


# Save the fine-tuned model
model.save_pretrained('/content/drive/My Drive/Colab Notebooks/NLP/chatbot/fine_tuned_model')

Epoch 0, Loss: 8.7312650680542
Epoch 1, Loss: 10.383710861206055
Epoch 2, Loss: 9.240330696105957
Epoch 3, Loss: 10.68027114868164
Epoch 4, Loss: 10.184288024902344


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import json

# Load the fine-tuned model and tokenizer

model_name = 'distilgpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set the padding token
model = GPT2LMHeadModel.from_pretrained('/content/drive/My Drive/Colab Notebooks/NLP/chatbot/fine_tuned_model')


# Load your test dataset
with open('/content/drive/My Drive/Colab Notebooks/NLP/chatbot/test_intent_data.json', 'r') as f:
    test_data = json.load(f)

test_dataset = ChatDataset(tokenizer, test_data)
test_loader = DataLoader(test_dataset, batch_size=2)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def evaluate(model, dataloader):
    model.eval()
    total_loss, total_accuracy = 0, 0
    with torch.no_grad():
        for batch in dataloader:
            inputs, attention_mask = [x.to(device) for x in batch]
            outputs = model(inputs, labels=inputs, attention_mask=attention_mask)
            loss = outputs.loss
            total_loss += loss.item()

            # For accuracy, you'll need to modify this part based on how your responses are structured
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            accuracy = (predictions == inputs).float().mean()
            total_accuracy += accuracy.item()

    avg_loss = total_loss / len(dataloader)
    avg_accuracy = total_accuracy / len(dataloader)
    return avg_loss, avg_accuracy

# Evaluate the model
loss, accuracy = evaluate(model, test_loader)
print(f"Perplexity: {torch.exp(torch.tensor(loss))}, Accuracy: {accuracy}")


Perplexity: 18363.35546875, Accuracy: 0.0027654631284131062


In [11]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import json

# Function to generate model responses
def generate_response(model, tokenizer, text, device):
    # Check if the input text is not empty
    if not text.strip():
        return ""  # Return empty string for empty input

    # Encode the input text with left padding
    encoding = tokenizer.encode_plus(
        text,
        return_tensors='pt',
        max_length=50,  # Adjust as needed
        padding="max_length",
        truncation=True
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Check if input_ids is not empty
    if input_ids.size(1) == 0:
        return ""  # Return empty string for empty tensor

    # Generate response using the model
    with torch.no_grad():
        output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=50, num_return_sequences=1)

    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response


# Load the fine-tuned model and tokenizer
model_name = 'distilgpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'left'  # Set padding to the left

model = GPT2LMHeadModel.from_pretrained('/content/drive/My Drive/Colab Notebooks/NLP/chatbot/fine_tuned_model')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load your test dataset
with open('/content/drive/My Drive/Colab Notebooks/NLP/chatbot/test_intent_data.json', 'r') as f:
    test_data = json.load(f)["intents"]

# Initialize BLEU score smoothing function
smoothie = SmoothingFunction().method1

# Evaluate using BLEU and accuracy
total_bleu_score = 0
correct_responses = 0
total_responses = 0

for intent in test_data:
    for pattern in intent["patterns"]:
        generated_response = generate_response(model, tokenizer, pattern, device)
        reference_responses = [response.split() for response in intent["responses"]]
        generated_response_tokens = generated_response.split()

        # Calculate BLEU score with smoothing
        bleu_score = sentence_bleu(reference_responses, generated_response_tokens, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
        total_bleu_score += bleu_score

        # Check for correct response
        if generated_response in intent["responses"]:
            correct_responses += 1
        total_responses += 1

avg_bleu_score = total_bleu_score / total_responses
accuracy = correct_responses / total_responses

print(f"Average BLEU Score: {avg_bleu_score}, Accuracy: {accuracy}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.

Average BLEU Score: 0.020330079378294764, Accuracy: 0.0
