In [1]:
import os
import re
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import matplotlib.pyplot as plt

# File path
DATASET_PATH = r"/content/The Project Gutenberg eBook of Indi.txt"


# Load dataset
def load_dataset(path):
    """Return the full contents of the UTF-8 text file at ``path`` as one string.

    Args:
        path: Location of the text file to read.

    Returns:
        The decoded file contents.
    """
    with open(path, mode='r', encoding='utf-8') as handle:
        return handle.read()


# Preprocess dataset
def preprocess_text(text):
    """Normalise ``text`` for vectorisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted, and each run of whitespace is collapsed into a
    single space.

    Args:
        text: Raw input string.

    Returns:
        The normalised string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    return re.sub(r'\s+', ' ', without_punctuation)


# Create vector space model
def create_vector_space_model(corpus):
    """Fit a TF-IDF model on ``corpus`` with English stop words removed.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A ``(vectorizer, tfidf_matrix)`` tuple: the fitted ``TfidfVectorizer``
        and the matrix it produced for ``corpus``.
    """
    fitted_vectorizer = TfidfVectorizer(stop_words='english')
    document_matrix = fitted_vectorizer.fit_transform(corpus)
    return fitted_vectorizer, document_matrix


# Load pre-trained GPT2 model and tokenizer
def load_gpt2_model():
    """Load the pre-trained ``"gpt2"`` tokenizer and language-model head.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    checkpoint = "gpt2"
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint)
    return gpt2_tokenizer, gpt2_model


# Generate response with GPT-2
def generate_response_gpt2(prompt, tokenizer, model, max_length=100):
    """Generate a continuation of ``prompt`` and return it as text.

    Args:
        prompt: Text the model is conditioned on.
        tokenizer: Object providing ``encode``, ``decode`` and ``eos_token_id``.
        model: Object providing ``generate``.
        max_length: Forwarded unchanged as ``max_length`` to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # Name the padding token explicitly (EOS), matching the other
        # generation helper in this file, instead of relying on the
        # library's implicit fallback.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


# Query handling
def handle_query(query, vectorizer, tfidf_matrix, corpus, tokenizer, model, threshold=0.3):
    """Answer ``query`` from the corpus when possible, otherwise via GPT-2.

    The query is vectorised and compared against every row of ``tfidf_matrix``
    by cosine similarity. If the best score reaches ``threshold`` the matching
    corpus entry is returned; otherwise a fallback message that embeds a GPT-2
    generation for the query is returned.

    Args:
        query: The user's question.
        vectorizer: Fitted vectorizer used to transform the query.
        tfidf_matrix: Matrix of corpus vectors, row-aligned with ``corpus``.
        corpus: Sequence of documents indexed like ``tfidf_matrix`` rows.
        tokenizer: Tokenizer handed to ``generate_response_gpt2``.
        model: Model handed to ``generate_response_gpt2``.
        threshold: Minimum similarity for a corpus answer.

    Returns:
        The response string.
    """
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix).flatten()
    top_idx = np.argmax(scores)

    # Good enough retrieval hit: answer straight from the corpus.
    if scores[top_idx] >= threshold:
        return corpus[top_idx]

    # Otherwise the generator is only invoked on this fallback path.
    return f"Answer for this question not present in the dataset. But this might be related: {generate_response_gpt2(query, tokenizer, model)}"


# Evaluate chatbot
def evaluate_chatbot(test_queries, vectorizer, tfidf_matrix, corpus, tokenizer, model):
    """Run every test query through ``handle_query`` and print the Q/A pairs.

    All responses are computed first; the header and the numbered pairs are
    printed afterwards.

    Args:
        test_queries: Indexable sequence of query strings.
        vectorizer, tfidf_matrix, corpus, tokenizer, model: Forwarded
            unchanged to ``handle_query``.
    """
    def answer(query):
        return handle_query(query, vectorizer, tfidf_matrix, corpus, tokenizer, model)

    responses = list(map(answer, test_queries))

    print("\nChatbot Responses:")
    for i, response in enumerate(responses):
        print(f"Q{i + 1}: {test_queries[i]}\nA{i + 1}: {response}\n")


# Visualization
def visualize_evaluation(similarities, labels):
    """Show a bar chart with one bar per similarity score.

    Args:
        similarities: Sequence of scores, one per query.
        labels: Tick labels for the bars, in the same order.
    """
    bar_positions = range(len(similarities))

    plt.figure(figsize=(8, 6))
    plt.bar(bar_positions, similarities, tick_label=labels)
    plt.title("Similarity of Responses to Queries")
    plt.xlabel("Query")
    plt.ylabel("Cosine Similarity")
    plt.show()


# Main function
def main():
    """Run the interactive retrieval chatbot.

    Loads the dataset at ``DATASET_PATH``, splits it into paragraphs on blank
    lines, preprocesses each non-empty paragraph, fits the TF-IDF model, loads
    GPT-2 and then answers questions read from standard input until the user
    types ``exit`` or ``quit``. Blank input lines are ignored.
    """
    print("Loading dataset...")
    dataset = load_dataset(DATASET_PATH)
    corpus = dataset.split('\n\n')  # Split into paragraphs
    corpus = [preprocess_text(para) for para in corpus if para.strip()]

    print("Creating vector space model...")
    vectorizer, tfidf_matrix = create_vector_space_model(corpus)

    print("Loading GPT-2 model...")
    tokenizer, model = load_gpt2_model()

    print("Chatbot ready! Ask your questions.")
    while True:
        # Strip so that " exit " still quits and whitespace-only lines count
        # as blank.
        query = input("You: ").strip()
        if not query:
            # A blank query matches nothing in the corpus and would hand
            # GPT-2 an empty prompt, so just ask again.
            continue
        if query.lower() in ('exit', 'quit'):
            print("Exiting chatbot. Goodbye!")
            break
        response = handle_query(query, vectorizer, tfidf_matrix, corpus, tokenizer, model)
        print(f"Bot: {response}")


# Start the chatbot only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()



Loading dataset...


FileNotFoundError: [Errno 2] No such file or directory: '/content/The Project Gutenberg eBook of Indi.txt'

In [None]:
import re
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction



# Step 1: Preprocess Dataset
def preprocess_and_split_dataset(file_path):
    """Read a Project Gutenberg text file and return its body as paragraphs.

    Everything up to and including the first ``*** START OF ... ***`` marker
    and everything from the first ``*** END OF ... ***`` marker onward is
    discarded; if a marker is absent, nothing is removed on that side. The
    remaining text is split on runs of two or more newlines.

    Args:
        file_path: Path to the UTF-8 text file.

    Returns:
        List of non-empty paragraphs with surrounding whitespace stripped.
    """
    with open(file_path, 'r', encoding='utf-8') as source:
        raw = source.read()

    # Drop the header first, then the trailer.
    after_header = re.split(r'\*\*\* START OF.*?\*\*\*', raw, maxsplit=1)[-1]
    body = re.split(r'\*\*\* END OF.*?\*\*\*', after_header, maxsplit=1)[0]

    paragraphs = []
    for chunk in re.split(r'\n\n+', body):
        cleaned = chunk.strip()
        if cleaned:
            paragraphs.append(cleaned)
    return paragraphs


# Step 2: Retrieve Relevant Chunks Using Keyword Search
def retrieve_relevant_chunk(question, paragraphs, top_k=3):
    """Return the ``top_k`` paragraphs sharing the most words with ``question``.

    Both the question and each paragraph are lower-cased and reduced to their
    set of ``\\w+`` tokens, so punctuation attached to a word ("india?",
    "India,") does not prevent a match. A paragraph's score is the number of
    distinct tokens it shares with the question.

    Args:
        question: The user's question.
        paragraphs: Iterable of paragraph strings to search.
        top_k: Maximum number of paragraphs to return.

    Returns:
        Up to ``top_k`` paragraphs, highest score first. Paragraphs with equal
        scores keep their original order. Paragraphs scoring zero are still
        returned when fewer than ``top_k`` paragraphs match.
    """
    def tokenize(text):
        # Splitting on whitespace alone would leave "india?" != "india".
        return set(re.findall(r"\w+", text.lower()))

    question_words = tokenize(question)
    scored_paragraphs = [
        (para, len(question_words & tokenize(para))) for para in paragraphs
    ]

    # The sort is stable, so ties stay in document order.
    scored_paragraphs.sort(key=lambda item: item[1], reverse=True)
    return [para for para, _ in scored_paragraphs[:top_k]]


# Step 3: Load GPT Model for Text Generation
def load_gpt_model():
    """Load the ``"gpt2"`` causal language model and its tokenizer.

    The model is switched to evaluation mode before being returned.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    print("Loading GPT model...")
    checkpoint = "gpt2"

    causal_lm = AutoModelForCausalLM.from_pretrained(checkpoint)
    text_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    causal_lm.eval()  # Set to evaluation mode
    return causal_lm, text_tokenizer


# Step 4: Generate Response Using GPT
def generate_response_gpt(question, context, model, tokenizer):
    """Generate an answer to ``question`` grounded in ``context``.

    The context is limited to its first 500 whitespace-separated words and is
    shortened further, if needed, until the encoded prompt leaves room for the
    200 generated tokens inside the 1024-token window. Only the context is
    shortened, so the trailing "Question: ... Answer:" cue is kept.

    Args:
        question: The user's question.
        context: Retrieved text to condition the answer on.
        model: Object providing ``generate``.
        tokenizer: Object providing ``encode``, ``decode`` and ``eos_token_id``.

    Returns:
        The decoded newly generated tokens (the prompt is not included),
        stripped of surrounding whitespace.
    """
    max_positions = 1024  # Prompt plus generated tokens must fit in here.
    max_new_tokens = 200
    prompt_budget = max_positions - max_new_tokens

    words = context.split()[:500]
    initial_word_count = len(words)

    while True:
        prompt = (
            f"You are a knowledgeable assistant. Use the context below to answer the question.\n\n"
            f"Context: {' '.join(words)}\n\nQuestion: {question}\nAnswer:"
        )
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        prompt_len = input_ids.shape[1]
        if prompt_len <= prompt_budget or not words:
            break
        # Shrink roughly in proportion to the overshoot, always dropping at
        # least one word so the loop terminates.
        keep = min(len(words) - 1, len(words) * prompt_budget // prompt_len)
        words = words[:keep]

    if len(words) < initial_word_count:
        print("Input is too long, truncating context.")

    if prompt_len > prompt_budget:
        # Even with no context the prompt is too long (very long question):
        # keep the tail so the final "Answer:" cue survives.
        input_ids = input_ids[:, -prompt_budget:]
        prompt_len = input_ids.shape[1]

    # Generate the response
    outputs = model.generate(
        input_ids,
        max_new_tokens=max_new_tokens,  # Number of tokens to generate
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        temperature=0.7,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )

    # The output starts with the prompt tokens; decode only what follows, so
    # an "Answer:" inside the question, context or generated text cannot
    # change which span is returned.
    new_tokens = outputs[0][prompt_len:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()


# Step 5: Evaluate Responses
def evaluate_responses(questions, contexts, generated_responses):
    """Score each generated response against its context with smoothed BLEU.

    Each context, split on whitespace, serves as the single reference for the
    response generated from it; ``SmoothingFunction().method1`` is used for
    smoothing.

    Args:
        questions: Accepted for interface symmetry; not used in the scoring.
        contexts: Context strings, one per response.
        generated_responses: Response strings, paired with ``contexts`` by
            position.

    Returns:
        List of BLEU scores, one per (context, response) pair.
    """
    smoother = SmoothingFunction().method1
    return [
        sentence_bleu([reference.split()], candidate.split(), smoothing_function=smoother)
        for reference, candidate in zip(contexts, generated_responses)
    ]


# Step 6: Visualize Metrics
def visualize_metrics(bleu_scores):
    """Show a line plot of the BLEU scores against their question index.

    Args:
        bleu_scores: Sequence of scores, one per question.
    """
    question_indices = range(len(bleu_scores))

    plt.figure(figsize=(10, 6))
    plt.plot(question_indices, bleu_scores, label='BLEU Score', marker='o')

    plt.xlabel('Question Index')
    plt.ylabel('Score')
    plt.title('Evaluation Metrics')

    plt.legend()
    plt.grid()
    plt.show()


# Main Function
def main():
    """Run the retrieval-plus-generation chatbot, then score the session.

    Splits the dataset into paragraphs, loads the GPT model and answers
    questions read from standard input until ``exit`` or ``quit`` is typed.
    Afterwards the collected responses are scored with BLEU against their
    contexts and the scores are plotted.
    """
    dataset_path = r"/content/The Project Gutenberg eBook of Indi.txt"

    print("Preprocessing and splitting dataset...")
    paragraphs = preprocess_and_split_dataset(dataset_path)
    gpt_model, gpt_tokenizer = load_gpt_model()

    print("\nChatbot is ready! Ask your questions below. Type 'exit' to finish and calculate metrics.")

    # One (question, context, response) record per answered question.
    transcript = []
    stop_commands = ['exit', 'quit']

    question = input("\nYour Question: ")
    while question.lower() not in stop_commands:
        # Retrieve supporting paragraphs and join the top three into one context.
        relevant_chunks = retrieve_relevant_chunk(question, paragraphs)
        context = " ".join(relevant_chunks[:3])
        response = generate_response_gpt(question, context, gpt_model, gpt_tokenizer)

        print(f"\nAnswer: {response}")
        transcript.append((question, context, response))

        question = input("\nYour Question: ")

    questions = [asked for asked, _, _ in transcript]
    contexts = [used for _, used, _ in transcript]
    generated_responses = [given for _, _, given in transcript]

    # Score everything collected during the session.
    print("\nEvaluating responses...")
    bleu_scores = evaluate_responses(questions, contexts, generated_responses)

    # Plot the scores.
    print("\nVisualizing metrics...")
    visualize_metrics(bleu_scores)


# Start the chatbot only when this code is executed directly, not on import.
if __name__ == "__main__":
    main()