In [None]:
!pip install llama-index-llms-groq
!pip install llama-index
!pip install pdfplumber
!pip install scikit-learn

In [2]:
from llama_index.llms.groq import Groq
import pdfplumber
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np


In [3]:
# Create an API key at the Groq console (https://console.groq.com/keys)
# Then set it to the environment variable GROQ_API_KEY or pass it directly

api_key = "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"  # Replace with your actual API key
llm = Groq(model="llama3-70b-8192", api_key=api_key)


In [4]:
def extract_text_with_pdfplumber(pdf_path):
    text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Example usage
pdf_path = "/content/agenda.pdf"  # Replace with the path to your PDF file
extracted_text = extract_text_with_pdfplumber(pdf_path)
print(extracted_text)


Agenda
Global IndiaAI Summit 2024 3-4 July 2024, New Delhi
Day 1 - Wednesday, July 3
Start Session Description
10:00 - 11:00
Opening Ceremony (Auditorium 1)
11:00 – 13:30
GPAI Session (@ Summit Room) : Executive Council Meeting
(By Invitation only)
11:30 - 13:00 Side Event 1 (@ The session is about learning and sharing knowledge on Large
Auditorium 1): Language Models (LLMs) and Large Multimodal Models (LMMs),
with the aim to understand the unique linguistic and cultural
IndiaAI: Large diversity inherent to India, examining how LLMs can effectively
Language Model address challenges associated with multilingualism. Furthermore,
the session will cover the ethical considerations and biases linked
to these models, promoting a discussion on responsible AI
practices, including fairness, inclusiveness, misinformation
mitigation, and intellectual property rights within diverse cultural
contexts. Lastly, the session will focus on the collaborative
opportunities among indigenous communities, aca

In [5]:
# Define a list of questions and their expected answers (true labels)
questions_and_answers = [
    ("What is the name of the summit?", "Global IndiaAI Summit 2024"),
    ("When and where is the Global IndiaAI Summit 2024 taking place?", "3-4 July 2024, New Delhi"),
    ("What is the time and venue for the opening ceremony on Day 1?", "10:00 - 11:00 at Auditorium 1"),
    ("What is the GPAI Session about at 11:00 on Day 1?", "Executive Council Meeting (By Invitation only)"),
]

In [None]:
!pip install rouge-score

In [7]:
from rouge_score import rouge_scorer

In [16]:
# Function to ask questions and get predictions from the model
def ask_questions(text, questions_and_answers):
    true_labels = []
    predicted_labels = []

    for question, true_answer in questions_and_answers:
        true_labels.append(true_answer)
        response = llm.complete(f" Give  answers based on {text}, for the question: {question}")
        predicted_labels.append(response.text.strip())  # Accessing the text attribute directly

    return true_labels, predicted_labels

# Example usage
true_labels, predicted_labels = ask_questions(extracted_text, questions_and_answers)
print("True Labels:")
print(true_labels)
print("Predicted Labels:")
print(predicted_labels)


True Labels:
['Global IndiaAI Summit 2024', '3-4 July 2024, New Delhi', '10:00 - 11:00 at Auditorium 1', 'Executive Council Meeting (By Invitation only)']
Predicted Labels:
['Global IndiaAI', 'Here are two-word answers based on the agenda:\n\n* New Delhi\n* July 2024', 'Here are the two-word answers:\n\n10:00 Auditorium', 'Executive Council']


In [17]:
# Function to evaluate model using token-level comparisons
def evaluate_model(true_labels, predicted_labels):
    correct_count = 0
    total_count = len(true_labels)

    for true_label, predicted_label in zip(true_labels, predicted_labels):
        # Tokenize and convert to sets
        true_tokens = set(true_label.lower().split())
        predicted_tokens = set(predicted_label.lower().split())

        # Calculate intersection
        intersection = true_tokens.intersection(predicted_tokens)

        # If there is any intersection, count it as correct (lenient matching)
        if intersection:
            correct_count += 1

    precision = correct_count / total_count
    recall = correct_count / total_count
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

precision, recall, f1 = evaluate_model(true_labels, predicted_labels)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Precision: 1.0
Recall: 1.0
F1 Score: 1.0


In [18]:
# Print each question, true answer, and predicted answer
print("Comparison of True and Predicted Answers:")
for i, (true_label, predicted_label) in enumerate(zip(true_labels, predicted_labels)):
    print(f"Question {i + 1}:")
    print(f"True Answer: {true_label}")
    print(f"Predicted Answer: {predicted_label}")
    if true_label != predicted_label:
        print("--> Score impacted: Check precision and recall for this question.")
    print()


Comparison of True and Predicted Answers:
Question 1:
True Answer: Global IndiaAI Summit 2024
Predicted Answer: Global IndiaAI
--> Score impacted: Check precision and recall for this question.

Question 2:
True Answer: 3-4 July 2024, New Delhi
Predicted Answer: Here are two-word answers based on the agenda:

* New Delhi
* July 2024
--> Score impacted: Check precision and recall for this question.

Question 3:
True Answer: 10:00 - 11:00 at Auditorium 1
Predicted Answer: Here are the two-word answers:

10:00 Auditorium
--> Score impacted: Check precision and recall for this question.

Question 4:
True Answer: Executive Council Meeting (By Invitation only)
Predicted Answer: Executive Council
--> Score impacted: Check precision and recall for this question.



In [19]:
# Function to calculate precision, recall, and F1 score for each answer
def calculate_scores(true_label, predicted_label):
    true_tokens = set(true_label.lower().split())
    predicted_tokens = set(predicted_label.lower().split())

    intersection = true_tokens.intersection(predicted_tokens)
    precision = len(intersection) / len(predicted_tokens) if predicted_tokens else 0
    recall = len(intersection) / len(true_tokens) if true_tokens else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# Function to evaluate model using token-level comparisons and print details
def evaluate_and_print(true_labels, predicted_labels):
    total_precision, total_recall, total_f1 = 0, 0, 0
    total_count = len(true_labels)

    print("Comparison of True and Predicted Answers with Scores:")
    for i, (true_label, predicted_label) in enumerate(zip(true_labels, predicted_labels)):
        precision, recall, f1 = calculate_scores(true_label, predicted_label)
        total_precision += precision
        total_recall += recall
        total_f1 += f1

        print(f"Question {i + 1}:")
        print(f"True Answer: {true_label}")
        print(f"Predicted Answer: {predicted_label}")
        print(f"Precision: {precision:.2f}")
        print(f"Recall: {recall:.2f}")
        print(f"F1 Score: {f1:.2f}")
        print()

    avg_precision = total_precision / total_count
    avg_recall = total_recall / total_count
    avg_f1 = total_f1 / total_count

    print(f"Average Precision: {avg_precision:.2f}")
    print(f"Average Recall: {avg_recall:.2f}")
    print(f"Average F1 Score: {avg_f1:.2f}")

# Example usage
evaluate_and_print(true_labels, predicted_labels)


Comparison of True and Predicted Answers with Scores:
Question 1:
True Answer: Global IndiaAI Summit 2024
Predicted Answer: Global IndiaAI
Precision: 1.00
Recall: 0.50
F1 Score: 0.67

Question 2:
True Answer: 3-4 July 2024, New Delhi
Predicted Answer: Here are two-word answers based on the agenda:

* New Delhi
* July 2024
Precision: 0.23
Recall: 0.60
F1 Score: 0.33

Question 3:
True Answer: 10:00 - 11:00 at Auditorium 1
Predicted Answer: Here are the two-word answers:

10:00 Auditorium
Precision: 0.29
Recall: 0.33
F1 Score: 0.31

Question 4:
True Answer: Executive Council Meeting (By Invitation only)
Predicted Answer: Executive Council
Precision: 1.00
Recall: 0.33
F1 Score: 0.50

Average Precision: 0.63
Average Recall: 0.44
Average F1 Score: 0.45


In [14]:
# Function to calculate ROUGE scores
def calculate_rouge(true_label, predicted_label):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(true_label, predicted_label)
    return scores

# Function to evaluate model using ROUGE scores
def evaluate_with_rouge(true_labels, predicted_labels):
    total_rouge1, total_rouge2, total_rougeL = 0, 0, 0
    count = len(true_labels)

    for true_label, predicted_label in zip(true_labels, predicted_labels):
        scores = calculate_rouge(true_label, predicted_label)
        total_rouge1 += scores['rouge1'].fmeasure
        total_rouge2 += scores['rouge2'].fmeasure
        total_rougeL += scores['rougeL'].fmeasure

    avg_rouge1 = total_rouge1 / count
    avg_rouge2 = total_rouge2 / count
    avg_rougeL = total_rougeL / count

    return avg_rouge1, avg_rouge2, avg_rougeL

# Example usage
avg_rouge1, avg_rouge2, avg_rougeL = evaluate_with_rouge(true_labels, predicted_labels)
print(f"ROUGE-1: {avg_rouge1:.2f}")
print(f"ROUGE-2: {avg_rouge2:.2f}")
print(f"ROUGE-L: {avg_rougeL:.2f}")


ROUGE-1: 0.52
ROUGE-2: 0.38
ROUGE-L: 0.50
