In [1]:
import os
import json
import pandas as pd

In [5]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# later os.getenv lookup can see them; the call's return value is the cell output.
load_dotenv()  # take environment variables from .env.

True

In [9]:
from langchain.chat_models import ChatOpenAI

In [6]:
# OpenAI API key taken from the environment; KEY is None when the variable is unset.
KEY = os.environ.get("OPENAI_API_KEY")

In [10]:
# Chat model instance handed to the LLMChain objects created later in the notebook.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0.5,
    openai_api_key=KEY,
)

In [12]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains import SequentialChain
from langchain.callbacks import get_openai_callback

In [13]:
# Format guide that is serialized into the generation prompt: three numbered
# placeholder entries ("1".."3"), each holding a question, four lettered
# options ("a".."d") and the correct answer.
RESPONSE_JSON = {
    str(number): {
        "MCQ": "multiple choice question",
        "Options": {letter: "choice here" for letter in "abcd"},
        "Correct": "correct answer",
    }
    for number in range(1, 4)
}

In [21]:
# Prompt text for the quiz-generation step. Placeholders filled at run time:
# {text}, {num_questions}, {subject}, {tone} and {response_json} (the format
# guide the model is asked to follow when structuring its answer).
QUIZ_GENERATION_TEMPLATE = """
    Text: {text}
    You are an expert MCQ generator. Given the above text, it is your job to create a quiz of {num_questions} multiple choice questions for {subject} students in {tone} tone. 
    Make sure of the following: (1) The questions are not repeated, and check all the questions to be conforming the text as well, (2) Use RESPONSE_JSON defined below as a guide to format your response, and (3) create {num_questions} MCQs.
    
    ### RESPONSE_JSON
    {response_json}
"""

In [22]:
# Wrap the generation template and declare the five variables it interpolates.
quiz_generation_prompt = PromptTemplate(
    template=QUIZ_GENERATION_TEMPLATE,
    input_variables=[
        "text",
        "num_questions",
        "subject",
        "tone",
        "response_json",
    ],
)

In [23]:
# Generation step: formats the generation prompt, sends it to the model and
# publishes the reply under the "quiz" output key.
quiz_chain = LLMChain(
    prompt=quiz_generation_prompt,
    llm=llm,
    output_key="quiz",
    verbose=True,
)

In [25]:
# Prompt text for the review step. Placeholders filled at run time: {subject}
# and {quiz} (the quiz text to be evaluated).
QUIZ_EVALUATION_TEMPLATE = """
    You are an expert English grammarian and writer. Given a Multiple Choice Quiz for {subject} students,
    You need to evaluate the complexity of the question and give a complete analysis of the quiz. Use only a max of 50 words for complexity analysis. 
    If the quiz is not at par with the cognitive and analytical abilities of the students, update the quiz questions that need to be changed, and change the tone such that it perfectly fits the student's abilities
    
    Here is the quiz you need to evaluate:
    
    Quiz_MCQs:
    {quiz}
"""

In [26]:
# Wrap the evaluation template and declare the two variables it interpolates.
quiz_evaluation_prompt = PromptTemplate(
    template=QUIZ_EVALUATION_TEMPLATE,
    input_variables=["subject", "quiz"],
)

In [27]:
# Review step: formats the evaluation prompt, sends it to the model and
# publishes the reply under the "review" output key.
review_chain = LLMChain(
    prompt=quiz_evaluation_prompt,
    llm=llm,
    output_key="review",
    verbose=True,
)

In [28]:
# Pipeline that runs the generation chain followed by the review chain and
# exposes both the "quiz" and the "review" outputs to the caller.
generate_evaluate_chain = SequentialChain(
    chains=[
        quiz_chain,
        review_chain,
    ],
    input_variables=[
        "text",
        "num_questions",
        "subject",
        "tone",
        "response_json",
    ],
    output_variables=[
        "quiz",
        "review",
    ],
    verbose=True,
)

In [29]:
# Relative path of the source text file that is read below and used as quiz material.
data_science = "../dataset/input/data_science.txt"

In [35]:
# Read the whole source document into TEXT. The encoding is pinned so decoding
# does not depend on the platform's default locale encoding.
# NOTE(review): assumes the input file is UTF-8 encoded — confirm for other inputs.
with open(data_science, "r", encoding="utf-8") as file:
    TEXT = file.read()

In [36]:
# Print the full source text that was just read, as a visual check of the input.
print(TEXT)

What is data science?
Data science is the study of data to extract meaningful insights for business. It is a multidisciplinary approach that combines principles and practices from the fields of mathematics, statistics, artificial intelligence, and computer engineering to analyze large amounts of data. This analysis helps data scientists to ask and answer questions like what happened, why it happened, what will happen, and what can be done with the results.

Why is data science important?
Data science is important because it combines tools, methods, and technology to generate meaning from data. Modern organizations are inundated with data; there is a proliferation of devices that can automatically collect and store information. Online systems and payment portals capture more data in the fields of e-commerce, medicine, finance, and every other aspect of human life. We have text, audio, video, and image data available in vast quantities.  

History of data science
While the term data scie

In [37]:
# Run parameters passed to the generation/review pipeline.
NUM_QUESTIONS: int = 5  # number of MCQs requested from the model
SUBJECT: str = "Data Science"  # audience named in both prompts
TONE: str = "Simple"  # tone requested in the generation prompt

In [39]:
# Run the generate-then-review pipeline inside the OpenAI callback context so
# that token usage and cost for the calls are collected on `cb`.
with get_openai_callback() as cb:
    response = generate_evaluate_chain(
        dict(
            text=TEXT,
            num_questions=NUM_QUESTIONS,
            subject=SUBJECT,
            tone=TONE,
            # The format guide is passed as a JSON string, not as a dict.
            response_json=json.dumps(RESPONSE_JSON),
        )
    )



[1m> Entering new SequentialChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
    Text: What is data science?
Data science is the study of data to extract meaningful insights for business. It is a multidisciplinary approach that combines principles and practices from the fields of mathematics, statistics, artificial intelligence, and computer engineering to analyze large amounts of data. This analysis helps data scientists to ask and answer questions like what happened, why it happened, what will happen, and what can be done with the results.

Why is data science important?
Data science is important because it combines tools, methods, and technology to generate meaning from data. Modern organizations are inundated with data; there is a proliferation of devices that can automatically collect and store information. Online systems and payment portals capture more data in the fields of e-commerce, medicine, finance, and every other aspe

In [42]:
# Report the usage figures collected by the callback, one per line.
print(
    f"Total Tokens: {cb.total_tokens}",
    f"Prompt Tokens: {cb.prompt_tokens}",
    f"Completion Tokens: {cb.completion_tokens}",
    f"Total Cost: ${cb.total_cost}",
    sep="\n",
)

Total Tokens: 2872
Prompt Tokens: 1986
Completion Tokens: 886
Total Cost: $0.004751


In [51]:
# Take the generation step's raw reply from the chain output and decode it
# from JSON text into a Python object.
quiz = json.loads(response.get("quiz"))

In [54]:
# Flatten the parsed quiz into one record per question with the columns
# "MCQ", "Choices" and "Correct".
formatted_quiz = []
for key, value in quiz.items():
    # The format guide (RESPONSE_JSON) spells the fields "MCQ" / "Options" /
    # "Correct", whereas this loop used to read lower-case keys only. Look the
    # fields up case-insensitively so either spelling in the model's reply works
    # instead of raising KeyError.
    fields = {str(name).lower(): content for name, content in value.items()}
    mcq = fields["mcq"]
    # Render the options as "a: ... | b: ... | c: ... | d: ..." in reply order.
    options = " | ".join(
        f"{option}: {option_value}"
        for option, option_value in fields["options"].items()
    )
    correct = fields["correct"]
    formatted_quiz.append({"MCQ": mcq, "Choices": options, "Correct": correct})

In [55]:
# Turn the list of per-question records into a table (one row per question).
quiz = pd.DataFrame(data=formatted_quiz)

In [56]:
# Show the formatted quiz table as the cell's output.
quiz

Unnamed: 0,MCQ,Choices,Correct
0,What is data science?,a: The study of animals in their natural habit...,b
1,Which type of data analysis examines data to g...,a: Diagnostic analysis | b: Predictive analysi...,c
2,What does prescriptive analytics do?,a: Predicts what is likely to happen | b: Sugg...,b
3,What is one benefit of data science for busine...,a: Increasing customer complaints | b: Reducin...,c
4,How can data science help companies respond to...,a: By predicting change and reacting optimally...,a


In [57]:
# Persist the quiz table as CSV, leaving out the DataFrame's row index column.
quiz.to_csv(
    path_or_buf="../dataset/output/data_science.csv",
    index=False,
)