# Getting Started

**Useful Documentation:**
* [LangChain](https://python.langchain.com/docs/)
* [Beautiful Soup](https://beautiful-soup-4.readthedocs.io/en/latest/#quick-start)
* [CCSB GitHub Repo](https://github.com/lgtanimoto/CulturalCSBytes/tree/main)

**Installations:**
```bash
$ pip install beautifulsoup4
$ pip install langchain
$ pip install python-dotenv
$ pip install requests
$ pip install tqdm
```

**Imports:**

In [403]:
# built-in python modules
import json
import os
import re
from io import StringIO
from html.parser import HTMLParser
import random
from typing import Dict, List
from tqdm import tqdm

# langchain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage, AIMessage
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.prompts.example_selector.base import BaseExampleSelector

# python libraries
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import requests
import warnings

**Set-up OpenAI API Key**:
* Generate a secret key in the [OpenAI API key manager](https://platform.openai.com/account/api-keys).
* Create a `.env` file that defines the environment variable
```openai_api_key=insert_your_key_here```
* Load the API key by running the following code block.

In [404]:
# Load the variables defined in the local .env file (see the set-up steps above).
load_dotenv()
openai_api_key = os.getenv("openai_api_key")  # None if the variable is not set
# Any warning raised while this client is constructed is suppressed.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    llm = OpenAI(model_name="gpt-4", openai_api_key=openai_api_key)
# Chat client; used further down to extract a topic for each default question.
chat_turbo = ChatOpenAI(temperature=0, openai_api_key=openai_api_key, model_name="gpt-3.5-turbo")

In [323]:
# Directory prefix (including the trailing separator) that is prepended to the
# names of the notebook's data files. Built with os.path.join so the separator
# matches the host OS instead of being hard-coded to the Windows backslash.
FILE_PREF = os.path.join("object_data", "")

def write_json(data, filename):
    """Write ``data`` to ``filename`` as JSON indented by four spaces.

    Args:
        data: A JSON-serializable object.
        filename: Path of the file to create or overwrite. It is used exactly
            as given; no folder prefix is added.

    Raises:
        TypeError: If ``data`` is not JSON-serializable. The target file is
            left untouched in that case.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so a
    # serialization failure would otherwise destroy the previous content.
    payload = json.dumps(data, indent=4)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(payload)

def read_json(filename):
    """Parse and return the JSON stored in ``filename`` under the data folder.

    The path opened is the module-level ``FILE_PREF`` followed by ``filename``.
    """
    path = FILE_PREF + filename
    with open(path, 'r') as handle:
        return json.load(handle)

**Load Example Questions from the [GitHub Repo](https://github.com/lgtanimoto/CulturalCSBytes/tree/main/content/A000/A000/)**

Note: we scrape the 50 default-culture questions to use as examples when generating new questions.

In [295]:
# scrape default questions
URL = "https://github.com/lgtanimoto/CulturalCSBytes/tree/main/content/A000/A000"
page = requests.get(URL)
soup = BeautifulSoup(page.content, "html.parser")

# get all links to CCSB question json files
# href=True limits the search to anchors that actually carry an href
# attribute, so anchors without one cannot raise KeyError on link['href'].
links = [
    "https://raw.githubusercontent.com" + link['href'].replace("blob/", "")
    for link in soup.find_all("a", href=True)
    if ".json" in link['href']
]
# list of JSON objects for default questions (one HTTP request per link)
default_questions = [json.loads(requests.get(link).text) for link in links]

In [298]:
# read default questions from file
# (assigns the same name as the scraping cell, so it replaces any scraped value)
default_questions = read_json("default_questions.json")

In [None]:
# Strip HTML from string
class MLStripper(HTMLParser):
    """HTML parser that drops all markup and collects only the text content."""

    def __init__(self):
        # HTMLParser.__init__ performs the reset itself. Character references
        # are converted, so "&amp;" reaches handle_data as "&".
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append a chunk of text found between tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html):
    """Return ``html`` with every tag removed and character references decoded.

    Args:
        html: Markup (or plain text) to clean.

    Returns:
        The text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # close() flushes text the parser is still holding back. Without it,
    # trailing text containing a bare "&" (e.g. "AT&T") is silently dropped,
    # because the parser waits for the rest of a possible character reference.
    s.close()
    return s.get_data()

# remove unwanted fields from dictionary
def remove_fields(question, fields=('FixedOrderFromBottom', 'AnswerImage', 'QuestionImage')):
    """Delete unwanted keys from a question dictionary, in place.

    Args:
        question: Dictionary to modify.
        fields: Keys to delete. Keys that are absent are ignored. The default
            is an immutable tuple so it cannot be altered between calls.

    Returns:
        None; ``question`` itself is modified.
    """
    for field in fields:
        question.pop(field, None)

Create learning objective list and groupings.

In [None]:
# read list of learning standards (objectives) with the full 6-character code from file
# (iterated with .items() in the conversion loop, whose keys are split on "-")
raw_objectives = read_json("raw_objectives.json")

In [302]:
# learning objective group names, loaded from file
group_names = read_json("lo_group_names.json")

# learning objective codes bucketed by one-letter group key;
# each key starts with its own empty list
lo_code_groups = {letter: [] for letter in ('C', 'N', 'D', 'A', 'I')}

In [304]:
# learning standards (objectives) keyed by abbreviated 4-character code
lo_list = {}

# Schema conversion from a full code to an abbreviated code:
#   first character      : level, mapped 1A -> 1, 1B -> 2, 2 -> 3, 3A -> 4, 3B -> 5
#   second character     : first character after the first dash
#   last two characters  : the number at the end of the full code
level_conversions = dict(zip(("1A", "1B", "2", "3A", "3B"), "12345"))

# Abbreviate every raw objective code, record the objective under the new
# code, and file the new code under its group letter in lo_code_groups.
for full_code, objective in raw_objectives.items():
    pieces = full_code.split("-")
    level_part = pieces[0]
    group_letter = pieces[1][0]
    number_part = pieces[2]

    short_code = f"{level_conversions[level_part]}{group_letter}{number_part}"
    lo_list[short_code] = objective
    lo_code_groups[group_letter].append(short_code)


In [None]:
# read in learning objective list from file
# (assigns lo_list again, replacing the mapping built by the conversion loop)
lo_list = read_json("learning_objectives.json")

# read in learning objective groups dictionary from file
# (likewise replaces the lo_code_groups filled in by the conversion loop)
lo_code_groups = read_json("lo_code_groups.json")

Create examples list.

In [311]:
# read few-shot examples for topic extraction from file
# (kept as one raw string; it is passed as the SystemMessage content when a
# topic is extracted for each default question)
with open(FILE_PREF + "topic_extraction_examples.txt", 'r') as topextr_file:
    topic_extraction_examples = topextr_file.read()  # few-shot example string for topic extraction from default questions

In [None]:
### Create Examples List

examples = []

for dq in default_questions:
    q = dq['QuestionJSON']  # cleaned in place below

    # normalise the answer to a string, then drop the unwanted fields
    q['CorrectAnswer'] = str(q['CorrectAnswer'])
    remove_fields(q)

    # strip HTML markup from every remaining field value
    for field, raw_value in q.items():
        q[field] = strip_tags(raw_value)

    mq_code = dq['MQCode']
    objective = lo_list[mq_code]
    question_str = json.dumps(q, indent=4)

    # ask the chat model for the question's topic, guided by the few-shot examples
    reply = chat_turbo([
        SystemMessage(content=topic_extraction_examples),
        HumanMessage(content=question_str)
    ])

    examples.append({
        'MQCode': mq_code,
        'learning_objective': objective,
        'topic': reply.content,
        'question_str': question_str
    })


In [None]:
### Full List of Examples. (result of previous cell)
# Assigns `examples` from the saved file, replacing whatever the previous cell built.
examples = read_json("default_question_examples.json")

Add coding examples.

In [None]:
with open(FILE_PREF + "codeexamples.txt", 'r') as file:
    content = file.read()

# The file is split on "** CODE **" headers, where CODE is 4 word characters.
# Because the pattern has a capture group, re.split returns
# [text before first header, code, snippet, code, snippet, ...];
# the leading text is dropped.
coding_content = re.split(r"\*\* (\w{4}) \*\*", content)[1:]

mqCodes = coding_content[0::2]
coding_snippets = coding_content[1::2]

# Look snippets up by MQCode rather than pairing them with examples by
# position: positional pairing attaches the wrong snippet as soon as the file
# order differs from the order of `examples`, or a code has no example.
# If a code appears more than once in the file, its last snippet is used.
snippet_by_code = dict(zip(mqCodes, coding_snippets))

idxs = []          # positions in `examples` of the questions that have a snippet
coding_q_exs = []
for i, ex in enumerate(examples):
    code_snippet = snippet_by_code.get(ex['MQCode'])
    if code_snippet is None:
        continue
    idxs.append(i)
    q_json = json.loads(ex['question_str'])
    # drop two characters at each end of the snippet
    # (presumably the line breaks around it; confirm against the file format)
    q_json['code_snippet'] = code_snippet[2:-2]
    # `ex` is the same object stored in `examples`, so that entry changes too
    ex['question_str'] = json.dumps(q_json, indent=4)
    coding_q_exs.append(ex)


In [284]:
### read in list of coding examples. (result of previous cell)
# Assigns `coding_q_exs` from the saved file, replacing the list built in the previous cell.
coding_q_exs = read_json("coding_question_examples.json")

Define interest area categories and subtopics.

In [289]:
# list of interest areas
interest_areas = read_json("interest_areas.json")

# dictionary of subtopics within each interest area
# (generate_qs iterates it with .items() and slices each value, so every
# value is expected to be a list of topics)
interests = read_json("interests.json")

In [None]:
# bucket the examples by learning objective group
ex_groups = {letter: [] for letter in "CNDAI"}

for example in examples:
    # the second character of the MQCode is used as the group key
    group_letter = example['MQCode'][1]
    ex_groups[group_letter].append(example)

Create Few-Shot Example Question Template

In [None]:
# Create custom example selector that chooses examples from the same 
# learning objective group as the target learning objective

class CustomExampleSelector(BaseExampleSelector):
    """Pick few-shot examples from the target objective's learning-objective group.

    The group of an example, and of a learning objective code, is the second
    character of its code.
    """

    def __init__(self, examples: List[Dict[str, str]]):
        """Store the pool of examples to select from.

        Args:
            examples: Example dictionaries; each needs an 'MQCode' key.
        """
        self.examples = examples

    def add_example(self, example: Dict[str, str]) -> None:
        """Add a new example to the pool used by ``select_examples``."""
        self.examples.append(example)

    def select_examples(self, input_variables: Dict[str, str]) -> List[dict]:
        """Return up to three random examples from the target objective's group.

        Args:
            input_variables: Prompt inputs; 'learning_objective' must hold the
                objective text, which is looked up in the global ``lo_list``.

        Returns:
            At most three examples drawn from ``self.examples``. The list is
            empty (never ``None``) when the objective is not in ``lo_list`` or
            no stored example belongs to its group.
        """
        target = input_variables['learning_objective']
        # first code whose objective text equals the requested one
        code = next((c for c, obj in lo_list.items() if obj == target), None)
        if code is None:
            return []
        group = code[1]
        # Filter this selector's own pool so examples added through
        # add_example are taken into account.
        similar_exs = [ex for ex in self.examples if ex['MQCode'][1] == group]
        return random.sample(similar_exs, min(len(similar_exs), 3))

In [None]:
# Instruct GPT-4 Few-Shot Example Template

# Layout of one example: learning objective, topic, then the question inside a json code fence.
# NOTE(review): "{{{question_str}}}" renders as "{" + question_str + "}" after
# one formatting pass. question_str already has its own outer braces
# (it comes from json.dumps of a dict), so the extra pair looks intended to
# survive a second formatting pass inside FewShotPromptTemplate. Confirm
# before "simplifying" it.
question_template = """Learning objective: {learning_objective}\nTopic: {topic}\nQuestion:\n```json\n{{{question_str}}}\n```"""

# Prompt applied to every example dictionary (keys match input_variables).
example_prompt = PromptTemplate(
    input_variables=["learning_objective", "topic", "question_str"],
    template=question_template,
)

def make_prompt(coding: bool = False) -> FewShotPromptTemplate:
    """Build the few-shot prompt template used to generate one question.

    Args:
        coding: When True, the examples are up to three random entries of
            ``coding_q_exs`` and the task asks for a code snippet. Otherwise a
            ``CustomExampleSelector`` over ``examples`` supplies them and the
            task asks for chart/graphic content to be specified.

    Returns:
        A ``FewShotPromptTemplate`` expecting ``learning_objective`` and
        ``topic``.
    """
    # Persona and task text shared by both variants; each adds one final sentence.
    shared_prefix = "System Message: You are a high school computer science teacher who wants to connect CS concepts to the diverse interests of your students so that they can better see themselves working in CS. Applications of CS to different fields of study often have similar solutions in code and teach the same underlying CS principles. And by seeing questions related to a wide variety of disciplines, students can better appreciate that CS is everywhere, for everyone, and that the essence of good computer science is to find the common patterns in problems from all aspects of life and to develop solutions for them.\n\nTask: Given a CS learning objective and a topic of interest to your students, generate a question in JSON format that applies the learning objective to a scenario related to the provided topic."

    # Settings common to both variants.
    common = dict(
        example_prompt=example_prompt,
        suffix="Learning objective: {learning_objective}\nTopic: {topic}\nQuestion:",
        input_variables=["learning_objective", "topic"],
    )

    if coding:
        return FewShotPromptTemplate(
            examples=random.sample(coding_q_exs, min(len(coding_q_exs), 3)),
            prefix=shared_prefix + " The question should involve a code snippet specified in the quorum programming language as shown in the example questions.",
            **common,
        )

    return FewShotPromptTemplate(
        example_selector=CustomExampleSelector(examples),
        prefix=shared_prefix + " Specify the content of any charts or graphics referenced in your question.",
        **common,
    )


Generate Questions.

In [None]:
def _extract_json_object(text: str) -> str:
    """Return the first substring of ``text`` that parses as a JSON object.

    If nothing in ``text`` parses as a JSON object, fall back to the plain
    heuristic of taking the text between the last "{" and the next "}".
    """
    decoder = json.JSONDecoder()
    start = text.find("{")
    while start != -1:
        try:
            obj, end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            pass
        else:
            if isinstance(obj, dict):
                return text[start:end]
        start = text.find("{", start + 1)
    return "{" + text.split("{")[-1].split("}")[0] + "}"


def generate_qs(folder_path: str, coding: bool = False) -> list[dict]:
    """Generate one question per interest area and save each one to a file.

    For every interest area, the first topic in its list is paired with a
    randomly chosen learning objective. The prompt is sent to ``llm`` and the
    JSON object found in the response is stored as the question.

    Args:
        folder_path: Prefix (including its trailing separator) placed directly
            in front of each output file name.
        coding: Passed to ``make_prompt``; True requests coding questions.

    Returns:
        A list of dictionaries with the keys 'learning_objective',
        'interest_area', 'topic', 'MQCode' and 'question_str'.
    """
    generated_qs = []
    for area, topic_list in interests.items():
        # only the first topic of each interest area is used
        for topic in topic_list[:1]:
            # randomly select learning standard
            lo_code, obj = random.choice(list(lo_list.items()))

            # create prompt
            prompt = make_prompt(coding).format(learning_objective=obj, topic=topic)
            print(prompt)

            # get gpt-4 response
            res = llm(prompt)
            # Parse the JSON object properly rather than cutting at the first
            # "}": a brace inside a string (e.g. in a code snippet) or a
            # nested object would otherwise truncate the question.
            q_str = _extract_json_object(res)
            print(q_str + "\n")
            q = {
                'learning_objective': obj,
                'interest_area': area,
                'topic': topic,
                'MQCode': lo_code,
                'question_str': q_str,
            }
            generated_qs.append(q)

            # output question json to file
            f_name = q['MQCode'] + "-" + q['topic'].replace(" ", "_")
            write_json(q, folder_path + f_name)

    return generated_qs


In [None]:
# generate questions
coding = True  # True -> make_prompt uses the coding examples and asks for a code snippet
# Prefix placed directly in front of every output file name by generate_qs.
# NOTE(review): the backslash separators are Windows-specific, and the folder
# presumably has to exist already. Confirm before running elsewhere.
folder_path = "coding_questions\\6-28-2023\\"
qs = generate_qs(folder_path, coding)

In [None]:
# check a code snippet
q_idx = 0  # position in qs of the generated question to inspect
# parse the stored question string as JSON and print its 'code_snippet' field
print(json.loads(qs[q_idx]['question_str'])['code_snippet'])