In [1]:
import pandas as pd
import httpx
from dotenv import load_dotenv
import os
from functools import cache
from tqdm.auto import tqdm


# python-dotenv: load variables from a local .env file (if one is found) into the
# process environment. STRAPI_KEY is read with os.getenv further down.
load_dotenv()
# Destination prefix for every exported .jsonl file. File names are appended with
# plain string concatenation (out_dir + "name.jsonl"), so keep the trailing "/".
out_dir = "~/active-projects/2024-07-itell-ai-testing/testing-data/"

# Summaries

In [56]:
# Input CSV holding the summaries table.
summary_data_path = "~/active-projects/2024.03-itell-cornell/data/consented_supabase_tables_anonymized/summaries_rows.csv"
summ = pd.read_csv(summary_data_path)

# Keep only the page identifier and the summary text; "text" is exported as "summary".
summ_out = summ.loc[:, ["page_slug", "text"]].rename({"text": "summary"}, axis="columns")

# Preview five random rows, then write one JSON record per line.
display(summ_out.sample(5))
summ_out.to_json(
    out_dir + "summary.jsonl",
    orient="records",
    lines=True,
)

Unnamed: 0,page_slug,summary
71,learning-analytics-for-self-regulated-learning,"In this paper, Phillip Winne discusses SRL, wh..."
190,learning-analytics-for-self-regulated-learning,The model of SRL introduced in the article gui...
111,a-practitioner-s-guide-to-measurement-in-learn...,Learning analytics can be used to improve lear...
117,emotional,incorporating emotion into learning analytics ...
116,learning-analytics-for-self-regulated-learning,Tracing the work of learners is something that...


# Constructed Response Items

In [2]:
# Input CSV holding the constructed-response table.
cri_data_path = "~/active-projects/2024.03-itell-cornell/data/consented_supabase_tables_anonymized/constructed_responses_rows.csv"

cri = pd.read_csv(cri_data_path)

# Keep the slugs, the response text and its score; "response" is exported as "answer".
cri_out = cri.loc[:, ["page_slug", "chunk_slug", "response", "score"]].rename(
    {"response": "answer"}, axis="columns"
)
display(cri_out.sample(5))
# Nothing is written here: cri.jsonl is exported further down, once the chunk
# text, question and reference answer have been joined onto these rows.

Unnamed: 0,page_slug,chunk_slug,answer,score
2412,learning-analytics-for-self-regulated-learning,Introduction-403t,"First, the learner identifies resources and co...",2
1989,emotional,14-Teacher-Analytics-486t,Researchers are developing methods for automat...,2
446,learning-an-1,How-to-Get-Started-Chunk-6-536t,it is important to be explicit so that we know...,1
734,learning-analytics-for-self-regulated-learning,Learning-Analytics-For-SRL-465t,"Calculation, delivery factors, and recommendat...",2
48,learning-analytics-for-self-regulated-learning,Materials-Studied-526t,texts can be described by various analytics in...,2


### Get Chunks

In [None]:
# Bearer-token header sent with every request made by call_api.
# NOTE: os.getenv returns None when STRAPI_KEY is not set, in which case this
# header silently becomes "Bearer None" rather than raising.
headers = {"Authorization": f"Bearer {os.getenv('STRAPI_KEY')}"}

# Base URL of the API; call_api appends "/pages" to it.
url = "https://itell-strapi-um5h.onrender.com/api"

def build_query(page_slug, chunk_slug) -> dict:
    """Build the query-string parameters that select one chunk of one page.

    Args:
        page_slug: Value matched against the page ``slug`` filter (``$eq``).
        chunk_slug: Value matched against the ``Slug`` of the populated
            ``Content`` entries (``$eqi``).

    Returns:
        A dict with exactly two entries, suitable for passing as ``params``.
    """
    params = {}
    params["filters[slug][$eq]"] = page_slug
    params["populate[Content][filters][Slug][$eqi]"] = chunk_slug
    return params

@cache
def call_api(page_slug, chunk_slug):
    """Fetch one chunk's text, question and reference answer from the API.

    Issues ``GET {url}/pages`` with the parameters from ``build_query`` and
    reads the first ``Content`` entry of the first page in the response.

    Args:
        page_slug: Slug of the page to look up.
        chunk_slug: Slug of the chunk within that page.

    Returns:
        ``(CleanText, Question, ConstructedResponse)`` for the matched chunk, or
        ``(None, None, None)`` when the request fails, the server answers with
        an error status, the body is not JSON, or the payload does not contain
        the expected page/chunk structure.

    Note:
        Results are memoized by ``functools.cache`` -- including the
        ``(None, None, None)`` fallback. Call ``call_api.cache_clear()`` before
        retrying after a transient failure.
    """
    try:
        with httpx.Client() as client:
            querystring = build_query(page_slug, chunk_slug)
            r = client.get(url + "/pages", params=querystring, headers=headers)
            # Surface 4xx/5xx as httpx.HTTPStatusError (an httpx.HTTPError)
            # instead of trying to parse an error payload as page data.
            r.raise_for_status()
            chunk = r.json()["data"][0]["attributes"]["Content"][0]
            return (chunk["CleanText"], chunk["Question"], chunk["ConstructedResponse"])
    except (httpx.HTTPError, ValueError, LookupError, TypeError):
        # ValueError: body is not valid JSON.
        # LookupError: no matching page/chunk (empty list) or a missing key.
        # TypeError: a field that should be a list/dict came back as null.
        return (None, None, None)

def collect_chunks():
    """Look up chunk text, question and reference answer for every row of ``cri_out``.

    Calls ``call_api`` once per (page_slug, chunk_slug) row, showing a progress
    bar, and assembles the results into a frame aligned with ``cri_out``.

    Returns:
        DataFrame with columns ``Chunk``, ``Question`` and ``Answer``, indexed by
        the originating ``cri_out`` row label and sorted by that index.
    """
    slug_rows = list(cri_out[["page_slug", "chunk_slug"]].itertuples())
    # Each record is (row label, clean text, question, reference answer).
    records = [
        (row_label, *call_api(page, chunk))
        for row_label, page, chunk in tqdm(slug_rows)
    ]
    frame = pd.DataFrame(records, columns=[0, "Chunk", "Question", "Answer"])
    # Column 0 carries the cri_out row label; make it the index so the result
    # can be concatenated column-wise with cri_out.
    return frame.set_index(0).sort_index()

# One API lookup per cri_out row (repeated slug pairs are served from call_api's cache).
chunk_df = collect_chunks()

# Column-wise join on the shared row index. Note the two similarly named columns:
# lowercase "answer" is the response from the CSV, capitalised "Answer" is the
# chunk's ConstructedResponse returned by the API.
cri_chunk_out = pd.concat([cri_out, chunk_df], axis=1)
cri_chunk_out

In [5]:
# Export the responses together with their chunk context, one JSON record per line.
cri_chunk_out.to_json(out_dir + "cri.jsonl", orient="records", lines=True)

# Chat

In [57]:
# Input CSV holding the chat-messages table.
chat_data_path = "~/active-projects/2024.03-itell-cornell/data/consented_supabase_tables_anonymized/chat_messages_rows.csv"

# Downstream code reads the `page_slug` and `data` columns of each row;
# `data` is parsed as a JSON array of messages.
chat = pd.read_csv(chat_data_path)

In [46]:
import json
from copy import deepcopy

def accumulate(row):
    """Yield one chat query per user message in a conversation row.

    ``row.data`` is parsed as a JSON array of messages, each carrying ``isUser``
    and ``text``. For every message whose ``isUser`` is ``True`` a dict is
    yielded with:

    - ``page_slug``: taken from ``row.page_slug``
    - ``history``: all earlier messages as ``{"agent": "user" | "bot", "text": ...}``
    - ``message``: the text of the current user message

    Each yielded dict is a deep copy, so later turns never alter earlier results.
    """
    transcript = json.loads(row.data)
    slug = row.page_slug
    prior_turns = []

    for turn in transcript:
        from_user = turn["isUser"]
        if from_user is True:
            snapshot = {
                "page_slug": slug,
                "history": prior_turns,
                "message": turn["text"],
            }
            yield deepcopy(snapshot)

        # Every message, user or bot, becomes context for the next query.
        speaker = "user" if from_user else "bot"
        prior_turns.append({"agent": speaker, "text": turn["text"]})

# Flatten every conversation into a single list of per-message queries.
chat_queries = []

for row in chat.itertuples():
    chat_queries.extend(accumulate(row))

In [53]:
# One row per query dict; columns come from the dict keys (page_slug, history, message).
chat_out = pd.DataFrame(chat_queries)
display(chat_out.sample(5))
# Export one JSON record per line; `history` is written as a nested array.
chat_out.to_json(out_dir + "chat.jsonl", orient="records", lines=True)

Unnamed: 0,page_slug,history,message
56,learning-analytics-for-self-regulated-learning,"[{'agent': 'user', 'text': 'what is the purpos...",what is copes?
66,learning-analytics-for-self-regulated-learning,[],what can you do?
90,learning-analytics-for-self-regulated-learning,[],what is copes
36,learning-analytics-for-self-regulated-learning,"[{'agent': 'user', 'text': 'I finished reading...",they haven't gotten back to me
104,emotional,[],What is this chapter about
