# Development

In [1]:
%load_ext autoreload

In [2]:
%autoreload
import pathlib
import polars as pl
from skolegpt_instruct_dataset.data import get_data
from skolegpt_instruct_dataset.preprocess import preprocess_data
from skolegpt_instruct_dataset.utils import sample_and_print_example, analyse_pre_and_postfixes
from skolegpt_instruct_dataset.config import config

  from .autonotebook import tqdm as notebook_tqdm


# Parameters

In [3]:
# Switch: when True, a local copy of the data is preferred over a fresh fetch.
use_cache = True
# Parquet file holding the cached sample (path is relative to the working directory).
cache_file = pathlib.Path("orca_sample.parquet")

# Get Data

In [4]:
%autoreload

# Load the sample from the local parquet cache when caching is enabled and the
# cache file exists; otherwise fetch it through get_data (capped at config.n_max).
# NOTE(review): nothing in this cell writes cache_file, so the cache is
# presumably produced elsewhere - confirm.
# Logical `and` (not bitwise `&`) so the filesystem check is skipped entirely
# when caching is switched off, and truthy/falsy flags behave as expected.
if use_cache and cache_file.is_file():
    df = pl.read_parquet(cache_file)
else:
    df = get_data(n_max=config.n_max)
df.head()

id,system_prompt,question,response,source
str,str,str,str,str
"""cot.47001""","""You are an AI …","""Question: Whic…","""Sure, I'd be h…","""cot"""
"""t0.1981815""","""You are an AI …","""Read the bio b…","""- Instrument: …","""t0"""
"""flan.1484600""","""You are an AI …","""Determine if t…","""The sentence ""…","""flan"""
"""t0.750125""","""You are an AI …","""Question: cats…","""The correct an…","""t0"""
"""t0.1336221""","""You are an AI …","""Answer the fol…","""This product r…","""t0"""


# Preprocess Data

In [7]:
%autoreload
# Gather every preprocessing setting from the shared config object in one place.
preprocess_kwargs = dict(
    n_total=config.n_total,
    instruction_sources=config.instruction_sources,
    common_postfixes=config.common_postfixes,
    common_prefixes=config.common_prefixes,
    seed=config.seed,
)

# NOTE: `df` is rebound to the preprocessed frame, so re-running this cell
# without re-running the data-loading cell preprocesses already-processed data.
df = preprocess_data(df=df, **preprocess_kwargs)

# Report the resulting row count, then preview the first rows.
print(len(df))
df.head()

90000


id,system_prompt,question,response,source
str,str,str,str,str
"""flan.81333""","""You are an AI …","""Gordon McKay i…","""Yes, these two…","""flan"""
"""flan.2293314""","""You are an AI …","""Select your an…","""This is about …","""flan"""
"""flan.1012603""","""You are an AI …","""Write a senten…","""The bear is sl…","""flan"""
"""flan.182682""","""You are an AI …","""How does the s…","""the two friend…","""flan"""
"""flan.1974845""","""You are an AI …","""Article: This …","""The article de…","""flan"""


# Experimental

In [166]:
# Manual spot check: prints one example from df (ID, system prompt, question
# and response) so the preprocessed text can be inspected by eye.
sample_and_print_example(df)

ID
cot.38806

SYSTEM PROMPT:

You are an AI assistant that helps people find information. Provide a
detailed answer so user don’t need to search outside to understand the
answer.

QUESTION:

Can we conclude from "Children wear soccer uniforms." that "Children
are on a school bus."? Options: - yes - no - it is not possible to
tell Stream of thoughts:

RESPONSE:

No, we cannot conclude that "Children are on a school bus" from the
statement "Children wear soccer uniforms." The statement only tells us
that children wear soccer uniforms, but it does not provide any
information about their location or activity. Therefore, it is not
possible to tell whether they are on a school bus or not.


In [110]:
%autoreload
# Print, for each listed prefix and postfix term, its raw and normalized
# frequency in df - used to see which boilerplate terms are still present
# after preprocessing.
analyse_pre_and_postfixes(df)

--- Prefixes ---
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Question:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Definition:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Detailed Instructions:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Instructions:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Q:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Teacher:' 
Normalized Freq. 1.00222% | Freq.: 902 | Term: 'Student:' 
Normalized Freq. 0.31556% | Freq.: 284 | Term: 'Write a sentence not in English.' 
Normalized Freq. 1.12444% | Freq.: 1012 | Term: 'Denny asked:' 

--- Postfixes ---
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Answer:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Solution:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'A:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Output:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Teacher:' 
Normalized Freq. 0.0% | Freq.: 0 | Term: 'Student:' 
Normalized Freq. 1.07778% | Freq.: 970 | Term: 'Stream of thoughts:' 
Normalized Freq. 1.72333% | Freq.: 1551 | 

In [140]:
# TODO next time:
# - Remove multiple-choice questions
# - Remove texts containing non-English characters
# - Remove more unhelpful pre/postfixes