# Development

In [2]:
%load_ext autoreload

In [4]:
%autoreload
import pathlib
import polars as pl
from skolegpt_instruct_dataset.data import get_data
from skolegpt_instruct_dataset.preprocess import preprocess_data
from skolegpt_instruct_dataset.utils import sample_and_print_example, analyse_pre_and_postfixes
from skolegpt_instruct_dataset.config import config

# Parameters

In [5]:
# Master switch: when True, the cells below read the parquet caches (if the
# files exist) instead of recomputing.
use_cache = True

# Cache locations, relative to the notebook's working directory.
cache_file_preprocessed = pathlib.Path("orca_sample_preprocessed.parquet")  # preprocessed sample
cache_file = pathlib.Path("orca_sample.parquet")  # raw sample

# Get Data

In [48]:
%autoreload

# Load the raw sample: read the local parquet cache when caching is enabled and
# the file exists, otherwise call get_data(n_max=config.n_max).
# Logical `and` (instead of bitwise `&`) short-circuits, so the file system is
# only probed when use_cache is truthy, and non-bool truthy flags behave as
# expected.
if use_cache and cache_file.is_file():
    df_raw = pl.read_parquet(cache_file)
else:
    df_raw = get_data(n_max=config.n_max)

# Preview the first rows as the cell output.
df_raw.head()

id,system_prompt,question,response,source
str,str,str,str,str
"""cot.47001""","""You are an AI …","""Question: Whic…","""Sure, I'd be h…","""cot"""
"""t0.1981815""","""You are an AI …","""Read the bio b…","""- Instrument: …","""t0"""
"""flan.1484600""","""You are an AI …","""Determine if t…","""The sentence ""…","""flan"""
"""t0.750125""","""You are an AI …","""Question: cats…","""The correct an…","""t0"""
"""t0.1336221""","""You are an AI …","""Answer the fol…","""This product r…","""t0"""


# Preprocess Data

In [54]:
%autoreload

# Reuse the cached preprocessed frame when caching is enabled and the file
# exists; otherwise run preprocess_data on df_raw with the settings from config.
# Logical `and` (instead of bitwise `&`) short-circuits and is the idiomatic
# way to combine two truth values.
# NOTE(review): nothing in this cell writes cache_file_preprocessed; the cache
# file has to be produced elsewhere.
if use_cache and cache_file_preprocessed.is_file():
    df = pl.read_parquet(cache_file_preprocessed)
else:
    df = preprocess_data(
        df=df_raw,
        n_total=config.n_total,
        instruction_sources=config.instruction_sources,
        common_postfixes=config.common_postfixes,
        common_prefixes=config.common_prefixes,
        seed=config.seed,
    )


In [331]:
# Print one sampled example (id, system prompt, question, response). The return
# value is bound to `_` so the cell does not also render it as output.
_ = sample_and_print_example(df)

ID
cot.45481

SYSTEM PROMPT:

You are an AI assistant that helps people find information. User will
you give you a question. Your task is to answer as faithfully as you
can. While answering think step-bystep and justify your answer.

QUESTION:

Premise: "A man with black hair and gray pants is standing holding a
helmet and looks very dirty." Hypothesis: "A woman without a purse
rides a gorilla." Do we know that the hypothesis entailed by the
premise?. Le: OK, so how can I answer with some explanation?

RESPONSE:

Based on the given premise, we can conclude that a man with black hair
and gray pants is standing holding a helmet and looks very dirty.
However, there is no information provided in the premise that can lead
us to the hypothesis that a woman without a purse rides a gorilla.
Therefore, we cannot say that the hypothesis is entailed by the
premise.  In logic, entailment means that the truth of one statement
logically guarantees the truth of another statement. In this case,
there 

In [38]:
%autoreload
from skolegpt_instruct_dataset.utils import return_filter_char_list
def contains_characters(text, characters):
    """Return True if any item of `characters` occurs in `text`.

    Args:
        text: String to search in.
        characters: Iterable of strings (typically single characters) to look for.

    Returns:
        True as soon as one item is found as a substring of `text`; False when
        none is found or `characters` is empty.
    """
    return any(candidate in text for candidate in characters)

# Build the list of characters to filter on once, instead of calling
# return_filter_char_list() again for every row inside the lambda.
filter_chars = return_filter_char_list()


def _has_filter_char(text):
    """Return True if `text` contains any character from `filter_chars`."""
    return contains_characters(text=text, characters=filter_chars)


# Drop rows whose question contains a filtered character, then rows whose
# response does. return_dtype is given explicitly so polars does not have to
# infer the mask type from the Python return values.
df_filt = df.filter(~df["question"].map_elements(_has_filter_char, return_dtype=pl.Boolean))
df_filt = df_filt.filter(~df_filt["response"].map_elements(_has_filter_char, return_dtype=pl.Boolean))

# Report how many rows survive and show one of them.
print(len(df_filt))
sample_and_print_example(df_filt)

87808
ID
cot.88876

SYSTEM PROMPT:

You are an AI assistant that helps people find information.

QUESTION:

Stream of consciousness rationale: Blue outfit is not a red outfit.
Lounging on floor pillow is different to doing jumping jacks. The
question and answer pair are described below.

RESPONSE:

Question: What is the difference between wearing a blue outfit and a
red outfit?  Answer: Wearing a blue outfit is not the same as wearing
a red outfit. They are two different colors and can create different
moods or impressions.  Question: How is lounging on a floor pillow
different from doing jumping jacks?  Answer: Lounging on a floor
pillow is a relaxed and comfortable activity, while doing jumping
jacks is a more active and energetic activity. They involve different
levels of physical exertion and serve different purposes.


id,system_prompt,question,response,source
str,str,str,str,str
"""cot.88876""","""You are an AI …","""Stream of cons…","""Question: What…","""cot"""


# Experimental

In [None]:
import polars as pl

# Regex alternatives for common multiple-choice option markers. Each one is
# made case-insensitive by its inline (?i) flag.
option_patterns = [
    r'(?i)\b[A-D]\)',  # A), B), C), D)
    r'(?i)\b[1-4]\)',  # 1), 2), 3), 4)
    # No leading \b here: a word boundary directly before "(" exists only when
    # the preceding character is a word character, so "\b\(" could not match
    # "(A)" at the start of the text or after whitespace.
    r'(?i)\([A-D]\)',  # (A), (B), (C), (D)
    # NOTE(review): this also matches abbreviations such as "D.C." — see the
    # false-positive concern before relying on it.
    r'(?i)\b[A-D]\.',  # A., B., C., D.
]
combined_option_pattern = '|'.join(option_patterns)

question_col = df["question"]

# A question is treated as multiple choice when it carries an explicit options
# header (plain substring match) or matches any option-marker pattern.
condition = (
    question_col.str.contains("Options:", literal=True)
    | question_col.str.contains("OPT:", literal=True)
    | question_col.str.contains(combined_option_pattern)
)

# Keep only the rows that do not look like multiple choice and show the count.
df_filtered = df.filter(~condition)
len(df_filtered)

In [None]:
%autoreload
# Print a sampled example from the multiple-choice-filtered frame and keep the
# returned sample in `s` so later cells can inspect it.
s = sample_and_print_example(df_filtered)

In [None]:
# Count the remaining rows per `source` value after the multiple-choice filter.
df_filtered["source"].value_counts()

In [None]:
# Show the sampled question as a plain string with surrounding whitespace
# stripped; .item() requires `s` to hold exactly one row.
s["question"].item().strip()

In [None]:
%autoreload
# Run the project's prefix/postfix analysis helper on the preprocessed frame.
# NOTE(review): what it prints or returns is defined in
# skolegpt_instruct_dataset.utils and is not visible here.
analyse_pre_and_postfixes(df)

In [None]:
# TODO next time:
# - Remove multiple-choice questions
# - Remove texts with non-English characters
# - Remove more silly pre/postfixes

In [None]:
# Improvements:
# - Translate Removal: Consider false positives
