In [None]:
# The Colab environment does not have this library.
!pip install openai



In [1]:
# import necessary packages
import os
import re
import json
import random
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report


import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential

In [2]:
# Load data from googld drive
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/pilot_cima/'
os.chdir(path)
print("path=", os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
path= /content/drive/MyDrive/pilot_cima


In [3]:
# Set the API key
with open('openai_key.txt', 'r') as file:
    OPENAI_API_KEY = file.read().strip()

openai.api_key = OPENAI_API_KEY

In [13]:
# Set variables

random_seed = 27 # using a random seed to ensure reproducibility
num_prev_sentences = 1  # the number of previous sentences to be considered
CategorySize = 10  # the number of samples in each category
Model = "gpt-4" # select a model 'gpt-3.5-turbo', 'gpt-4'

In [5]:
# Define utility functions

# Define function to count labels
def count_labels(dataset, key = 'tutorActions'):

    # Extract labels from dataset
    labels = []
    for example in dataset:
        labels += example[key]

    # Count label occurrences
    label_counts = Counter(labels)

    # Print count for each label
    for label, count in label_counts.items():
        print(label, count)


# Function to clean predicted labels
def clean_labels(outputs):
    """
    Clean predicted labels.

    Args:
    outputs (list): A list of predicted labels.

    Returns:
    cleaned_labels (list): A list of cleaned labels.
    """

    # Define a list of labels
    labels = ['Question', 'Hint', 'Correction', 'Confirmation']

    # Define a special symbol
    special_symbol = '*'

    # Define a regular expression pattern that matches any label in the label list
    label_pattern = '|'.join(labels)

    # Initialise an empty list to store cleaned labels
    cleaned_labels = []

    # Loop through each output and extract the label information
    for output in outputs:
        # Match the label using the regular expression pattern
        match = re.search(label_pattern, output)

        # If there is a match, append the label to the cleaned_labels list
        if match:
            cleaned_labels.append(match.group(0))
        # Otherwise, append the special symbol to the cleaned_labels list
        else:
            cleaned_labels.append(special_symbol)

    return cleaned_labels

In [6]:
# Load data

# The CIMA dataset is from Stasaski, K., Kao, K., & Hearst, M. A. (2020).\
# CIMA: A Large Open Access Dialogue Dataset for Tutoring.

with open('cima_raw_data.json', 'r') as f:
    data = json.loads(f.read())

    # Use 'prepDataset' database
    data = data['prepDataset']
    num = len(data.keys())
    print(data['0'].keys(), num)

# Convert Python object to DataFrame
df = pd.DataFrame.from_dict(data).T

# Display DataFrame as table in Notebook
display(df)

dict_keys(['past_convo', 'img', 'prep', 'engPrep', 'obj', 'engObj', 'color', 'engColor', 'grammarRules', 'studentActions', 'tutorResponses', 'tutorActions', 'tutorKeys']) 1135


Unnamed: 0,past_convo,img,prep,engPrep,obj,engObj,color,engColor,grammarRules,studentActions,tutorResponses,tutorActions,tutorKeys
0,"[""Pink"" is ""rosa"". Please try to fill in the b...","""pictures/dog_behind_pink_tree.png""",e dietro,is behind the,l'albero,tree,rosa,pink,"[[""l' (\""the\"") is prepended to the following ...","[False, True, False, False]",[Look at your order of words again. Adjectives...,"[[False, False, False, False, True], [True, Fa...",[]
1,"[Please try to fill in the blank in Italian. ,...","""pictures/plant_next_pink_bed.png""",e accanto al,is next to the,letto,bed,rosa,pink,"[[""al (\""to the\"", or \""of the\"") is used when...","[True, False, False, False]","[Wonderful!, Very good - you are almost comple...","[[False, False, False, True, False], [False, F...",[]
2,"[Please try to fill in the blank in Italian. ,...","""pictures/cat_next_blue_tree.png""",e vicino,is next to the,all'albero,tree,blu,blue,"[[""all' (\""to the\"" is prepended to the follow...","[False, False, True, False]",[So close! Look at the word tree and double ch...,"[[False, False, False, True, False], [True, Fa...",[]
3,"[Please try to fill in the blank in Italian. ,...","""pictures/cat_next_yellow_bunny.png""",e accanto al,is next to the,coniglio,bunny,giallo,yellow,"[[""al (\""to the\"", or \""of the\"") is used when...","[True, False, False, False]","[Well let's look at your last word ""amarillo""....","[[True, False, False, False, False], [True, Fa...",[]
4,"[Please try to fill in the blank in Italian. ,...","""pictures/plant_next_yellow_bed.png""",e accanto al,is next to the,letto,bed,giallo,yellow,"[[""al (\""to the\"", or \""of the\"") is used when...","[False, False, True, False]","[The ""is next to the"" is e accanto al., No, th...","[[False, True, False, False, False], [False, F...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,"[Please try to fill in the blank in Italian. ,...","""pictures/bunny_front_purple_tree.png""",e di fronte,is in front of the,all'albero,tree,viola,purple,"[[""all' (to the is prepended to the following ...","[True, False, False, False]","[Not quite. The word for ""tree"" is ""all'alber...","[[False, True, True, False, False]]",[]
1131,"[Please try to fill in the blank in Italian. ,...","""pictures/cat_on_yellow_bed.png""",e in cima al,is on top of the,letto,bed,giallo,yellow,"[[""al (to the, or of the) is used when the fol...","[False, True, False, False]","[I will help you with that, ""on top of"" transl...","[[False, True, False, False, False]]",[]
1132,"[Please try to fill in the blank in Italian. ,...","""pictures/bunny_front_pink_box.png""",e di fronte alla,is in front of the,scatola,box,rosa,pink,"[[""alla (to the) is used when the following wo...","[False, True, False, False]","[OK, we can break this down one by one. In fro...","[[False, True, False, False, False]]",[]
1133,[Green is verde. Please try to fill in the bla...,"""pictures/plant_inside_green_box.png""",e dentro la,is inside of the,scatola,box,verde,green,"[[""Prepositional phrases separate the two noun...","[False, True, False, False]",[Inside of is dentro la. Do you know green box?],"[[False, True, False, False, False]]",[]


In [7]:
# Data Preparation

def generate_dataset(dataframe, num_prev_sentences):

    # Dialogue actions labels
    tutorActions_labels = ['Question', 'Hint', 'Correction', 'Confirmation', 'Other']

    # create an empty list to store the output
    output = []

    # iterate through the dataframe
    for i in range(len(dataframe)):
        # create a dictionary to store each session
        session = {}

        # get the specified number of previous sentences of the conversation
        if num_prev_sentences == 2:
             session['tutor_initiation'] = dataframe['past_convo'][i][-2]
        session['student'] = dataframe['past_convo'][i][-1]

        # iterate through the tutorActions and tutorResponses, select responses with a single encoding
        single_encoding_responses = []
        actions = dataframe['tutorActions'][i]
        for j, action in enumerate(actions):
            # if the response has multiple encodings or is labeled as "Other", skip it
            if action[-1] == 1 or 'Other' in dataframe['tutorResponses'][i][j]:
                continue
            encoding_count = sum(action[:-1])
            if encoding_count == 1:
                single_encoding_responses.append(dataframe['tutorResponses'][i][j])

        # if there are responses with a single encoding, randomly select one (since more than one coworks may reply to the same student's utternace)
        if len(single_encoding_responses) > 0:
            index = random.randint(0, len(single_encoding_responses)-1)
            response = single_encoding_responses[index]
            actions = dataframe['tutorActions'][i][dataframe['tutorResponses'][i].index(response)]

            # create 'tutorResponses' and corresponding 'tutorActions' labels
            session['tutor_response'] = response
            session['tutorActions'] = [label for j, label in enumerate(tutorActions_labels[:-1]) if actions[j]]

            output.append(session)

    # write the output to a JSON file
    folder_name = "dataset"

    # check if the folder exists, if not create it
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    with open(f'./dataset/n{num_prev_sentences}_sentences_output.json', 'w') as f:
        json.dump(output, f)

    print('Example:\n', output[1])
    print('\nNum. of examples:', len(output))

    return output

dataset = generate_dataset(df, num_prev_sentences)

Example:
 {'student': "Oh okay! then I'll try e accanto al letto roso.", 'tutor_response': 'Wonderful!', 'tutorActions': ['Confirmation']}

Num. of examples: 1065


In [8]:
# Sampling

random.seed(random_seed)

# Define a function to sample data
def sample_dataset(data, category_size= CategorySize):

    tutorActions_labels = ['Question', 'Hint', 'Correction', 'Confirmation']

    sampled_data = []

    for label in tutorActions_labels:
        count = 0
        for sample in data:
            if label in sample['tutorActions'] and count < category_size:
                sampled_data.append(sample)
                data.remove(sample)
                count += 1

#    # Save the test set to a JSON file
#    with open(f'./dataset/sampled_data_{len(sampled_data)}.json', 'w') as f:
#        json.dump(sampled_data, f)

    # Print an example from the test set and the number of examples it contains
    print('\nNum. of examples:', len(sampled_data))
    print('Example:\n', sampled_data[-1])

    return sampled_data

sampled_data = sample_dataset(dataset, CategorySize)

# Extract labels from sampled data
label_true = [sample['tutorActions'][0] for sample in sampled_data]


Num. of examples: 40
Example:
 {'student': 'il coniglio è dietro il gatto giallo', 'tutor_response': "Good that's correct.", 'tutorActions': ['Confirmation']}


In [33]:
# Define the function to implement the model

# @retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(20))

def get_completion(messages, model= Model, temperature=0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature)
    return response.choices[0].message["content"]

def implement_model(input_data, sys_message):

    label_predict = []

    # Imprement the model to machine tag
    for i in range(len(input_data)):

        print('\n', f'Processing the {i}-th sample')

        # messages
        input_data[i]['tutorActions'] = []
        input = input_data[i]

        # Format the dictionary input
        formatted_str = ', '.join([f"'{key}': '{value}'" for key, value in input.items() if value])
        user_message = formatted_str

        messages = [
            {'role':'system','content':sys_message},
            {'role':'user','content':user_message}]

        # Predict
        predict = get_completion(messages)

        # Print the conversation and the predicted result
        print(user_message)
        print(i, 'predict = ', predict, 'true = ', label_true[i])

        label_predict.append(predict)

    return label_predict


In [36]:
# Prompts

sys_message1 = """
You will be given a snippet of a tutor-student conversation enclosed within triple backticks,
Your job is to carefully read this information line by line, and then \
provide a tag of the TUTOR_RESPONSE from the following list.
Choose ONLY ONE best tag from the list of tags provided here.

- Question
- Hint
- Correction
- Confirmation
"""

sys_message2 = """
You will be given a snippet of a tutor-student conversation enclosed within triple backticks,
Your job is to carefully read this information line by line, and then \
provide a tag of the TUTOR_RESPONSE from the following list.
After the colon, each tag's explanation is attached
Choose ONLY ONE best tag from the list of tags provided here.

- Question: The tutor is asking an open-ended question usually with a question mark
- Hint: The tutor is answering a student's question and/or providing additional information
- Correction: The tutor is explicitly correcting student's mistake
- Confirmation: The tutor is agreeing with student's words

"""

sys_message3 ="""
You will be given a snippet of a tutor-student conversation enclosed within triple backticks,

Your job is to perform the following steps:

1. Identify student's utterance and tutor's response.
2. Determine whether the tutor action is 'Question','Confirmation', \
   or none of them ('Other').
3. Unless it's NOT 'Other', choose ONLY ONE tag from 'Question' and 'Confirmation' and skip other steps. \
   If it's 'Other', proceed to step 4.
4. Determine whether the tutor action is 'Hint' or 'Correction'.
5. Choose ONLY ONE tag from 'Hint' and 'Correction'
"""

sys_message4 = """
You will be given a snippet of a tutor-student conversation enclosed within triple backticks,

Your job is to perform the following steps:

1. Identify student's utterance and tutor's response.
2. Determine whether the tutor is asking an open-ended question usually with a \
   question mark ('Question'), agreeing with student's words ('Confirmation'), \
   or none of them ('Other').
3. Unless it's NOT 'Other', choose ONLY ONE tag from 'Question' and 'Confirmation' and skip other steps. \
   If it's 'Other', proceed to step 4.
4. Determine whether tutor is answering a student's question and/or providing \
   additional information ('Hint') or is explicitly correcting student's mistake ('Correction').
5. Choose ONLY ONE tag from 'Hint' and 'Correction'

"""


# Implementing

data = sampled_data
label_predict = implement_model(data, sys_message3)


 Processing the 0-th sample
'student': 'So what's the correct answer?', 'tutor_response': 'Are you sure you have all the words in the right order?'
0 predict =  1. Student's utterance: 'So what's the correct answer?'
   Tutor's response: 'Are you sure you have all the words in the right order?'
2. The tutor action is 'Question'. true =  Question

 Processing the 1-th sample
'student': 'ok', 'tutor_response': 'OK. Could you say it with correct words?'
1 predict =  1. Student's utterance: 'ok'
   Tutor's response: 'OK. Could you say it with correct words?'

2. The tutor action is: 'Question' true =  Question

 Processing the 2-th sample
'student': 'OK, thanks. Is it "il gatto e vicino al il cane"?', 'tutor_response': 'How do you say red?'
2 predict =  1. Student's utterance: 'OK, thanks. Is it "il gatto e vicino al il cane"?'
   Tutor's response: 'How do you say red?'

2. The tutor action is 'Question'. true =  Question

 Processing the 3-th sample
'student': 'e dentro la rosa', 'tutor_

In [37]:
# Load test data from file
# with open('/content/drive/MyDrive/pilot_cima/dataset/test_data_40.json', 'r') as f:
#    test_data = json.load(f)

# Extract labels from test data
y_true = label_true

# Clean predicted labels
y_predict = clean_labels(label_predict)

# Remove samples with '*' label from evaluation
y_true_filtered, y_predict_filtered = [], []
for true, pred in zip(y_true, y_predict):
    if not pred.startswith('*'):
        y_true_filtered.append(true)
        y_predict_filtered.append(pred)

# Compute classification report
print(classification_report(y_true_filtered, y_predict_filtered))

              precision    recall  f1-score   support

Confirmation       0.89      0.80      0.84        10
  Correction       0.17      0.10      0.12        10
        Hint       1.00      0.30      0.46        10
    Question       0.41      0.90      0.56        10

    accuracy                           0.53        40
   macro avg       0.62      0.53      0.50        40
weighted avg       0.62      0.53      0.50        40



In [None]:
gpt_3_result = classification_report(y_true_filtered, y_predict_filtered)

In [None]:
print(gpt_3_result)

              precision    recall  f1-score   support

Confirmation       0.82      0.90      0.86        10
  Correction       0.40      0.80      0.53        10
        Hint       0.33      0.10      0.15        10
    Question       1.00      0.60      0.75        10

    accuracy                           0.60        40
   macro avg       0.64      0.60      0.57        40
weighted avg       0.64      0.60      0.57        40

