In [None]:
# The Colab environment does not have this library.
!pip install openai

Collecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
llmx 0.0.15a0 requires cohere, which is not installed.
llmx 0.0.15a0 requires tiktoken, which is not installed.[0m[31m
[0mSuccessfully installed openai-0.28.1


In [None]:
# import necessary packages
import os
import re
import json
import random
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, cohen_kappa_score

import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential

In [None]:
# Load data from googld drive
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/pilot_cima/'
os.chdir(path)
print("path=", os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
path= /content/drive/MyDrive/pilot_cima


In [None]:
# Set the API key
with open('openai_key.txt', 'r') as file:
    OPENAI_API_KEY = file.read().strip()

openai.api_key = OPENAI_API_KEY

# using a random seed to ensure reproducibility
random_seed = 27

In [None]:
# Define utility functions

# Define function to count labels
def count_labels(dataset, key = 'tutorActions'):

    # Extract labels from dataset
    labels = []
    for example in dataset:
        labels += example[key]

    # Count label occurrences
    label_counts = Counter(labels)

    # Print count for each label
    for label, count in label_counts.items():
        print(label, count)


# Function to clean predicted labels - [backward matching]
def clean_labels(outputs):

    # Define a list of labels
    labels = ['Question', 'Hint', 'Correction', 'Confirmation']

    # Define a special symbol
    special_symbol = '*'

    # Define a regular expression pattern that matches any label in the label list
    label_pattern = '|'.join(labels)

    # Initialise an empty list to store cleaned labels
    cleaned_labels = []

    # Loop through each output in the outputs list and extract the label information
    for output in outputs:

        # Find all labels in the output using the regular expression pattern
        matches = re.findall(label_pattern, output)

        # If there are matches, append the last match to the cleaned_labels list
        if matches:
            last_match = matches[-1]
            cleaned_labels.append(last_match)
        # If there are no matches, append the special symbol to the cleaned_labels list
        else:
            cleaned_labels.append(special_symbol)

    return cleaned_labels

In [None]:
# Load data

# The CIMA dataset is from Stasaski, K., Kao, K., & Hearst, M. A. (2020).\
# CIMA: A Large Open Access Dialogue Dataset for Tutoring.

with open('cima_raw_data.json', 'r') as f:
    data = json.loads(f.read())

    # Use 'prepDataset' database
    data = data['prepDataset']
    num = len(data.keys())
    print(data['0'].keys(), num)

# Convert Python object to DataFrame
df = pd.DataFrame.from_dict(data).T

# Display DataFrame as table in Notebook
display(df)

dict_keys(['past_convo', 'img', 'prep', 'engPrep', 'obj', 'engObj', 'color', 'engColor', 'grammarRules', 'studentActions', 'tutorResponses', 'tutorActions', 'tutorKeys']) 1135


Unnamed: 0,past_convo,img,prep,engPrep,obj,engObj,color,engColor,grammarRules,studentActions,tutorResponses,tutorActions,tutorKeys
0,"[""Pink"" is ""rosa"". Please try to fill in the b...","""pictures/dog_behind_pink_tree.png""",e dietro,is behind the,l'albero,tree,rosa,pink,"[[""l' (\""the\"") is prepended to the following ...","[False, True, False, False]",[Look at your order of words again. Adjectives...,"[[False, False, False, False, True], [True, Fa...",[]
1,"[Please try to fill in the blank in Italian. ,...","""pictures/plant_next_pink_bed.png""",e accanto al,is next to the,letto,bed,rosa,pink,"[[""al (\""to the\"", or \""of the\"") is used when...","[True, False, False, False]","[Wonderful!, Very good - you are almost comple...","[[False, False, False, True, False], [False, F...",[]
2,"[Please try to fill in the blank in Italian. ,...","""pictures/cat_next_blue_tree.png""",e vicino,is next to the,all'albero,tree,blu,blue,"[[""all' (\""to the\"" is prepended to the follow...","[False, False, True, False]",[So close! Look at the word tree and double ch...,"[[False, False, False, True, False], [True, Fa...",[]
3,"[Please try to fill in the blank in Italian. ,...","""pictures/cat_next_yellow_bunny.png""",e accanto al,is next to the,coniglio,bunny,giallo,yellow,"[[""al (\""to the\"", or \""of the\"") is used when...","[True, False, False, False]","[Well let's look at your last word ""amarillo""....","[[True, False, False, False, False], [True, Fa...",[]
4,"[Please try to fill in the blank in Italian. ,...","""pictures/plant_next_yellow_bed.png""",e accanto al,is next to the,letto,bed,giallo,yellow,"[[""al (\""to the\"", or \""of the\"") is used when...","[False, False, True, False]","[The ""is next to the"" is e accanto al., No, th...","[[False, True, False, False, False], [False, F...",[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1130,"[Please try to fill in the blank in Italian. ,...","""pictures/bunny_front_purple_tree.png""",e di fronte,is in front of the,all'albero,tree,viola,purple,"[[""all' (to the is prepended to the following ...","[True, False, False, False]","[Not quite. The word for ""tree"" is ""all'alber...","[[False, True, True, False, False]]",[]
1131,"[Please try to fill in the blank in Italian. ,...","""pictures/cat_on_yellow_bed.png""",e in cima al,is on top of the,letto,bed,giallo,yellow,"[[""al (to the, or of the) is used when the fol...","[False, True, False, False]","[I will help you with that, ""on top of"" transl...","[[False, True, False, False, False]]",[]
1132,"[Please try to fill in the blank in Italian. ,...","""pictures/bunny_front_pink_box.png""",e di fronte alla,is in front of the,scatola,box,rosa,pink,"[[""alla (to the) is used when the following wo...","[False, True, False, False]","[OK, we can break this down one by one. In fro...","[[False, True, False, False, False]]",[]
1133,[Green is verde. Please try to fill in the bla...,"""pictures/plant_inside_green_box.png""",e dentro la,is inside of the,scatola,box,verde,green,"[[""Prepositional phrases separate the two noun...","[False, True, False, False]",[Inside of is dentro la. Do you know green box?],"[[False, True, False, False, False]]",[]


In [None]:
# Data Preparation (with two previous turns)

def generate_dataset(dataframe):

    # Dialogue actions labels
    tutorActions_labels = ['Question', 'Hint', 'Correction', 'Confirmation', 'Other']

    # create an empty list to store the output
    output = []

    # iterate through the dataframe
    for i in range(len(dataframe)):
        # create a dictionary to store each session
        session = {}

        # get the specified number of previous sentences of the conversation
        session['tutor_initiation'] = dataframe['past_convo'][i][-2]
        session['student'] = dataframe['past_convo'][i][-1]

        # iterate through the tutorActions and tutorResponses, select responses with a single encoding
        single_encoding_responses = []
        actions = dataframe['tutorActions'][i]

        # if the response has multiple encodings or is labeled as "Other", skip it
        for j, action in enumerate(actions):
            if action[-1] == 1 or 'Other' in dataframe['tutorResponses'][i][j]:
                continue
            encoding_count = sum(action[:-1])
            if encoding_count == 1:
                single_encoding_responses.append(dataframe['tutorResponses'][i][j])

        # if there are responses with a single encoding, randomly select one (since more than one coworks may reply to the same student's utternace)
        if len(single_encoding_responses) > 0:
            index = random.randint(0, len(single_encoding_responses)-1)
            response = single_encoding_responses[index]
            actions = dataframe['tutorActions'][i][dataframe['tutorResponses'][i].index(response)]

            # create 'tutorResponses' and corresponding 'tutorActions' labels
            session['tutor_response'] = response
            session['tutorActions'] = [label for j, label in enumerate(tutorActions_labels[:-1]) if actions[j]]

            output.append(session)

    print('Example:\n', output[1])
    print('\nNum. of examples:', len(output))

    return output

prepared_data = generate_dataset(df)

Example:
 {'tutor_initiation': 'Okay, I\'ll give you a hint.  "bed" is  "letto"', 'student': "Oh okay! then I'll try e accanto al letto roso.", 'tutor_response': 'Wonderful!', 'tutorActions': ['Confirmation']}

Num. of examples: 1065


In [None]:
# Sampling
random.seed(random_seed)

# Define a function to sample data
def sample_dataset(data, category_size = 10):

    tutorActions_labels = ['Question', 'Hint', 'Correction', 'Confirmation']

    sampled_data = []

    for label in tutorActions_labels:
        count = 0
        for sample in data:
            if label in sample['tutorActions'] and count < category_size:
                sampled_data.append(sample)
                data.remove(sample)
                count += 1

    # Print an example from the test set and the number of examples it contains
    print('\nNum. of examples:', len(sampled_data))
    print('Example:\n', sampled_data[0])

    return sampled_data

In [None]:
# Prompts (sys_message)

def get_sys_message(sys_message_num, num_pre_turn):

  variations = ['Identify tutor\'s response.',
                'Identify student\'s utterance and tutor\'s response.',
                'Identify tutor\'s initiation, student\'s utterance, and tutor\'s response.']

  sys_message_dict = {}

  sys_message_dict['1'] = """
  You will be given a snippet of a tutor-student conversation enclosed within triple backticks.
  Your job is to carefully read this information line by line, and then \
provide a tag of the TUTOR_RESPONSE from the following list.
  Choose ONLY ONE best tag from the list of tags provided here.

  - Question
  - Hint
  - Correction
  - Confirmation
  """

  sys_message_dict['2'] = """
  You will be given a snippet of a tutor-student conversation enclosed within triple backticks.
  Your job is to carefully read this information line by line, and then \
provide a tag of the TUTOR_RESPONSE from the following list.
  After the colon, each tag's explanation is attached
  Choose ONLY ONE best tag from the list of tags provided here.

  - Question: The tutor is asking an open-ended question usually with a question mark
  - Hint: The tutor is answering a student's question and/or providing additional information
  - Correction: The tutor is explicitly correcting student's mistake
  - Confirmation: The tutor is agreeing with student's words

  """

  sys_message_dict['3'] = f"""
  You will be given a snippet of a tutor-student conversation enclosed within triple backticks.

  Your job is to perform the following steps:

  1. {variations[num_pre_turn]}
  2. Determine whether the tutor action is 'Question', 'Confirmation', \
or none of them ('Other').
  3. Unless it's NOT 'Other', choose ONLY ONE tag from 'Question' and 'Confirmation' and skip other steps. \
If it's 'Other', proceed to step 4.
  4. Determine whether the tutor action is 'Hint' or 'Correction'.
  5. Choose ONLY ONE tag from 'Hint' and 'Correction'
  """

  sys_message_dict['4'] = f"""
  You will be given a snippet of a tutor-student conversation enclosed within triple backticks.
  Your job is to perform the following steps:

  1. {variations[num_pre_turn]}
  2. Determine whether the tutor is asking an open-ended question usually with a \
question mark ('Question'), agreeing with student's words ('Confirmation'), \
or none of them ('Other').
  3. Unless it's NOT 'Other', choose ONLY ONE tag from 'Question' and 'Confirmation' and skip other steps. \
If it's 'Other', proceed to step 4.
  4. Determine whether tutor is answering a student's question and/or providing \
additional information ('Hint') or is explicitly correcting student's mistake ('Correction').
  5. Choose ONLY ONE tag from 'Hint' and 'Correction'
  """

  sys_message = sys_message_dict[str(sys_message_num)]

  return sys_message

In [None]:
# from the sample to generate the user message

def get_user_message(sample, num_pre_turn):

  line_keys = ['tutor_initiation', 'student', 'tutor_response']
  user_message = ''

  for key in line_keys[(2-num_pre_turn):]:
    user_message = user_message + f'{key}: {sample[key]} \n'

  # print('user_message is: \n', user_message)

  return user_message

In [59]:
# Define the function to implement the model

@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(20))

def get_completion(messages, model, temperature=0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature)
    return response.choices[0].message["content"]


def implement_model(samples, Model, sys_message_num, num_pre_turn):

    label_predict = []

    # print an example
    print('Example:\n',
          f'sys_message = {get_sys_message(sys_message_num, num_pre_turn)}',
          f'user_message = {get_user_message(samples[0], num_pre_turn)}\n')

    # Imprement the model to machine tag
    for i in range(len(samples)):

        print('\n', f'Processing the {i}-th sample \n')

        sys_message = get_sys_message(sys_message_num, num_pre_turn)
        user_message = get_user_message(samples[i], num_pre_turn)

        messages = [
            {'role':'system','content':sys_message},
            {'role':'user','content':user_message}]

        # Predict
        predict = get_completion(messages, model= Model)

        # Print the conversation and the predicted result
        print('predict = ', predict, '\ntrue = ', label_true[i])

        label_predict.append(predict)

    return label_predict

In [None]:
# Sampling
CategorySize = 20  # the number of samples in each category
sampled_data = sample_dataset(prepared_data, CategorySize)

# Extracting labels from sampled data
label_true = [sample['tutorActions'][0] for sample in sampled_data]


Num. of examples: 80
Example:
 {'tutor_initiation': 'l\' ("the" is prepended to the following word when it begins with a vowel.', 'student': "So what's the correct answer?", 'tutor_response': 'Are you sure you have all the words in the right order?', 'tutorActions': ['Question']}


In [68]:
# Set variables
Model = "gpt-4" # select a model 'gpt-3.5-turbo', 'gpt-4'
Sys_message_num = 4  # the number of prompts, 1-basic, 2-elaborative, 3-stepwise, 4-combined
Num_pre_turn = 2  # the number of previous sentences to be considered, 0,1,2

# open data
# with open('25_OCT_sample_data.json', 'w') as file:
#    json.dump(sampled_data, file)

# Implementing
label_predict = implement_model(sampled_data, Model, Sys_message_num, Num_pre_turn)

Example:
 sys_message = 
  You will be given a snippet of a tutor-student conversation enclosed within triple backticks.
  Your job is to perform the following steps:

  1. Identify tutor's initiation, student's utterance, and tutor's response.
  2. Determine whether the tutor is asking an open-ended question usually with a question mark ('Question'), agreeing with student's words ('Confirmation'), or none of them ('Other').
  3. Unless it's NOT 'Other', choose ONLY ONE tag from 'Question' and 'Confirmation' and skip other steps. If it's 'Other', proceed to step 4.
  4. Determine whether tutor is answering a student's question and/or providing additional information ('Hint') or is explicitly correcting student's mistake ('Correction').
  5. Choose ONLY ONE tag from 'Hint' and 'Correction'
   user_message = tutor_initiation: l' ("the" is prepended to the following word when it begins with a vowel. 
student: So what's the correct answer? 
tutor_response: Are you sure you have all the wor

In [69]:
# Evaluation

# true labels
y_true = label_true

# Clean predicted labels
y_predict = clean_labels(label_predict)

# Remove samples with '*' label from evaluation
y_true_filtered, y_predict_filtered = [], []
for true, pred in zip(y_true, y_predict):
    if not pred.startswith('*'):
        y_true_filtered.append(true)
        y_predict_filtered.append(pred)

# Compute classification report
classification_rep = classification_report(y_true_filtered, y_predict_filtered)

# Calculate Cohen's Kappa
cohen_kappa = 0
cohen_kappa = cohen_kappa_score(y_true_filtered, y_predict_filtered)

# Print the classification report and Cohen's Kappa
# 1-basic, 2-elaborative, 3-stepwise, 4-combined
print(f'model = {Model}, sys_message_num = {Sys_message_num}, and num_pre_turn = {Num_pre_turn}\n')
print(classification_rep)
print(f"Cohen's Kappa: {cohen_kappa}")

model = gpt-4, sys_message_num = 4, and num_pre_turn = 2

              precision    recall  f1-score   support

Confirmation       0.88      0.88      0.88        17
  Correction       0.83      0.88      0.86        17
        Hint       0.79      0.75      0.77        20
    Question       0.71      0.71      0.71        17

    accuracy                           0.80        71
   macro avg       0.80      0.81      0.80        71
weighted avg       0.80      0.80      0.80        71

Cohen's Kappa: 0.7368281705056924
