In [1]:
# import necessary packages
import os
import re
import json
import random
from collections import Counter

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# !pip install openai
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential

OPENAI_API_KEY = ''

In [None]:
# Load data from googld drive
from google.colab import drive
drive.mount('/content/drive')

path = '/content/drive/MyDrive/pilot_cima/'
os.chdir(path)
print("path=", os.getcwd())

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
path= /content/drive/MyDrive/pilot_cima


In [None]:
# Define utility functions

# Define function to count labels
def count_labels(dataset, key = 'tutorActions'):    
    
    # Extract labels from dataset
    labels = []
    for example in dataset:
        labels += example[key]
        
    # Count label occurrences
    label_counts = Counter(labels)
    
    # Print count for each label
    for label, count in label_counts.items():
        print(label, count)
        
        
# Function to clean predicted labels
def clean_labels(outputs):
    """
    Clean predicted labels.

    Args:
    outputs (list): A list of predicted labels.

    Returns:
    cleaned_labels (list): A list of cleaned labels.
    """
    
    # Define a list of labels
    labels = ['Question', 'Hint', 'Correction', 'Confirmation']

    # Define a special symbol
    special_symbol = '*'

    # Define a regular expression pattern that matches any label in the label list
    label_pattern = '|'.join(labels)

    # Initialise an empty list to store cleaned labels
    cleaned_labels = []

    # Loop through each output and extract the label information
    for output in outputs:
        # Match the label using the regular expression pattern
        match = re.search(label_pattern, output)

        # If there is a match, append the label to the cleaned_labels list
        if match:
            cleaned_labels.append(match.group(0))
        # Otherwise, append the special symbol to the cleaned_labels list
        else:
            cleaned_labels.append(special_symbol)

    return cleaned_labels

In [2]:
# Load data

# The CIMA dataset is from Stasaski, K., Kao, K., & Hearst, M. A. (2020).\
# CIMA: A Large Open Access Dialogue Dataset for Tutoring. 

with open('cima_raw_data.json', 'r') as f:
    data = json.loads(f.read())
    
    # Use 'prepDataset' database
    data = data['prepDataset'] 
    num = len(data.keys())
    print(data['0'].keys(), num)
    
# Convert Python object to DataFrame
df = pd.DataFrame.from_dict(data).T
# Display DataFrame as table in Notebook
# display(df)

dict_keys(['past_convo', 'img', 'prep', 'engPrep', 'obj', 'engObj', 'color', 'engColor', 'grammarRules', 'studentActions', 'tutorResponses', 'tutorActions', 'tutorKeys']) 1135


In [None]:
# Data Preparation

def generate_dataset(dataframe, num_prev_sentences=1):
    # Dialogue actions labels
    tutorActions_labels = ['Question', 'Hint', 'Correction', 'Confirmation', 'Other']

    # create an empty list to store the output
    output = []

    # iterate through the dataframe
    for i in range(len(dataframe)):
        # create a dictionary to store each session
        session = {}

        # get the specified number of previous sentences of the conversation
        if num_prev_sentences == 2:
             session['tutor_initiation'] = dataframe['past_convo'][i][-2]
        session['student'] = dataframe['past_convo'][i][-1]

        # iterate through the tutorActions and tutorResponses, select responses with a single encoding
        single_encoding_responses = []
        actions = dataframe['tutorActions'][i]
        for j, action in enumerate(actions):
            # if the response has multiple encodings or is labeled as "Other", skip it
            if action[-1] == 1 or 'Other' in dataframe['tutorResponses'][i][j]:
                continue
            encoding_count = sum(action[:-1])
            if encoding_count == 1:
                single_encoding_responses.append(dataframe['tutorResponses'][i][j])

        # if there are responses with a single encoding, randomly select one
        if len(single_encoding_responses) > 0:
            index = random.randint(0, len(single_encoding_responses)-1)
            response = single_encoding_responses[index]
            actions = dataframe['tutorActions'][i][dataframe['tutorResponses'][i].index(response)]

            # create 'tutorResponses' and corresponding 'tutorActions' labels
            session['tutor_response'] = response
            session['tutorActions'] = [label for j, label in enumerate(tutorActions_labels[:-1]) if actions[j]]

            output.append(session)

    # write the output to a JSON file
    folder_name = "dataset"

    # check if the folder exists, if not create it
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)

    with open(f'./dataset/n{num_prev_sentences}_sentences_output.json', 'w') as f:
        json.dump(output, f)

    print('Example:\n', output[1])
    print('\nNum. of examples:', len(output))
    
    return output

dataset = generate_dataset(df, num_prev_sentences=1)

Example:
 {'student': "Oh okay! then I'll try e accanto al letto roso.", 'tutor_response': 'Wonderful!', 'tutorActions': ['Confirmation']}

Num. of examples: 1065


In [None]:
# Data splitting

def split_dataset(data, test_size=10):
    tutorActions_labels = ['Question', 'Hint', 'Correction', 'Confirmation']

    # Split the data into training and test sets
    test_data = []
    train_data = []

    for label in tutorActions_labels:
        count = 0
        for sample in data:
            if label in sample['tutorActions'] and count < test_size:
                test_data.append(sample)
                data.remove(sample)
                count += 1

    # Split the training set into samples
    for sample in data:
        train_data.append([sample])

    # Save the test set to a JSON file
    with open(f'./dataset/test_data_{len(test_data)}.json', 'w') as f:
        json.dump(test_data, f)

    # Save the training set to a JSON file (for further fine-tuning experiment)
    with open(f'./dataset/train_data.json', 'w') as f:
        json.dump(train_data, f)

    # Print an example from the test set and the number of examples it contains
    print('Example:\n', test_data[1])
    print('\nNum. of examples:', len(test_data))
    
    return test_data

test_data = split_dataset(dataset, test_size=20)

Example:
 {'student': 'ok', 'tutor_response': 'OK. Could you say it with correct words?', 'tutorActions': ['Question']}

Num. of examples: 80


In [None]:
# Select a model
openai.api_key = OPENAI_API_KEY

@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(20))
def get_completion(messages, model="gpt-3.5-turbo", temperature=0):
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=temperature)
    return response.choices[0].message["content"]


def train_model(test_data, sys_message, prompt_message):
    label_predict = []

    # Extract labels from test data
    label_true = [sample['tutorActions'][0] for sample in test_data]
    
    for i in range(len(test_data)):
        test_data[i]['tutorActions'] = []
        info = test_data[i]
        
        # messages
        prompt = prompt_message + f'```{info}```'
        messages = [
            {'role':'system','content':sys_message},
            {'role':'user','content':prompt}]
        
        # Predict
        predict = get_completion(messages)
        print(i, 'predict = ', predict, 'true = ', label_true[i])

        label_predict.append(predict)
        
    return label_predict

In [None]:
# Training
sys_message = """
Your role is to classify tutor's response to student attempts. \
Students who seek information are not mistaken and do not assign 'Correction' labels.\
Pay more attention to determine the "hint" label.\
Give the final answer without explanation and punctuation."""

prompt_message = """
You will read the information delimited with triple backticks, perform the following steps:

1. Identify student's utterance and tutor's response.
2. Determine whether the tutor is asking an open-ended question with a \
   question mark ('Question'), agreeing with student's words ('Confirmation'), \
   or none of them ('Other').
3. Unless it's not 'Other', label it accordingly and skip other steps. \
   If it's 'Other', proceed to step 4.
4. Determine whether tutor is answering student questions and providing \
   additional information ('Hint') or is explicitly correcting student's mistake ('Correction').
5. Label it accordingly."""

with open('./dataset/test_data_40.json', 'r') as f:
    test_data = json.loads(f.read())

label_predict = train_model(test_data, sys_message, prompt_message)
temp = label_predict

0 predict =  - Student's utterance: "So what's the correct answer?"
- Tutor's response: "Are you sure you have all the words in the right order?"
- Tutor's action: Other

Label: Other true =  Question
1 predict =  - Student's utterance: "blu"
- Tutor's response: "So what do you think the answer is?"
- Label: Question true =  Question
2 predict =  1. Student's utterance: "OK, thanks. Is it "il gatto e vicino al il cane"?"
Tutor's response: "close, what color is the dog?"

2. The tutor is asking an open-ended question with a question mark. Hence, the label is 'Question'.

3. Label: Question

 true =  Question
3 predict =  Student: "what is behind?"
Tutor: "dietro"

Label: Hint true =  Question
4 predict =  1. Student's utterance: "e dentro la rosa"
Tutor's response: "what is the word for box in italian"

2. Tutor's response is a 'Question'.

3. Label: 'Question'

 true =  Question
5 predict =  Student: what is red?
Tutor: Is it rosso or rossi?
Label: Question true =  Question
6 predict =

In [None]:
# Load test data from file
with open('/content/drive/MyDrive/pilot_cima/dataset/test_data_40.json', 'r') as f:
    test_data = json.load(f)

# Extract labels from test data
y_true = [sample['tutorActions'][0] for sample in test_data]

# Clean predicted labels
y_predict = clean_labels(temp)

# Remove samples with '*' label from evaluation
y_true_filtered, y_predict_filtered = [], []
for true, pred in zip(y_true, y_predict):
    if not pred.startswith('*'):
        y_true_filtered.append(true)
        y_predict_filtered.append(pred)

# Compute classification report
# target_names = ['Question', 'Hint', 'Correction', 'Confirmation']
print(classification_report(y_true_filtered, y_predict_filtered, target_names=target_names))

              precision    recall  f1-score   support

Confirmation       0.80      0.57      0.67         7
  Correction       0.73      0.80      0.76        10
        Hint       0.64      0.70      0.67        10
    Question       0.78      0.78      0.78         9

    accuracy                           0.72        36
   macro avg       0.74      0.71      0.72        36
weighted avg       0.73      0.72      0.72        36

