In [None]:
import logging
import os
from datetime import datetime

# Name of this notebook; log files are grouped under logs/<current_file_name>/.
current_file_name = "8_Transcripts_Processing_GPT"

# One log file per run, named after the timestamp at which this cell was executed.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# basicConfig opens the log file right away and raises FileNotFoundError when the
# parent directory is missing, so make sure the directory exists first.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file, filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

from openai import OpenAI

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.questions import *
from helpers.utils import *

In [None]:
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option("display.max_columns", 500)

In [None]:
# Read the API key from a local file; trailing whitespace (e.g. the final newline) is removed.
with open("tokens/openai_key.txt", "r") as key_file:
    OPENAI_API_KEY = key_file.read().rstrip()

# Publish the key through the process environment for the OpenAI client created afterwards.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Shared OpenAI client used by call_gpt and get_embedding; no credentials are passed explicitly here.
client = OpenAI()

In [None]:
# Load the paired UXtweak / SurveyJS answers. NOTE: the path uses Windows-style separators.
paired_answers_raw = pd.read_csv("data\\4_Pair_UXtweak_and_SurveyJS\\4_Pair_UXtweak_and_SurveyJS_data.csv")

In [None]:
# Discard every column whose name matches "_extracted".
extracted_columns = list(paired_answers_raw.filter(regex='_extracted'))
kept_columns = paired_answers_raw.columns.drop(extracted_columns)
paired_answers = paired_answers_raw[kept_columns]

# Strip "_evaluated" from the remaining column names.
paired_answers.columns = paired_answers.columns.str.replace('_evaluated', '')

# Drop columns Submitted and encoded.
paired_answers = paired_answers.drop(columns=["Submitted", "encoded"])

# Cast the Likert answer columns to string before the lookup
# (presumably the lookup dicts are keyed by strings - confirm in helpers).
likert_normal_columns = list(glob_normal_columns)
likert_reversed_columns = list(glob_reversed_columns)
paired_answers[likert_normal_columns] = paired_answers[likert_normal_columns].astype(str)
paired_answers[likert_reversed_columns] = paired_answers[likert_reversed_columns].astype(str)

# Replace numbers with text answers. DataFrame.update only copies non-NA values,
# so an answer that is missing from the lookup keeps its current value.
likert_normal_text = paired_answers[likert_normal_columns].apply(lambda col: col.map(glob_normal_likert_numbers))
paired_answers.update(likert_normal_text)
likert_reversed_text = paired_answers[likert_reversed_columns].apply(lambda col: col.map(glob_reverse_likert_numbers))
paired_answers.update(likert_reversed_text)

In [None]:
# Display the cleaned answers for a visual check.
paired_answers

In [None]:
# 1-based question numbers (positions within glob_big5_questions) selected for elaboration processing.
# They must stay in ascending order: the question texts below are collected in questionnaire
# order and then paired with these numbers by position.
elaborations_indices = [4, 8, 15, 18, 30, 32, 39, 41, 51, 52]

# Question texts found at the selected positions.
elaborations_questions = [
    question
    for number, question in enumerate(glob_big5_questions, start=1)
    if number in elaborations_indices
]

# elaboration_1_1, elaboration_1_2, elaboration_2_1, ..., elaboration_5_2
elaborations_names = [
    f"elaboration_{major}_{minor}"
    for major in range(1, 6)
    for minor in range(1, 3)
]

# Answer column per question: "rbfi<n>" when n is listed in glob_reversed_questions, else "bfi<n>".
elaborations_columns = [
    f"rbfi{number}" if number in glob_reversed_questions else f"bfi{number}"
    for number in elaborations_indices
]

# elaboration name -> (answer column, question text, question number)
elaborations = {
    name: (elaborations_columns[pos], elaborations_questions[pos], elaborations_indices[pos])
    for pos, name in enumerate(elaborations_names)
}
elaborations

In [None]:
def get_dict_of_paths(root_path):
    """Collect transcript files below ``root_path``, grouped by the folder that contains them.

    A file counts as a transcript when its name ends with ``.txt`` and does not
    contain ``_response``.

    Args:
        root_path: Directory that is searched recursively.

    Returns:
        dict mapping a folder name (last component of the folder path) to the
        sorted list of transcript paths found directly inside that folder.
        Folders without any transcript are left out. Two folders with the same
        name in different places share one key; the one visited later wins.
    """
    dict_of_paths = {}
    for root, _dirs, files in os.walk(root_path):
        # Sorted so that the processing order does not depend on the OS directory listing.
        transcripts = sorted(
            os.path.join(root, name)
            for name in files
            if name.endswith(".txt") and "_response" not in name
        )
        if not transcripts:
            # Skip folders that only hold other files; their names are not respondent ids.
            continue

        # os.path.basename understands the platform's separator(s), unlike split("\\").
        folder_name = os.path.basename(root)
        dict_of_paths[folder_name] = transcripts
    return dict_of_paths

In [None]:
# Root folders that are scanned for transcript files, one per variant (FG and H).
# NOTE: the paths use Windows-style separators.
extracted_transcripts_fg_path = "data\\7_3_Combine_Chunks\\FG"
extracted_transcripts_h_path = "data\\7_3_Combine_Chunks\\H"

In [None]:
# Folder name -> list of transcript file paths, for each variant.
fg_paths = get_dict_of_paths(extracted_transcripts_fg_path)
h_paths = get_dict_of_paths(extracted_transcripts_h_path)

In [None]:
def create_list_of_inputs(paired_answers, path_dict, variant):
    """Build one prompt-input record per transcript file.

    Args:
        paired_answers: DataFrame with a "group" column, an "order" column and
            the answer columns referenced by the module-level ``elaborations``
            mapping.
        path_dict: Mapping of respondent folder name to a list of transcript
            file paths. The respondent's order number is parsed from the text
            after the last "_" of the folder name.
        variant: Value of the "group" column that selects the respondent rows.

    Returns:
        list of dicts with the keys "respondent", "variant",
        "elaboration_name", "transcript", "question" and "answer". A transcript
        for which no answer can be looked up is logged as an error and skipped.

    Raises:
        ValueError: If a folder name does not end in "_<integer>".
        KeyError: If a transcript file name (text before the first ".") is not
            a key of ``elaborations``.
    """
    list_of_dicts = []

    # For each respondent, pair every transcript with the question it belongs to
    # and with the answer the respondent gave to that question.
    for respondent, paths in path_dict.items():
        order = int(respondent.split("_")[-1])

        # Rows of this respondent: same variant and same order number.
        answer = paired_answers[(paired_answers["group"] == variant) & (paired_answers["order"] == order)]

        for path in paths:
            # NOTE(review): the file is read with the platform default encoding - confirm
            # that it matches the encoding the transcripts were written with.
            with open(path, "r") as file:
                transcript = file.read()

            # "…/elaboration_1_2.txt" -> "elaboration_1_2"
            elaboration_name = os.path.basename(path).split(".")[0]
            elaboration_column = elaborations[elaboration_name][0]
            elaboration_question = elaborations[elaboration_name][1]

            try:
                respondent_answer = answer[elaboration_column].values[0]
            except (KeyError, IndexError):
                # KeyError: the answer column is missing; IndexError: no row matched the respondent.
                logging.error(f"Respondent {respondent} does not have an answer for {elaboration_name}")
                continue

            list_of_dicts.append({
                "respondent": respondent,
                "variant": variant,
                "elaboration_name": elaboration_name,
                "transcript": transcript,
                "question": elaboration_question,
                "answer": respondent_answer
            })

    return list_of_dicts

In [None]:
# One input record per transcript, built separately for the FG and H variants.
inputs_fg = create_list_of_inputs(paired_answers, fg_paths, "FG")
inputs_h = create_list_of_inputs(paired_answers, h_paths, "H")

In [None]:
@timer
def call_gpt(system_prompt, user_prompt, temperature=0.2):
    """Send one system + user message pair to the chat completions endpoint.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    # Price notes kept from the original author (USD per 1M tokens, input / output):
    #   gpt-3.5-turbo   0,50 /  1,50
    #   gpt-4-turbo    10,00 / 30,00   <- model in use
    #   gpt-4          30,00 / 60,00
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    return client.chat.completions.create(
        model="gpt-4-turbo",
        messages=conversation,
        temperature=temperature,
    )

In [None]:
def sanitaze_response(response_object, possible_answers):
    """Reduce a chat completion to one of ``possible_answers`` where that is unambiguous.

    The model sometimes answers with a whole sentence and sometimes with a
    single word. Two cases are normalised:

    1. The reply contains one of the answers in double quotes (e.g.
       ``The answer is "No".``); the first answer of ``possible_answers``
       found that way is returned.
    2. The whole reply is just an answer, ignoring case, surrounding
       whitespace, quotes and punctuation (e.g. ``Yes.`` or `` yes``).

    Args:
        response_object: Object exposing ``choices[0].message.content``.
        possible_answers: Iterable of accepted answer strings.

    Returns:
        The matching entry of ``possible_answers``; otherwise the reply text
        unchanged (``None`` when the reply has no text content).
    """
    response = response_object.choices[0].message.content
    if response is None:
        # No text content in the reply: nothing to normalise.
        return response

    # Case 1: an answer quoted inside a longer sentence.
    for possible_answer in possible_answers:
        if f'"{possible_answer}"' in response:
            return possible_answer

    # Case 2: the reply consists of the bare answer only.
    bare_reply = response.strip().strip("\"'`.!,;: \t\n").casefold()
    for possible_answer in possible_answers:
        if bare_reply == possible_answer.casefold():
            return possible_answer

    return response

In [None]:
@timer
def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding of ``text`` as produced by the given embeddings model.

    Newlines in ``text`` are replaced by spaces before the request is sent.
    """
    single_line = text.replace("\n", " ")
    result = client.embeddings.create(input = [single_line], model=model)
    return result.data[0].embedding

In [None]:
# Evaluations requested for every transcript, in the order in which they are run and stored.
# Each entry is (result key, accepted answers, system prompt template); "{question}" and
# "{answer}" are filled in per transcript.
# NOTE(review): the prompt texts are intentionally kept character-for-character (including the
# indentation of continuation lines, the doubled "is is" in the hesitation prompt and the stray
# closing quote in the big5 prompt). Changing them changes what the model is asked - decide
# deliberately before cleaning them up.
_TRANSCRIPT_EVALUATIONS = [
    ("big5", ["Yes", "No"], """You are a researcher who is evaluating the personality of the respondent based on the Big Five traits: Openness, Conscientiousness, Extraversion, Agreeableness, and Neuroticism.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the personality-related question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to assess the relevance of the respondent's elaboration in the context of the Big Five traits.
        Does the respondent's elaboration provide useful insights into their personality traits related to the question and their initial answer? Answer with "Yes" if it provides relevant insights into any of the Big Five traits, and "No" if it does not."
        """),
    ("hesitation", ["Yes", "No"], """You are a researcher who is is analyzing the respondent's hesitation in the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to assess whether the respondent's hesitations during their verbal elaboration are indicative of uncertainty, anxiety, or reconsideration of their thoughts.
        Does the respondent's hesitation suggest a deeper psychological process at play regarding the topic in question? Consider the duration, frequency, and context of the hesitations. Answer with 'Yes' if the hesitations are psychologically meaningful, and 'No' if they appear to be normal pauses or unrelated to deeper thought processes.
        """),
    ("relevant", ["Yes", "No"], """You are a researcher who is evaluating the relevance of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the respondent's verbal elaboration.
        Was the respondent's verbal elaboration relevant to the question and the previous answer? Answer with "Yes" or "No".
        """),
    ("quality", ["Good", "Average", "Poor"], """You are a researcher who is evaluating the quality of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the quality of the respondent's verbal elaboration.
        What is the quality of the respondent's verbal elaboration? Answer with "Good", "Average", or "Poor".
        """),
    ("honesty", ["Yes", "No"], """You are a researcher who is evaluating the honesty of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the honesty of the respondent's verbal elaboration.
        Was the respondent's verbal elaboration honest? Answer with "Yes" or "No".
        """),
    ("tone", ["Positive", "Neutral", "Negative"], """You are a researcher who is evaluating the tone of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the tone of the respondent's verbal elaboration.
        What is the tone of the respondent's verbal elaboration? Answer with "Positive", "Neutral", or "Negative".
        """),
    ("language_complexity", ["Simple", "Average", "Complex"], """You are a researcher who is evaluating the language complexity of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the language complexity of the respondent's verbal elaboration.
        What is the language complexity of the respondent's verbal elaboration? Answer with "Simple", "Average", or "Complex".
        """),
    ("linguistic_cues", ["Yes", "No"], """You are a researcher who is evaluating the linguistic cues of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the linguistic cues of the respondent's verbal elaboration.
        You are mainly interested in the linguistic cues such as evasive language, excessive qualifiers, or vague statements that often accompany dishonesty.
        Did the respondent use any linguistic cues that might indicate dishonesty? Answer with "Yes" or "No".
        """),
    ("defensiveness", ["Yes", "No"], """You are a researcher who is evaluating the defensiveness of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the defensiveness of the respondent's verbal elaboration.
        Was the respondent's verbal elaboration defensive? Answer with "Yes" or "No".
        """),
    ("contradictions", ["Yes", "No"], """You are a researcher who is evaluating the contradictions in the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the contradictions in the respondent's verbal elaboration.
        Did the respondent contradict themselves during the verbal elaboration? Answer with "Yes" or "No".
        """),
    ("consistency", ["Yes", "No"], """You are a researcher who is evaluating the consistency of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the consistency of the respondent's verbal elaboration with their previous answers.
        Was the respondent's verbal elaboration consistent with their previous answers? Answer with "Yes" or "No".
        """),
    ("intent", ["Informative", "Evasive", "Defensive"], """You are a researcher who is evaluating the intent of the respondent's verbal elaboration.
        During verbal elaboration, the respondent provided additional information about their previous answer.
        The respondent is reacting to the following question: '''{question}''', and the respondent's initial answer was: '''{answer}'''. Your task is to evaluate the intent of the respondent's verbal elaboration.
        What was the intent of the respondent's verbal elaboration? Answer with "Informative", "Evasive", or "Defensive".
        """),
]


def process_transcripts(inputs, variant):
    """Evaluate every transcript with GPT and write one CSV per respondent.

    For each input record, every evaluation of ``_TRANSCRIPT_EVALUATIONS`` is
    sent to ``call_gpt`` (system prompt = filled-in template, user prompt = the
    transcript) and the sanitized reply is stored under the evaluation's key.
    Afterwards the transcript's embedding is stored under "embedding". That is
    12 chat requests plus 1 embedding request per transcript.

    Args:
        inputs: List of dicts with at least the keys "respondent", "question",
            "answer" and "transcript". The dicts are extended in place.
        variant: Variant name; used as a sub-folder of the output directory.

    Returns:
        None. Writes ``data\\8_Transcripts_Processing_GPT\\<variant>\\<respondent>\\<respondent>.csv``
        ("~"-separated, UTF-8) once all records of a respondent are processed.
    """
    # Group the records per respondent in a single pass. A dict keeps the order in which
    # respondents first appear, so runs are processed in a reproducible order.
    inputs_by_respondent = {}
    for record in inputs:
        inputs_by_respondent.setdefault(record["respondent"], []).append(record)

    for respondent, respondent_inputs in inputs_by_respondent.items():
        for record in respondent_inputs:
            for result_key, possible_answers, prompt_template in _TRANSCRIPT_EVALUATIONS:
                system_prompt = prompt_template.format(question=record["question"], answer=record["answer"])
                response = call_gpt(system_prompt, record["transcript"])
                record[result_key] = sanitaze_response(response, possible_answers)

            record["embedding"] = get_embedding(record["transcript"])

        # Create csv file with all the respondent_inputs.
        # The output folder is created only after every request for this respondent succeeded,
        # so an existing folder means the respondent was fully processed.
        df = pd.DataFrame(respondent_inputs)
        save_path = f"data\\8_Transcripts_Processing_GPT\\{variant}\\{respondent}"
        os.makedirs(save_path, exist_ok=True)
        df.to_csv(f"{save_path}\\{respondent}.csv", index=False, sep="~", encoding="utf-8")

In [None]:
# process_transcripts(inputs_fg, "FG")

In [None]:
# Every directory below data\8_Transcripts_Processing_GPT\H; the first entry yielded by
# os.walk is the root itself and is dropped.
processed_dirs = [dirpath for dirpath, _subdirs, _files in os.walk("data\\8_Transcripts_Processing_GPT\\H")][1:]

# Keep only the last path component (the folder name).
folders = [dirpath.split("\\")[-1] for dirpath in processed_dirs]

folders

In [None]:
# Resume support: drop every input record whose respondent already has an output folder
# (see `folders`). inputs_h is a list of dicts, each with a "respondent" key.
inputs_h = [x for x in inputs_h if x["respondent"] not in folders]

In [None]:
# Run the evaluation for the remaining H respondents. This issues API requests for every transcript.
process_transcripts(inputs_h, "H")