In [None]:
import logging
import os
from datetime import datetime

# Name of this notebook; used as the sub-folder for its log files.
current_file_name = "11_Pause_Defined_Units"

# One log file per run, named after the timestamp at which this cell executed.
dt_string = datetime.now().strftime("%Y%m%d_%H%M%S")
log_file = f"logs/{current_file_name}/{dt_string}.log"

# The log file is opened by basicConfig, which does not create missing parent
# directories, so make sure the folder exists before configuring logging.
os.makedirs(os.path.dirname(log_file), exist_ok=True)
logging.basicConfig(level=logging.INFO, filename=log_file,filemode="w", format="%(asctime)s %(levelname)s %(message)s")

# https://blog.sentry.io/logging-in-python-a-developers-guide/

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import parselmouth
import statistics

from IPython.display import Audio
from parselmouth.praat import call
from scipy.stats.mstats import zscore
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import syllables
import nltk
from collections import Counter
import string
from nltk.corpus import cmudict
from nltk.corpus import stopwords
import collections
import liwc

from openai import OpenAI

# Fetch the NLTK data packages requested explicitly by this notebook:
# 'universal_tagset' for pos_tag(..., tagset="universal") and 'cmudict' for cmudict.dict().
# NOTE(review): word_tokenize / pos_tag may need further NLTK resources that are not
# downloaded here - confirm they are present on a fresh environment.
nltk.download('universal_tagset')
nltk.download('cmudict')

In [None]:
# Global plotting configuration for the notebook.
sns.set() # Use seaborn's default style to make attractive graphs
plt.rcParams['figure.dpi'] = 100 # Show nicely large images in this notebook

In [None]:
from helpers.pages import *
from helpers.constants import *
from helpers.utils import *

In [None]:
# Show up to 500 columns when a DataFrame is displayed (the unit tables built below are wide).
pd.set_option('display.max_columns', 500)

In [None]:
# Root folders of the combined transcripts for the two study variants ("FG" and "H").
# Paths are relative to the working directory and use Windows separators.
extracted_transcripts_fg_path = "data\\7_3_Combine_Chunks\\FG"
extracted_transcripts_h_path = "data\\7_3_Combine_Chunks\\H"

# Same layout, "_Google" variants of the transcript folders.
extracted_transcripts_fg_path_google = "data\\7_3_Combine_Chunks\\FG_Google"
extracted_transcripts_h_path_google = "data\\7_3_Combine_Chunks\\H_Google"

In [None]:
def count_syllables(word, fast=True):
    """Return the number of syllables in ``word``.

    Args:
        word: The word to analyse; it is lower-cased before lookup/estimation.
        fast: When True (default) use the heuristic ``syllables.estimate``.
            When False, look the word up in the CMU Pronouncing Dictionary and
            fall back to ``syllables.estimate`` for words it does not contain.

    Returns:
        The syllable count as returned by the chosen method.
    """
    lowered = word.lower()

    if fast:
        # Use the syllables package to estimate the number of syllables in a word
        return syllables.estimate(lowered)

    # Loading the CMU dictionary is expensive, so load it once and keep it on
    # the function object instead of rebuilding it for every single word.
    pronunciations = getattr(count_syllables, "_cmu_pronunciations", None)
    if pronunciations is None:
        pronunciations = cmudict.dict()
        count_syllables._cmu_pronunciations = pronunciations

    if lowered in pronunciations:
        # Vowel phonemes carry a trailing stress digit; count them in the first
        # listed pronunciation. This is an approximation of the syllable count.
        return sum(1 for phoneme in pronunciations[lowered][0] if phoneme[-1].isdigit())

    # Word is not in the dictionary: fall back to the heuristic estimate.
    return syllables.estimate(lowered)

In [None]:
def add_nltk_metric(row, list_of_tags, name, tag_abbreviation):
    """Store in ``row[name]`` how many ``(word, tag)`` pairs carry ``tag_abbreviation``.

    ``row`` is modified in place; nothing is returned.
    """
    matching_tags = [tag for _, tag in list_of_tags if tag == tag_abbreviation]
    row[name] = len(matching_tags)

In [None]:
def calculate_nltk_metrics(row):
    """Add part-of-speech counts and simple lexical statistics to ``row``.

    The text in ``row["word"]`` is tokenized and tagged twice: once with the
    universal tagset and once with NLTK's default tags (the Penn-style
    abbreviations listed below). One count per tag is written into ``row``.

    The names "verb", "adjective", "adverb" and "particle" occur in both tag
    lists. As before, those keys end up holding the *specific* counts
    (VB, JJ, RB, RP). The universal counts for the same names used to be
    overwritten and lost; they are now also stored as ``universal_verb``,
    ``universal_adjective``, ``universal_adverb`` and ``universal_particle``.

    Args:
        row: Mapping-like record (e.g. a pandas Series) with a "word" entry
            holding the text to analyse. It is modified in place.

    Returns:
        The same ``row`` with the added keys, including "total_words",
        "unique_words", "average_word_length" and "lexical_diversity"
        (the last two are 0 when the text has no tokens).
    """
    text = row["word"]

    # Tokenize the text into words
    words = nltk.word_tokenize(text)

    # Universal part-of-speech tagging (coarse categories).
    universal_pos_tags = nltk.pos_tag(words, tagset="universal")

    # (output name, universal tag)
    all_universal_tags = [
        ("noun", "NOUN"),
        ("verb", "VERB"),
        ("adjective", "ADJ"),
        ("adverb", "ADV"),
        ("pronoun", "PRON"),
        ("determiner_article", "DET"),
        ("preposition_postposition", "ADP"),
        ("numeral", "NUM"),
        ("conjunction", "CONJ"),
        ("particle", "PRT"),
        ("punctuation", "."),
        ("other", "X")  # catch-all, e.g. abbreviations or foreign words
    ]

    # Specific part-of-speech tagging (fine-grained tags).
    specific_pos_tags = nltk.pos_tag(words)

    # (output name, specific tag)
    # Tag meanings: https://www.guru99.com/pos-tagging-chunking-nltk.html
    all_specific_tags = [
        ("coordinating_conjunction", "CC"),
        ("cardinal_digit", "CD"),
        ("determiner", "DT"),
        ("existential_there", "EX"),
        ("foreign_word", "FW"),
        ("preposition_subordinating_conjunction", "IN"),
        ("adjective", "JJ"),                 # adjective (large)
        ("adjective_comparative", "JJR"),    # larger
        ("adjective_superlative", "JJS"),    # largest
        ("list_marker", "LS"),               # list item marker
        ("modal", "MD"),                     # could, will
        ("noun_singular", "NN"),             # cat, tree
        ("noun_plural", "NNS"),              # desks
        ("proper_noun_singular", "NNP"),     # sarah
        ("proper_noun_plural", "NNPS"),      # indians, americans
        ("predeterminer", "PDT"),            # all, both, half
        ("possessive_ending", "POS"),        # parent's
        ("personal_pronoun", "PRP"),         # hers, herself, him, himself
        ("possessive_pronoun", "PRP$"),      # her, his, mine, my, our
        ("adverb", "RB"),                    # occasionally, swiftly
        ("adverb_comparative", "RBR"),       # greater
        ("adverb_superlative", "RBS"),       # biggest
        ("particle", "RP"),                  # about
        ("infinite_marker", "TO"),           # infinitive marker "to"
        ("interjection", "UH"),              # goodbye
        ("verb", "VB"),                      # ask
        ("verb_gerund", "VBG"),              # judging
        ("verb_past_tense", "VBD"),          # pleaded
        ("verb_past_participle", "VBN"),     # reunified
        ("verb_present_tense_not_3rd_person_singular", "VBP"),   # wrap
        ("verb_present_tense_with_3rd_person_singular", "VBZ"),  # bases
        ("wh_determiner", "WDT"),            # that, what
        ("wh_pronoun", "WP"),                # who
        ("wh_adverb", "WRB"),                # how
    ]

    specific_names = {name for name, _ in all_specific_tags}

    for name, tag in all_universal_tags:
        add_nltk_metric(row, universal_pos_tags, name, tag)
        if name in specific_names:
            # This key is overwritten by the specific count below, so keep the
            # universal count under an unambiguous name as well.
            add_nltk_metric(row, universal_pos_tags, f"universal_{name}", tag)

    for name, tag in all_specific_tags:
        add_nltk_metric(row, specific_pos_tags, name, tag)

    # Token-level statistics (every token from word_tokenize is counted).
    total_words = len(words)
    unique_words = len(set(words))

    if total_words > 0:
        average_word_length = sum(len(word) for word in words) / total_words
        lexical_diversity = unique_words / total_words
    else:
        # No tokens: avoid dividing by zero.
        average_word_length = 0
        lexical_diversity = 0

    row["total_words"] = total_words
    row["unique_words"] = unique_words
    row["average_word_length"] = average_word_length
    row["lexical_diversity"] = lexical_diversity

    return row

In [None]:
def get_pdu_dataset(response, threshold=0.300, min_unit_words=3):
    """Group a word-level transcript into pause-defined units and describe each unit.

    A new unit starts at every word that is preceded by a pause of at least
    ``threshold``. Units with fewer than ``min_unit_words`` words are then
    merged into a neighbouring unit until none are left (or only one unit
    remains). Units are merged one at a time and unit sizes are recomputed
    after every merge, so a unit that has already grown to the minimum size
    is never merged again because of outdated bookkeeping.

    Args:
        response: Mapping with a "words" entry; each word record must provide
            "word", "start" and "end" (start/end are subtracted from each other).
        threshold: Minimum pause before a word that opens a new unit, in the
            same time unit as "start"/"end".
        min_unit_words: Units with fewer words than this are merged away.

    Returns:
        DataFrame with one row per unit (index reset) containing the joined
        text ("word"), "start", "end", durations, word/syllable counts, speech
        and articulation rates, and the columns added by
        ``calculate_nltk_metrics``.
    """
    words = response["words"]
    words_df = pd.DataFrame(words)
    words_df["articulation_duration"] = words_df["end"] - words_df["start"]

    # Every row is a single word; these columns are summed per unit later on.
    words_df["word_count"] = 1
    words_df["syllables_count"] = words_df["word"].apply(count_syllables)

    # Silence between consecutive words. The first word has no predecessor and
    # the last word no successor; those gaps are treated as 0.
    words_df["pause_duration_before_word"] = (words_df["start"] - words_df["end"].shift(1)).fillna(0)
    words_df["pause_duration_after_word"] = (words_df["start"].shift(-1) - words_df["end"]).fillna(0)

    # Ignore pauses lower than threshold
    words_df["above_threshold_pause"] = words_df["pause_duration_before_word"].apply(
        lambda pause: pause if pause >= threshold else 0
    )

    # Pauses longer than threshold are considered as a new unit
    words_df["unit"] = (words_df["above_threshold_pause"] >= threshold).cumsum()

    while True:
        # Fresh unit sizes on every iteration; groupby returns the unit labels sorted.
        unit_sizes = words_df.groupby("unit")["word_count"].sum()
        all_units = list(unit_sizes.index)
        small_units = [unit for unit in all_units if unit_sizes[unit] < min_unit_words]

        # Stop when nothing is too small, or when there is nothing left to merge with.
        if len(small_units) == 0 or len(all_units) <= 1:
            break

        unit = small_units[0]
        unit_rows = words_df.index[words_df["unit"] == unit]
        first_word_index = unit_rows[0]
        last_word_index = unit_rows[-1]

        current_unit_index = all_units.index(unit)

        # The first/last unit has only one neighbour. Otherwise merge across the
        # shorter of the two surrounding pauses; on a tie merge with the previous unit.
        if current_unit_index == 0:
            new_unit = all_units[current_unit_index + 1]
        elif current_unit_index == len(all_units) - 1:
            new_unit = all_units[current_unit_index - 1]
        elif words_df.loc[last_word_index, "pause_duration_after_word"] < words_df.loc[first_word_index, "pause_duration_before_word"]:
            new_unit = all_units[current_unit_index + 1]
        else:
            new_unit = all_units[current_unit_index - 1]

        logging.info(f"Unit {unit} has less than {min_unit_words} words. Merging with unit {new_unit}")
        print(f"Unit {unit} has less than {min_unit_words} words. Merging with unit {new_unit}")

        words_df.loc[words_df["unit"] == unit, "unit"] = new_unit

    # Group rows by unit - concat words
    words_df = words_df.groupby('unit').agg({
        'word': ' '.join,
        'start': 'first',
        'end': 'last',
        'articulation_duration': 'sum',
        'word_count': 'sum',
        'syllables_count': 'sum'
        })
    words_df["unit_duration"] = words_df["end"] - words_df["start"]

    # Pause between the end of the previous unit and the start of this one (0 for the first unit).
    words_df["pause_duration_before_unit"] = (words_df["start"] - words_df["end"].shift(1)).fillna(0)

    words_df["unit_duration_with_pause"] = words_df["unit_duration"] + words_df["pause_duration_before_unit"]

    # Speech rate includes the preceding pause; articulation rate only the time spent on words.
    words_df["word_speach_rate"] = words_df["word_count"] / words_df["unit_duration_with_pause"]
    words_df["syllables_speach_rate"] = words_df["syllables_count"] / words_df["unit_duration_with_pause"]
    words_df["word_articulation_rate"] = words_df["word_count"] / words_df["articulation_duration"]
    words_df["syllables_articulation_rate"] = words_df["syllables_count"] / words_df["articulation_duration"]

    # Calculate_nltk_metrics based on the joined text of each unit
    words_df = words_df.apply(calculate_nltk_metrics, axis=1)

    # Reindex dataframe
    words_df = words_df.reset_index(drop=True)

    return words_df

In [None]:
# Try the unit segmentation on a single transcript and inspect the result.
# The file is opened in a with-block so the handle is closed after reading.
with open("data\\7_3_Combine_Chunks\\FG\\respondent_35\\elaboration_5_2_response.json") as transcript_file:
    response = json.load(transcript_file)

words_df = get_pdu_dataset(response)
print(len(words_df))
words_df

In [None]:
# This is the function to measure source acoustics using default male parameters.
def measurePitch(voiceID, f0min, f0max, unit):
    """Measure pitch, harmonicity, jitter and shimmer of a sound with Praat.

    Args:
        voiceID: Anything accepted by ``parselmouth.Sound(...)``.
        f0min: Pitch floor passed to the Praat pitch/harmonicity/point-process commands.
        f0max: Pitch ceiling passed to the Praat pitch/point-process commands.
        unit: Unit for the pitch statistics (the caller in this notebook passes "Hertz").

    Returns:
        A 16-tuple: duration, meanF0, medianF0, stdevF0, hnr, localJitter,
        localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer,
        localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer.
        If the pitch object cannot be created, the error is logged and every
        value except the duration is NaN.
    """
    sound = parselmouth.Sound(voiceID)  # read the sound
    duration = call(sound, "Get total duration")  # duration
    try:
        # create a praat pitch object
        pitch = call(sound, "To Pitch", 0.0, f0min, f0max)
    except Exception:
        logging.log(
            logging.ERROR, f"Error in measurePitch: {voiceID}, returning np.NAN for all values.")
        # np.nan instead of the np.NAN alias, which no longer exists in NumPy 2.0.
        return (duration,) + (np.nan,) * 15

    meanF0 = call(pitch, "Get mean", 0, 0, unit)  # get mean pitch
    medianF0 = call(pitch, "Get quantile", 0, 0, 0.5, unit)  # get median pitch
    stdevF0 = call(pitch, "Get standard deviation", 0,
                   0, unit)  # get standard deviation

    # Harmonics-to-noise ratio: mean of the harmonicity object.
    harmonicity = call(sound, "To Harmonicity (cc)", 0.01, f0min, 0.1, 1.0)
    hnr = call(harmonicity, "Get mean", 0, 0)

    # Jitter is measured on the point process alone, shimmer on sound + point process.
    pointProcess = call(sound, "To PointProcess (periodic, cc)", f0min, f0max)
    localJitter = call(pointProcess, "Get jitter (local)",
                       0, 0, 0.0001, 0.02, 1.3)
    localabsoluteJitter = call(
        pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3)
    rapJitter = call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3)
    ppq5Jitter = call(pointProcess, "Get jitter (ppq5)",
                      0, 0, 0.0001, 0.02, 1.3)
    ddpJitter = call(pointProcess, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3)
    localShimmer = call([sound, pointProcess],
                        "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    localdbShimmer = call(
        [sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    apq3Shimmer = call([sound, pointProcess],
                       "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    aqpq5Shimmer = call([sound, pointProcess],
                        "Get shimmer (apq5)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    apq11Shimmer = call([sound, pointProcess],
                        "Get shimmer (apq11)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
    ddaShimmer = call([sound, pointProcess],
                      "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6)

    return duration, meanF0, medianF0, stdevF0, hnr, localJitter, localabsoluteJitter, rapJitter, ppq5Jitter, ddpJitter, localShimmer, localdbShimmer, apq3Shimmer, aqpq5Shimmer, apq11Shimmer, ddaShimmer

def runPCA(df):
    """Project the jitter and shimmer measurements onto principal components.

    Args:
        df: DataFrame containing the eleven jitter/shimmer columns listed in
            ``measures`` below.

    Returns:
        DataFrame with the columns 'JitterPCA' and 'ShimmerPCA' holding the
        first and second principal component. When only one component can be
        computed (fewer than two rows), 'ShimmerPCA' is filled with NaN instead
        of failing on a column-count mismatch.
    """
    measures = ['localJitter', 'localabsoluteJitter', 'rapJitter', 'ppq5Jitter', 'ddpJitter',
                'localShimmer', 'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer', 'apq11Shimmer', 'ddaShimmer']
    component_names = ['JitterPCA', 'ShimmerPCA']

    x = df.loc[:, measures].values

    # NaNs are replaced with 0 (rows are kept, not dropped).
    x = np.nan_to_num(x)

    # z-score the Jitter and Shimmer measurements
    x = StandardScaler().fit_transform(x)

    # PCA cannot return more components than there are samples or features.
    n_components = min(len(component_names), x.shape[0], x.shape[1])
    pca = PCA(n_components=n_components)
    principalComponents = pca.fit_transform(x)

    principalDf = pd.DataFrame(data=principalComponents,
                               columns=component_names[:n_components])
    # Always hand back both columns; a missing component becomes NaN.
    return principalDf.reindex(columns=component_names)


def voice_analysis(snd_parts, female):
    """Run ``measurePitch`` on every sound in ``snd_parts`` and tabulate the results.

    Based on https://github.com/drfeinberg/PraatScripts/tree/master

    Args:
        snd_parts: Iterable of sounds; each one is passed to ``measurePitch``.
        female: Truthy selects the 100-600 Hz pitch range, falsy 75-300 Hz.

    Returns:
        DataFrame with one row per sound: 'voiceID' (position of the sound in
        ``snd_parts``) followed by the sixteen values returned by ``measurePitch``.
    """
    column_names = ['voiceID', 'duration', 'meanF0Hz', 'medianF0Hz', 'stdevF0Hz', 'HNR',
                    'localJitter', 'localabsoluteJitter', 'rapJitter',
                    'ppq5Jitter', 'ddpJitter', 'localShimmer',
                    'localdbShimmer', 'apq3Shimmer', 'apq5Shimmer',
                    'apq11Shimmer', 'ddaShimmer']

    # Pitch search range in Hz.
    if female:
        minHz, maxHz = 100, 600  # 500 would probably be enough
    else:
        minHz, maxHz = 75, 300

    # One record per sound: its position followed by all acoustic measurements.
    records = []
    for order, sound in enumerate(snd_parts):
        measurements = measurePitch(sound, minHz, maxHz, "Hertz")
        records.append((order, *measurements))

    # Same all-float table the column-wise stacking produced, also when there are no sounds.
    values = np.array(records, dtype=float).reshape(-1, len(column_names))
    df = pd.DataFrame(values, columns=column_names)

    print("finished")

    return df

In [None]:
# Root folders of the extracted elaboration audio for the "FG" and "H" variants.
extracted_audios_fg_path = "data\\6_Elaborations_Extraction\\FG"
extracted_audios_h_path = "data\\6_Elaborations_Extraction\\H"

# Root folders of the matching transcripts (re-assigns the same values set earlier in the notebook).
extracted_transcripts_fg_path = "data\\7_3_Combine_Chunks\\FG"
extracted_transcripts_h_path = "data\\7_3_Combine_Chunks\\H"

In [None]:
def get_dict_of_paths(root_path, file_extension=".json"):
    """Collect files with a given extension below ``root_path``, grouped by folder name.

    Args:
        root_path: Directory that is walked recursively.
        file_extension: Only file names ending with this suffix are kept.

    Returns:
        Dict mapping the name of each folder that contains at least one file
        (of any type) to the list of full paths of its matching files. A folder
        whose files all have other extensions maps to an empty list. Folders
        with the same name in different places overwrite each other.
    """
    dict_of_paths = {}
    for root, _dirs, files in os.walk(root_path):
        if len(files) > 0:
            matching_files = [os.path.join(root, f) for f in files if f.endswith(file_extension)]

            # Take the last path component with os.path so this works with the
            # platform's separator instead of assuming backslashes.
            folder_name = os.path.basename(os.path.normpath(root))
            dict_of_paths[folder_name] = matching_files
    return dict_of_paths

In [None]:
# Folder name -> list of .wav files, for each study variant.
fg_audio_paths = get_dict_of_paths(extracted_audios_fg_path, file_extension=".wav")
h_audio_paths = get_dict_of_paths(extracted_audios_h_path, file_extension=".wav")

# Folder name -> list of .json transcripts, for each study variant.
fg_transcript_paths = get_dict_of_paths(extracted_transcripts_fg_path, file_extension=".json")
h_transcript_paths = get_dict_of_paths(extracted_transcripts_h_path, file_extension=".json")

In [None]:
# Keep only the audio folders that also have a transcript folder of the same name.
# The transcript dictionaries are not filtered the other way round.
h_audio_paths = {k: v for k, v in h_audio_paths.items() if k in h_transcript_paths.keys()}
fg_audio_paths = {k: v for k, v in fg_audio_paths.items() if k in fg_transcript_paths.keys()}

In [None]:
# Sanity check: after the filtering above, the audio and transcript dictionaries must have
# the same number of folders per variant; otherwise report the four counts and stop.
# Only the number of folders is compared, not the number of files inside each folder.
if len(fg_audio_paths) != len(fg_transcript_paths) or len(h_audio_paths) != len(h_transcript_paths):
    print(f"Number of audio and transcript files do not match - {len(fg_audio_paths)} {len(fg_transcript_paths)} {len(h_audio_paths)} {len(h_transcript_paths)}")
    sys.exit(1)

In [None]:
def pair_audio_and_transcript_paths(audio_paths, transcript_paths):
    """Combine the audio and transcript lists of every key in ``audio_paths``.

    Returns a dict ``{key: {"audio": ..., "transcript": ...}}``. A key that is
    missing from ``transcript_paths`` raises ``KeyError``.
    """
    return {
        respondent: {"audio": audio_files, "transcript": transcript_paths[respondent]}
        for respondent, audio_files in audio_paths.items()
    }

In [None]:
# Folder name -> {"audio": [...], "transcript": [...]} for each study variant.
fg_paired = pair_audio_and_transcript_paths(fg_audio_paths, fg_transcript_paths)
h_paired = pair_audio_and_transcript_paths(h_audio_paths, h_transcript_paths)

In [None]:
# Pre-study questionnaire exports of the final study, one CSV per variant.
# `wd` and `wd_pilot` are not defined in this notebook's own cells; presumably they come
# from the helpers star-imports - TODO confirm.
fg_pre_study_questions_path = wd + "\\2 UXtweak CSVs\\[DP Lies] Final 1 FG\\[DP Lies] Final 1 FG - Pre-study questionnaire.csv"
h_pre_study_questions_path = wd + "\\2 UXtweak CSVs\\[DP Lies] Final 1 H\\[DP Lies] Final 1 H - Pre-study questionnaire.csv"
fg_pre_study_questions = pd.read_csv(fg_pre_study_questions_path)
h_pre_study_questions = pd.read_csv(h_pre_study_questions_path)

# The same questionnaire exports from the pilot run.
fg_pre_study_questions_path_pilot = wd_pilot + "\\2 UXtweak CSVs\\Pilot Demo 4 FG\\Pilot Demo 4 FG - Pre-study questionnaire.csv"
h_pre_study_questions_path_pilot = wd_pilot + "\\2 UXtweak CSVs\\Pilot Demo 4 H\\Pilot Demo 4 H - Pre-study questionnaire.csv"
fg_pre_study_questions_pilot = pd.read_csv(fg_pre_study_questions_path_pilot)
h_pre_study_questions_pilot = pd.read_csv(h_pre_study_questions_path_pilot)

# Stack final and pilot answers per variant; the original row indexes are kept, so index
# values can repeat in the combined frames.
fg_pre_study_questions = pd.concat([fg_pre_study_questions, fg_pre_study_questions_pilot])
h_pre_study_questions = pd.concat([h_pre_study_questions, h_pre_study_questions_pilot])

In [None]:
# List the distinct answers to the gender question in each variant
# (only the exact answer "Female" is treated as female further below).
print(fg_pre_study_questions["Q1: What gender do you identify as?"].unique())
print(h_pre_study_questions["Q1: What gender do you identify as?"].unique())

In [None]:
def check_if_female(pre_study_questions):
    """Build a respondent -> female lookup table from the pre-study questionnaire.

    Args:
        pre_study_questions: DataFrame with the columns "respondent" and
            "Q1: What gender do you identify as?". It is not modified.

    Returns:
        New DataFrame with the columns "respondent", "female" (True where the
        gender answer is exactly "Female") and "respondent_string"
        ("respondent_" followed by the respondent value).
    """
    gender_column = "Q1: What gender do you identify as?"

    # Work on a copy so the caller's frame stays untouched and no values are
    # assigned into a slice of it.
    result = pre_study_questions[["respondent"]].copy()
    # Positional assignment: the row order is identical and index labels may repeat.
    result["female"] = (pre_study_questions[gender_column] == "Female").to_numpy()
    result["respondent_string"] = result["respondent"].apply(lambda x: "respondent_" + str(x))
    return result

In [None]:
# Respondent -> female lookup tables for each study variant.
fg_female = check_if_female(fg_pre_study_questions)
h_female = check_if_female(h_pre_study_questions)

In [None]:
def add_female_to_paired(paired_dict, female_df):
    """Attach the "female" flag from ``female_df`` to every entry of ``paired_dict``.

    For each key, the first row of ``female_df`` whose "respondent_string"
    equals the key supplies the value. ``paired_dict`` is updated in place and
    returned; a key without a matching row raises ``IndexError``.
    """
    for respondent_key in paired_dict:
        is_respondent = female_df["respondent_string"] == respondent_key
        female_flags = female_df.loc[is_respondent, "female"].values
        paired_dict[respondent_key]["female"] = female_flags[0]
    return paired_dict

In [None]:
# Add the "female" flag to every paired entry. The input dictionaries are updated in place,
# so fg_paired_enriched / h_paired_enriched are the same objects as fg_paired / h_paired.
fg_paired_enriched = add_female_to_paired(fg_paired, fg_female)
h_paired_enriched = add_female_to_paired(h_paired, h_female)

In [None]:
@timer
def add_characteristics_to_words(words_df_out, snd, female):
    """Return a copy of ``words_df_out`` extended with acoustic measurements per row.

    For every row, the part of ``snd`` between the row's "start" and "end" is
    extracted and analysed by ``voice_analysis``; the resulting columns are
    appended side by side to the copied frame.
    """
    words_df = words_df_out.copy()

    # Cut out the audio segment belonging to each row.
    snd_parts = []
    for start, end in zip(words_df["start"], words_df["end"]):
        snd_parts.append(snd.extract_part(start, end, preserve_times=True))

    analysis = voice_analysis(snd_parts, female)

    return pd.concat([words_df, analysis], axis=1)

In [None]:
# Read the OpenAI API key from a local file (trailing whitespace/newline removed)
# so that the key itself never appears in the notebook.
with open("tokens/openai_key.txt", "r") as file:
    OPENAI_API_KEY = file.read().rstrip()

# Set environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Shared API client used by call_gpt. No key is passed explicitly; presumably it is picked up
# from the OPENAI_API_KEY environment variable - confirm against the OpenAI client docs.
client = OpenAI()

In [None]:
def call_gpt(system_prompt, user_prompt, temperature=0.2, model="gpt-3.5-turbo", max_attempts=5):
    """Send one system + user prompt to the chat completions API, retrying on errors.

    Failed attempts are logged and printed, then retried after an exponentially
    growing pause (4, 8, 16, ... seconds). There is no pause after the final
    attempt.

    Args:
        system_prompt: Content of the "system" message.
        user_prompt: Content of the "user" message.
        temperature: Sampling temperature passed to the API.
        model: Model name passed to the API.
        max_attempts: Total number of attempts before giving up.

    Returns:
        The response object returned by ``client.chat.completions.create``.

    Raises:
        RuntimeError: If every attempt failed; the last error is attached as
            the cause.
    """
    # Imported here because the notebook's import cell does not import `time`.
    import time

    messages = [
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]

    last_error = None
    for attempt in range(max_attempts):
        try:
            return client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=temperature
            )
        except Exception as e:
            last_error = e
            logging.error(f"Error in call_gpt: {e}")
            print(f"Error in call_gpt: {e}")
            # Exponential backoff, but only when another attempt will follow.
            if attempt < max_attempts - 1:
                time.sleep(2 ** (attempt + 2))

    # Fail loudly instead of returning a response that was never assigned.
    raise RuntimeError(f"call_gpt failed after {max_attempts} attempts") from last_error

In [None]:
def sanitaze_response(response_object, possible_answers):
    """Reduce a chat completion to one of ``possible_answers``.

    The model sometimes answers with a single word and sometimes with a whole
    sentence. The reply is lower-cased and searched for each possible answer
    as a whole word (so "truly" does not count as "true"); when several
    answers occur, the one that appears first in the reply wins.

    Args:
        response_object: Object exposing ``choices[0].message.content``.
        possible_answers: Candidate answers; they are compared against the
            lower-cased reply, so they should be lower-case themselves.

    Returns:
        The matching entry of ``possible_answers``, or "unknown" when the reply
        is empty/missing or contains none of the answers.
    """
    import re

    content = response_object.choices[0].message.content
    if not content:
        return "unknown"

    response = content.lower()

    best_answer = "unknown"
    best_position = None
    for possible_answer in possible_answers:
        # Whole-word match: no word character directly before or after the answer.
        match = re.search(r"(?<!\w)" + re.escape(possible_answer) + r"(?!\w)", response)
        if match and (best_position is None or match.start() < best_position):
            best_answer = possible_answer
            best_position = match.start()

    return best_answer

In [None]:
@timer
def add_gpt_characteristics(words_df_out):
    """Return a copy of ``words_df_out`` with five GPT-judged text features per row.

    For every row the text in "word" is sent to ``call_gpt`` once per feature
    (hesitation, disfluency, tense, qualifiers, contradictions, in that order).
    Each reply is reduced by ``sanitaze_response`` with the candidates "true"
    and "false" and stored in the column named after the feature.
    """
    words_df = words_df_out.copy()

    # (output column, system prompt) in the order in which the model is queried.
    prompts_by_column = [
        ("hesitation", """Analyze the text for a higher than average use of hesitation words such as 'erm', 'uh', 'you know'. These hesitation markers can indicate stress or uncertainty, which might be associated with deception. Return 'true' if the frequency of these words is higher than usual, and 'false' otherwise.
    
    Example 1: 'I, uh, really think that, erm, we should move forward with the project.'
    Response: true
    
    Example 2: 'The meeting was scheduled for Tuesday, and everything was prepared in advance.'
    Response: false
    """),
        ("disfluency", """Detect the presence of disfluencies such as false starts and repetitions in the provided text. These linguistic features can indicate increased cognitive load or stress, often associated with deceptive behavior. Return 'true' if disfluencies are prominent, and 'false' if they are minimal or absent.
    
    Example 1: 'I just want to, I mean, I need to say that it was, it was not like that.'
    Response: true
    
    Example 2: 'She completed her presentation smoothly and was confident in her explanations.'
    Response: false
    """),
        ("tense", """Analyze the text for changes in tense, which can suggest a lack of commitment to the truth of the statement or an attempt to distance the speaker from the events. Return 'true' if changes in tense are observed, and 'false' if the tense remains consistent.

    Example 1: 'I remember, last week I start the project and then I shifted to planning the next phases.'
    Response: true
    
    Example 2: 'We began the project in January and have been working diligently ever since.'
    Response: false
    """),
        ("qualifiers", """Analyze the text for the presence of increased qualifiers such as 'maybe', 'probably', 'sort of', 'kind of', 'essentially', which can suggest a lack of confidence or certainty in the statements being made. Return 'true' if an increased number of qualifiers is used, and 'false' if the speech is more direct and lacks these qualifiers.

    Example 1: 'I think it's sort of important, maybe, to consider all possible outcomes.'
    Response: true
    
    Example 2: 'It is crucial to consider all possible outcomes.'
    Response: false
    """),
        ("contradictions", """Analyze the text to determine if there are contradictions within the statement. Contradictions can suggest confusion, forgetfulness, or deception. Return 'true' if contradictions are present, and 'false' if the narrative is consistent without contradictions.
    
    Example 1: 'I always wake up early, around 6 AM every day. Actually, I've been sleeping in till about 8 AM lately.'
    Response: true
    
    Example 2: 'I enjoy running and make sure to go for a run every morning. It helps me start my day right.'
    Response: false
    """),
    ]

    for index, row in words_df.iterrows():
        unit_text = row["word"]

        for column_name, system_prompt in prompts_by_column:
            gpt_response = call_gpt(system_prompt, unit_text)
            words_df.loc[index, column_name] = sanitaze_response(gpt_response, ["true", "false"])

    return words_df

In [None]:
@timer
def analyze_words(paired_dict, variant):
    """Build and save the pause-defined-unit table for every elaboration of every respondent.

    For each audio/transcript pair the transcript is segmented with
    ``get_pdu_dataset``, annotated with the respondent metadata, the acoustic
    measurements (``add_characteristics_to_words``) and the GPT features
    (``add_gpt_characteristics``), and written as a ';'-separated CSV to
    ``data\\11_Pause_Defined_Units\\<variant>\\<respondent>\\<elaboration>.csv``.

    Args:
        paired_dict: Mapping respondent -> dict with the keys "audio" (list of
            audio paths), "transcript" (list of transcript paths) and "female".
            Audio and transcript files are paired by position in their lists.
        variant: Variant label; stored in the "variant" column and used as the
            output sub-folder.
    """
    for k, v in paired_dict.items():
        logging.info(f"Analyzing respondent {k}")
        female = v["female"]
        audio_files = v["audio"]
        transcript_files = v["transcript"]

        # zip() below stops at the shorter list, so make a mismatch visible in the log.
        if len(audio_files) != len(transcript_files):
            logging.warning(
                f"Respondent {k} has {len(audio_files)} audio files but {len(transcript_files)} transcripts; "
                "unpaired files are skipped"
            )

        for audio_file, transcript_file in zip(audio_files, transcript_files):
            logging.info(f"Analyzing {audio_file}")

            snd = parselmouth.Sound(audio_file)
            # Close the transcript file as soon as it has been parsed.
            with open(transcript_file) as transcript_handle:
                response = json.load(transcript_handle)

            words_df = get_pdu_dataset(response)
            words_df["female"] = female
            words_df["variant"] = variant
            words_df["respondent"] = k

            words_df = add_characteristics_to_words(words_df, snd, female)
            words_df = add_gpt_characteristics(words_df)

            # Elaboration name: audio file name without its extension
            elaboration = os.path.splitext(os.path.basename(audio_file))[0]

            # Respondent name: folder that contains the audio file
            respondent = os.path.basename(os.path.dirname(audio_file))

            # Save the dataframe
            directory = os.path.join("data", "11_Pause_Defined_Units", variant, respondent)
            os.makedirs(directory, exist_ok=True)
            path = os.path.join(directory, f"{elaboration}.csv")
            words_df.to_csv(path, index=False, sep=";")
            logging.info(f"Saved {path}")
        logging.info(f"Finished analyzing respondent {k}")

In [None]:
# Run the full pipeline for the "FG" variant. This calls the OpenAI API for every unit
# and writes one CSV per elaboration.
analyze_words(fg_paired_enriched, "FG")

In [None]:
# Run the full pipeline for the "H" variant. This calls the OpenAI API for every unit
# and writes one CSV per elaboration.
analyze_words(h_paired_enriched, "H")