# Calculating control predictors / random effects from candor_df

## Load candor_dfs

In [12]:
import sys
import os
from pathlib import Path
import pandas as pd

# The project's helper modules live in a `scripts` directory next to the
# notebook's working directory; resolve it to an absolute path.
scripts_dir = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, 'scripts')
)

# Register it for imports only once, so re-running this cell does not keep
# growing sys.path.
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from candor.create_raw_data import load_conversation_tokens

In [13]:
def load_transcript_output(convo_path: Path):
    """
    Load the token-level transcript of one conversation and keep only the
    rows whose `type` column equals 'pronunciation'.

    Args:
        convo_path: directory of a single conversation, passed straight to
            `load_conversation_tokens`.

    Returns:
        The filtered token DataFrame.
    """
    tokens_df = load_conversation_tokens(convo_path)
    # presumably the remaining `type` values are punctuation tokens — TODO confirm
    is_pronunciation = tokens_df['type'] == 'pronunciation'
    return tokens_df[is_pronunciation]

def load_transcribe_cliffhanger(convo_path: Path):
    """
    Read the turn-level transcript CSV of one conversation.

    Args:
        convo_path: directory of a single conversation; the file is expected
            at `transcription/transcript_cliffhanger.csv` below it.

    Returns:
        The CSV contents as a DataFrame, read with pandas' default options.
    """
    csv_path = convo_path / 'transcription/transcript_cliffhanger.csv'
    return pd.read_csv(csv_path)

In [14]:
# One sample conversation, used for every example in this notebook.
convo_path = Path('../data/candor/sample/0020a0c5-1658-4747-99c1-2839e736b481/')

# Word-level tokens first, then the turn-level transcript; the two loads are
# independent of each other.
output_df = load_transcript_output(convo_path=convo_path)
cliffhanger_df = load_transcribe_cliffhanger(convo_path=convo_path)

In [15]:
cliffhanger_df

Unnamed: 0,turn_id,speaker,start,stop,utterance,interval,delta,questions,end_question,overlap,n_words
0,0,5fa072f4f4aa580b63834357,4.34,190.04,"Mhm. Mhm. Just, mm. And Uh huh, mm. Mhm. Mhm. ...",,185.70,2,False,False,38
1,1,5a73899f9cdd1800017786f0,198.34,201.76,Yeah hey I'm gone.,8.30,3.42,0,False,False,5
2,2,5fa072f4f4aa580b63834357,200.64,203.96,"Oh good, how are you?",-1.12,3.32,1,True,True,5
3,3,5a73899f9cdd1800017786f0,204.54,214.66,Yeah yeah I've done a few of these before they...,0.58,10.12,0,False,False,15
4,4,5fa072f4f4aa580b63834357,205.34,214.56,"Yeah. Yeah, so this will be an interesting stu...",-9.32,9.22,0,False,True,14
...,...,...,...,...,...,...,...,...,...,...,...
320,320,5fa072f4f4aa580b63834357,2727.21,2728.01,So thank you.,0.70,0.80,0,False,False,3
321,321,5a73899f9cdd1800017786f0,2728.29,2731.92,Yeah I get to enjoy your day evening.,0.28,3.63,0,False,False,8
322,322,5fa072f4f4aa580b63834357,2731.29,2733.00,"Yes, you too. Have a good night.",-0.63,1.71,0,False,True,7
323,323,5a73899f9cdd1800017786f0,2732.59,2734.09,Yeah. Yeah.,-0.41,1.50,0,False,True,2


In [50]:
import random

# Spot-check the utterance of one randomly chosen turn. The draw is unseeded,
# so a different turn is shown on every run.
i = random.randrange(len(cliffhanger_df))
cliffhanger_df.loc[i, 'utterance']

"I absolutely loved it. But yeah, next time I go to Wisconsin, I've already put in my notes, my beer notes that I have to bring back at least a case of spotted cow because it was amazing."

In [16]:
output_df

Unnamed: 0_level_0,Unnamed: 1_level_0,speaker,start,stop,utterance,confidence,type
conversation_id,turn_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0020a0c5-1658-4747-99c1-2839e736b481,0,5fa072f4f4aa580b63834357,4.34,4.66,Mhm,0.466,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,2,5fa072f4f4aa580b63834357,10.14,10.95,Mhm,0.673,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,4,5fa072f4f4aa580b63834357,12.74,14.36,Just,0.995,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,6,5fa072f4f4aa580b63834357,17.74,17.95,mm,0.468,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,8,5fa072f4f4aa580b63834357,18.74,19.06,And,0.997,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,...,...,...,...,...,...,...
0020a0c5-1658-4747-99c1-2839e736b481,9025,5fa072f4f4aa580b63834357,2732.43,2732.56,good,0.999,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,9026,5fa072f4f4aa580b63834357,2732.56,2733.00,night,0.999,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,9028,5a73899f9cdd1800017786f0,2732.59,2732.79,Yeah,0.800,pronunciation
0020a0c5-1658-4747-99c1-2839e736b481,9030,5a73899f9cdd1800017786f0,2733.29,2734.09,Yeah,1.000,pronunciation


## Using spacy to get control predictor info

All the information should be calculable on a (conversational) turn-level basis, with the exception of frequency, for which a Counter needs to be built from the lemmas of either the current conversation or all conversations. So I will first focus on writing a function that calculates and returns all necessary control predictors at the turn level (including each word's sentence number and its position within the sentence), and add frequency later.

In [20]:
import spacy

In [23]:
from nltk.corpus import cmudict
cmudict_dict = cmudict.dict()

In [24]:
# Example usage: print the token attributes the control predictors rely on,
# one line per spaCy token. The last field is the length of the trailing
# whitespace (0 means the next token is glued to this one).
nlp = spacy.load("en_core_web_sm")
doc = nlp("I've been working on the partially-done project's completion.")
for token in doc:
    token_summary = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop, len(token.whitespace_),
    )
    print(*token_summary)


I I PRON PRP nsubj X True True 0
've have AUX VBP aux 'xx False True 1
been be AUX VBN aux xxxx True True 1
working work VERB VBG ROOT xxxx True False 1
on on ADP IN prep xx True True 1
the the DET DT det xxx True True 1
partially partially ADV RB advmod xxxx True False 0
- - PUNCT HYPH punct - False False 0
done do VERB VBN amod xxxx True True 1
project project NOUN NN poss xxxx True False 0
's 's PART POS case 'x False True 1
completion completion NOUN NN pobj xxxx True False 0
. . PUNCT . punct . False False 0


In [61]:
class ControlPredictors:
    """
    Per-word control predictors for a single conversational turn.

    The turn is parsed with spaCy and one entry is recorded per
    whitespace-delimited word (not per spaCy token), so that the lists line up
    with the word-level durations annotated in CANDOR. Each predictor is
    stored as a list attribute; iterating over an instance yields
    ``(attribute_name, list)`` pairs for every list attribute.
    """
    # Pipeline and pronunciation dictionary are taken from the notebook-level
    # `nlp` and `cmudict_dict` objects at class-definition time.
    nlp = nlp
    cmudict_dict = cmudict_dict

    def __init__(self, turn):
        """
        Parse `turn` and fill the per-word predictor lists.

        Every recorded value describes the FIRST spaCy token of a whitespace
        word (for "I've" the recorded text is "I"); the remaining tokens of
        that word are skipped.

        Args:
            turn: the utterance of one turn, with words separated by single
                spaces.

        Raises:
            AssertionError: if the number of recorded words differs from
                ``len(turn.split(' '))``.
        """
        # Use the class-level pipeline so it can be swapped on the class or on
        # an instance instead of silently reading the global.
        doc = self.nlp(turn)
        self.doc = doc
        self.turn = turn

        # NOTE: every list attribute created here is exposed by __iter__, in
        # this order.
        self.text = []
        self.lemma = []
        self.n_chars = []
        self.n_syllables = []
        self.pos = []
        self.stopword = []
        self.mtw = []  # is word a multi-token word? ex: "I've", but not "he," (punct)

        self.sentence_id_in_turn = []
        self.word_id_in_sentence = []

        self.n_whitespace_words = 0
        skip_this_token = False
        sentence_id = -1  # becomes 0 at the first sentence-initial token
        word_id = 0
        n_tokens = len(doc)
        for idx, token in enumerate(doc):
            if skip_this_token:
                # Still inside the previous whitespace word; the token that
                # carries trailing whitespace is the last one of that word.
                if token.whitespace_:
                    skip_this_token = False
                continue

            self.text.append(token.text)
            self.lemma.append(token.lemma_)
            self.n_chars.append(len(token.text))
            # Syllables are looked up for the lowercased lemma.
            self.n_syllables.append(self.word_syllable_count(token.lemma_.lower()))
            self.pos.append(token.pos_)
            self.stopword.append(token.is_stop)

            self.n_whitespace_words += 1

            if token.is_sent_start:
                sentence_id += 1
                word_id = 0

            self.sentence_id_in_turn.append(sentence_id)
            self.word_id_in_sentence.append(word_id)

            word_id += 1

            is_mtw = False
            if not token.whitespace_:
                # More tokens of this whitespace word may follow: skip them.
                skip_this_token = True
                # The bound must include the final token; comparing against
                # len(doc) - 1 would leave a turn-final contraction such as
                # "...I'm" flagged as a single-token word.
                if idx + 1 < n_tokens:
                    # the current token is a multitoken word if there's no whitespace between
                    # this token and the next, and the next token is not punctuation
                    is_mtw = doc[idx + 1].pos_ != 'PUNCT'

            self.mtw.append(is_mtw)

        n_expected_words = len(turn.split(' '))
        assert n_expected_words == self.n_whitespace_words, f"mismatch in spacy tokenization, actual whitespace: {n_expected_words} != spacy whitespace: {self.n_whitespace_words}"

    def word_syllable_count(self, word) -> "int | None":
        """
        Count the syllables of `word` using the CMU pronouncing dictionary.

        A syllable is counted for every phoneme whose last character is a
        digit. When several pronunciations exist the smallest count is
        returned.

        Returns:
            The syllable count, or None when `word` is not in the dictionary.
        """
        if word not in self.cmudict_dict:
            return None
        return min(
            sum(1 for phoneme in pronunciation if phoneme[-1].isdigit())
            for pronunciation in self.cmudict_dict[word]
        )

    def __iter__(self):
        """Yield ``(name, values)`` for every list-valued instance attribute."""
        for k, v in vars(self).items():
            if isinstance(v, list):
                yield k, v

In [56]:
# Hand-written stress test for the word/token alignment (contractions,
# hyphenation, possessive). It used to be assigned to `text` and then
# immediately overwritten; it is kept under its own name so it can be passed
# to ControlPredictors when checking those cases.
text_edge_cases = "I've been diligently, strongly, hardly, we've, been working on the partially-done project's completion"

# Utterance taken from the sample conversation.
text = "I absolutely loved it. But yeah, next time I go to Wisconsin, I've already put in my notes, my beer notes that I have to bring back at least a case of spotted cow because it was amazing."
text_control_predictors = ControlPredictors(text)
# Show every attribute (parsed doc, raw turn, predictor lists, word count).
for k, v in vars(text_control_predictors).items():
    print(k, v)

I
absolutely
loved
it
But
yeah
next
time
I
go
to
Wisconsin
I
already
put
in
my
notes
my
beer
notes
that
I
have
to
bring
back
at
least
a
case
of
spotted
cow
because
it
was
amazing
doc I absolutely loved it. But yeah, next time I go to Wisconsin, I've already put in my notes, my beer notes that I have to bring back at least a case of spotted cow because it was amazing.
turn I absolutely loved it. But yeah, next time I go to Wisconsin, I've already put in my notes, my beer notes that I have to bring back at least a case of spotted cow because it was amazing.
text ['I', 'absolutely', 'loved', 'it', 'But', 'yeah', 'next', 'time', 'I', 'go', 'to', 'Wisconsin', 'I', 'already', 'put', 'in', 'my', 'notes', 'my', 'beer', 'notes', 'that', 'I', 'have', 'to', 'bring', 'back', 'at', 'least', 'a', 'case', 'of', 'spotted', 'cow', 'because', 'it', 'was', 'amazing']
lemma ['I', 'absolutely', 'love', 'it', 'but', 'yeah', 'next', 'time', 'I', 'go', 'to', 'Wisconsin', 'I', 'already', 'put', 'in', 'my', 'no

In [18]:
from collections import Counter
def make_candor_freq_counter(output_df, lowercase: bool = False):
    """
    Count how often each word form occurs in a word-level transcript.

    Args:
        output_df: DataFrame with one row per word and the word form in its
            `utterance` column.
        lowercase: when True, word forms are lowercased before counting so
            that e.g. 'Yeah' and 'yeah' share one entry. The default (False)
            keeps the original case-sensitive counts.

    Returns:
        collections.Counter mapping word form -> number of occurrences.
    """
    words = output_df['utterance']
    if lowercase:
        words = words.str.lower()
    return Counter(words.tolist())
make_candor_freq_counter(output_df)

Counter({'I': 343,
         'like': 291,
         'the': 189,
         'and': 186,
         'to': 173,
         'Yeah': 153,
         'a': 150,
         'you': 144,
         'of': 138,
         'know': 126,
         'yeah': 95,
         'so': 92,
         'Oh': 91,
         'that': 90,
         'it': 88,
         'in': 82,
         "it's": 79,
         'we': 74,
         'just': 68,
         'was': 68,
         'my': 67,
         'is': 66,
         'Mhm': 60,
         'So': 58,
         'for': 56,
         'have': 51,
         'as': 50,
         "don't": 48,
         'do': 48,
         'Uh': 46,
         'all': 45,
         'And': 44,
         'but': 44,
         'really': 42,
         'kind': 41,
         "I'm": 40,
         'right': 38,
         'because': 37,
         'huh': 36,
         'at': 36,
         'are': 34,
         'what': 34,
         'be': 33,
         "that's": 32,
         'But': 31,
         'Okay': 30,
         'this': 29,
         'there': 29,
         'get': 29,
 

In [63]:
def candor_full_df(cliffhanger_df, output_df):
    """
    Build a word-level DataFrame for one conversation.

    Given transcribe_output.json and transcript_cliffhanger.csv as loaded
    dataframes, every turn of `cliffhanger_df` is split on single spaces and
    each resulting word is paired with the start/stop time of the matching
    word-level row of `output_df` and with its ControlPredictors values.

    Args:
        cliffhanger_df: turn-level transcript; the columns `turn_id`,
            `speaker`, `start`, `stop` and `utterance` are read.
        output_df: word-level transcript; the columns `speaker`, `start`,
            `stop` and `utterance` are read.

    Returns:
        DataFrame with one row per word: `turn_id`, `word`, `word_start`,
        `word_stop`, one column per ControlPredictors list attribute, and
        `position_in_turn` (0-based index of the word within its turn).

    Raises:
        AssertionError: if a turn's number of space-separated words differs
            from the number of word-level rows found for it (or if
            ControlPredictors cannot align its tokenization with the turn).
    """
    row_starts = []
    row_stops = []
    # predictor name -> list with one entry (a per-word list) per turn
    row_control_predictors = {}

    for _, row in cliffhanger_df.iterrows():
        # Word-level rows of the same speaker that lie entirely inside the
        # turn's time window.
        output_sub_df = output_df.query(
            "start >= @row.start & stop <= @row.stop & speaker == @row.speaker"
        )

        row_starts.append(output_sub_df.start.tolist())
        row_stops.append(output_sub_df.stop.tolist())

        output_words = output_sub_df.utterance.tolist()
        cliffhanger_words = row['utterance'].split(' ')

        control_predictors = ControlPredictors(row['utterance'])
        for predictor, values in control_predictors:
            row_control_predictors.setdefault(predictor, []).append(values)

        # TODO: per-word surprisal (tokenize turn -> surprisal -> aggregate by
        # word) is planned as an additional exploded column.

        # Both transcripts must agree on the number of words, otherwise the
        # multi-column explode below cannot pair words with their timings.
        assert len(output_words) == len(cliffhanger_words), f"output/cliffhanger transcript mismatch:\n{output_words}\n{cliffhanger_words}\n"

    # Explicit copy: the new columns are added to an independent frame, never
    # to (a possibly warned-about slice of) the caller's DataFrame.
    cliffhanger_df_minimal = cliffhanger_df.loc[:, ['turn_id']].copy()
    cliffhanger_df_minimal["word"] = cliffhanger_df["utterance"].str.split(' ')
    cliffhanger_df_minimal["word_start"] = row_starts
    cliffhanger_df_minimal["word_stop"] = row_stops
    for predictor, values in row_control_predictors.items():
        cliffhanger_df_minimal[predictor] = values

    # All list-valued columns are exploded together so each output row holds
    # the i-th element of every list of its turn.
    list_columns = ["word", "word_start", "word_stop"] + list(row_control_predictors)
    out = cliffhanger_df_minimal.explode(list_columns)
    out["position_in_turn"] = out.groupby("turn_id").cumcount()
    return out.reset_index(drop=True)

In [65]:
# Build the word-level table (one row per word, with timings and control
# predictors) for the sample conversation and display it.
full_df = candor_full_df(cliffhanger_df, output_df)
full_df

Unnamed: 0,turn_id,word,word_start,word_stop,text,lemma,n_chars,n_syllables,pos,stopword,mtw,sentence_id_in_turn,word_id_in_sentence,position_in_turn
0,0,Mhm.,4.34,4.66,Mhm,Mhm,3,2,PROPN,False,False,0,0,0
1,0,Mhm.,10.14,10.95,Mhm,Mhm,3,2,PROPN,False,False,1,0,1
2,0,"Just,",12.74,14.36,Just,just,4,1,ADV,True,False,2,0,2
3,0,mm.,17.74,17.95,mm,mm,2,,INTJ,False,False,2,1,3
4,0,And,18.74,19.06,And,and,3,1,CCONJ,True,False,3,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7716,322,good,2732.43,2732.56,good,good,4,1,ADJ,False,False,1,2,5
7717,322,night.,2732.56,2733.0,night,night,5,1,NOUN,False,False,1,3,6
7718,323,Yeah.,2732.59,2732.79,Yeah,yeah,4,1,INTJ,False,False,0,0,0
7719,323,Yeah.,2733.29,2734.09,Yeah,yeah,4,1,INTJ,False,False,1,0,1


### Try to prepare dataframes for surprisal evaluation 
(à la COCA's text_bigram, text_sentence, text_trigram)

In [68]:
# One row per (turn, sentence): re-join the words of each sentence with
# single spaces and store the result in a `sentence` column.
text_sentence = (
    full_df
    .groupby(['turn_id', 'sentence_id_in_turn'])
    .agg({'word': ' '.join})
    .reset_index()
    .rename(columns={'word': 'sentence'})
)
text_sentence

Unnamed: 0,turn_id,sentence_id_in_turn,sentence
0,0,0,Mhm.
1,0,1,Mhm.
2,0,2,"Just, mm."
3,0,3,"And Uh huh, mm."
4,0,4,Mhm.
...,...,...,...
851,322,0,"Yes, you too."
852,322,1,Have a good night.
853,323,0,Yeah.
854,323,1,Yeah.


In [85]:
def gen_bigrams(words):
    """
    Yield the space-joined bigrams of a sentence, padded with '<s>' before
    the first word and '</s>' after the last one.

    Args:
        words: non-empty, indexable sequence of word strings.

    Yields:
        '<s> w0', then 'w(i-1) w(i)' for every adjacent pair, then
        'w(last) </s>' — len(words) + 1 strings in total.
    """
    yield ' '.join(('<s>', words[0]))
    # Adjacent pairs; the range is empty for a one-word sentence.
    for right_idx in range(1, len(words)):
        yield ' '.join((words[right_idx - 1], words[right_idx]))
    yield ' '.join((words[-1], '</s>'))

def gen_trigrams(words):
    """
    Yield the space-joined trigrams of a sentence, padded with a single
    '<s>' at the start and a single '</s>' at the end.

    Args:
        words: non-empty, indexable sequence of word strings.

    Yields:
        len(words) strings: for one word only '<s> w0 </s>'; otherwise
        '<s> w0 w1', every window of three consecutive words, and finally
        'w(last-1) w(last) </s>'.
    """
    n_words = len(words)
    if n_words == 1:
        # Single word: both pads fit into the one and only trigram.
        yield '<s> ' + words[0] + ' </s>'
        return

    yield '<s> ' + words[0] + ' ' + words[1]
    # Windows fully inside the sentence; empty range for two-word sentences.
    for start in range(n_words - 2):
        yield words[start] + ' ' + words[start + 1] + ' ' + words[start + 2]
    yield words[-2] + ' ' + words[-1] + ' </s>'

In [86]:
# Record lists that are turned into DataFrames once, after the loop.
all_sentences = []
all_bigrams = []
all_trigrams = []

# One group per sentence: all words sharing a turn and a sentence index.
sentence_groups = full_df.groupby(['turn_id', 'sentence_id_in_turn'])
for (turn_id, sentence_id), group in sentence_groups:
    words = group['word'].tolist()

    all_sentences.append({
        'turn_id': turn_id,
        'sentence_id_in_turn': sentence_id,
        'sentence': ' '.join(words),
    })

    # n-gram records carry only the turn id, not the sentence index.
    all_bigrams.extend(
        {'turn_id': turn_id, 'bigram': bigram}
        for bigram in gen_bigrams(words)
    )
    all_trigrams.extend(
        {'turn_id': turn_id, 'trigram': trigram}
        for trigram in gen_trigrams(words)
    )

# Create the DataFrames from the lists of dictionaries
text_sentence = pd.DataFrame(all_sentences)
text_bigram = pd.DataFrame(all_bigrams)
text_trigram = pd.DataFrame(all_trigrams)

In [87]:
text_sentence

Unnamed: 0,turn_id,sentence_id_in_turn,sentence
0,0,0,Mhm.
1,0,1,Mhm.
2,0,2,"Just, mm."
3,0,3,"And Uh huh, mm."
4,0,4,Mhm.
...,...,...,...
851,322,0,"Yes, you too."
852,322,1,Have a good night.
853,323,0,Yeah.
854,323,1,Yeah.


In [88]:
text_bigram

Unnamed: 0,turn_id,bigram
0,0,<s> Mhm.
1,0,Mhm. </s>
2,0,<s> Mhm.
3,0,Mhm. </s>
4,0,"<s> Just,"
...,...,...
8572,323,Yeah. </s>
8573,323,<s> Yeah.
8574,323,Yeah. </s>
8575,324,<s> But


In [89]:
text_trigram

Unnamed: 0,turn_id,trigram
0,0,<s> Mhm. </s>
1,0,<s> Mhm. </s>
2,0,"<s> Just, mm."
3,0,"Just, mm. </s>"
4,0,<s> And Uh
...,...,...
7716,322,a good night.
7717,322,good night. </s>
7718,323,<s> Yeah. </s>
7719,323,<s> Yeah. </s>
