This notebook: developing a text alignment function that lets us align tokens in raw text (which will be used to compute surprisals) with tokens in the Brennan CSV representation.

In [14]:
import re
import unicodedata

import pandas as pd
from tqdm.notebook import tqdm

In [3]:
stim_df = pd.read_csv("../data/AliceChapterOne-EEG.csv")

In [87]:
with open("../data/texts/alice-ch1.txt") as f:
    text_raw = f.read()
tokens_flat = text_raw.split()
# TODO maybe use nltk.tokenize.RegexpTokenizer("[\w']+|[^\w\s]+").tokenize(sent)

In [6]:
tokens_flat[:10]

['Alice',
 'was',
 'beginning',
 'to',
 'get',
 'very',
 'tired',
 'of',
 'sitting',
 'by']

In [15]:
def strip_accents(s):
    return unicodedata.normalize("NFKD", s).encode("ASCII", "ignore").decode("utf-8")

In [78]:
punct_re = re.compile(r"[^A-Za-z]")

def process_fulltext_token(t):
    ret = strip_accents(t.lower())
    ret = punct_re.sub("", ret)
    return ret

def align_stimulus_fulltext(stim_df, tokens_flat):
    """
    Prepare an alignment between the eperimental stimuli and the fulltext token
    indices, so that we can provide surprisal predictors.

    Adds a new column `tok_pos` to `stim_df` describing the corresponding
    position for each row in the fulltext. (inplace)
    """

    stop = False

    tok_cursor = 0
    tok_el = process_fulltext_token(tokens_flat[tok_cursor])

    # For each element in surp_df, record the index of the corresponding element
    # in the token sequence or surprisal df.
    tok_pos = []
    for _, row in tqdm(stim_df.iterrows(), total=len(stim_df)):
        if stop:
            break

        df_el = punct_re.sub("", row.Word.lower())
        print(row.Word, "::", df_el, "::")

        # Track how many elements in a reference we have skipped. If this
        # is excessive, we'll quit rather than looping infinitely.
        skip_count = 0
        if stop:
            raise RuntimeError("abort")
            break

        # Find corresponding token in text and append to `tok_pos`.
        try:
            print("\t///", tok_el, df_el)
            while not tok_el.startswith(df_el):
                # Special cases for oddities in the Brennan stim df..
                if tok_el == "is" and df_el == "s":
                    # annotation says "\x1as" which gets stripped
                    break
                elif tok_el == "had" and df_el == "d":
                    # annotation says "\x1ad" which gets stripped
                    break

                tok_cursor += 1
                skip_count += 1
                if skip_count > 20:
                    stop = True
                    break

                tok_el = process_fulltext_token(tokens_flat[tok_cursor])
                print("\t//", tok_el)

            print("\tMatch", df_el, tok_el)
            tok_pos.append(tok_cursor)

            # If we matched only a subset of the token, then cut off what we
            # matched and proceed.
            if tok_el != df_el:
                tok_el = tok_el[len(df_el):]
            else:
                tok_cursor += 1
                tok_el = process_fulltext_token(tokens_flat[tok_cursor])
        except IndexError:
            # print("iex", row, tok_cursor, tok_el)
            stop = True
            break

    stim_df["tok_pos"] = tok_pos

In [88]:
align_stimulus_fulltext(stim_df, tokens_flat)

  0%|          | 0/2129 [00:00<?, ?it/s]

Alice :: alice ::
	/// alice alice
	Match alice alice
was :: was ::
	/// was was
	Match was was
beginning :: beginning ::
	/// beginning beginning
	Match beginning beginning
to :: to ::
	/// to to
	Match to to
get :: get ::
	/// get get
	Match get get
very :: very ::
	/// very very
	Match very very
tired :: tired ::
	/// tired tired
	Match tired tired
of :: of ::
	/// of of
	Match of of
sitting :: sitting ::
	/// sitting sitting
	Match sitting sitting
by :: by ::
	/// by by
	Match by by
her :: her ::
	/// her her
	Match her her
sister :: sister ::
	/// sister sister
	Match sister sister
on :: on ::
	/// on on
	Match on on
the :: the ::
	/// the the
	Match the the
bank :: bank ::
	/// bank bank
	Match bank bank
and :: and ::
	/// and and
	Match and and
of :: of ::
	/// of of
	Match of of
having :: having ::
	/// having having
	Match having having
nothing :: nothing ::
	/// nothing nothing
	Match nothing nothing
to :: to ::
	/// to to
	Match to to
do :: do ::
	/// do do
	Match do do
once

In [90]:
stim_df.tok_pos.map(dict(enumerate(tokens_flat)))

0           Alice
1             was
2       beginning
3              to
4             get
          ...    
2124      happens
2125         when
2126          one
2127         eats
2128        cake,
Name: tok_pos, Length: 2129, dtype: object

In [91]:
stim_df.tail()

Unnamed: 0,Word,Segment,onset,offset,Order,LogFreq,LogFreq_Prev,LogFreq_Next,SndPower,Length,Position,Sentence,IsLexical,NGRAM,RNN,CFG,tok_pos
2124,happens,12,45.226353,45.672448,2146,10.77,10.82,13.76,0.000708,0.446095,6,84,1.0,5.574428,6.356812,2.969568,2100
2125,when,12,45.677924,45.891353,2147,13.76,10.77,14.17,0.003221,0.213429,7,84,0.0,4.059164,6.720639,4.930669,2101
2126,one,12,45.896829,46.058972,2148,14.17,13.76,8.15,0.001984,0.162143,8,84,1.0,1.380381,2.187682,0.725398,2102
2127,eats,12,46.064448,46.322373,2149,8.15,14.17,8.74,2.5e-05,0.257925,9,84,1.0,3.171368,3.941021,2.767965,2103
2128,cake,12,46.327849,46.681557,2150,8.74,8.15,0.0,5e-06,0.353708,10,84,1.0,5.386008,5.832085,3.692958,2104
