In [18]:
import json
from pathlib import Path
import re

import numpy as np
import pandas as pd

In [29]:
raw_path = Path("../data/raw.txt")
aligned_path = Path("../data/align.json")
surprisal_path = Path("../data/surprisals.tsv")

In [30]:
surprisal_df = pd.read_csv(surprisal_path, sep="\t")
surprisal_df

Unnamed: 0,sentence_id,token_id,token,surprisal
0,1,1,i,8.92268
1,1,2,just,5.75346
2,1,3,had,7.21849
3,1,4,a,4.03629
4,1,5,really,4.65778
...,...,...,...,...
575,5,101,it,5.55041
576,5,102,for,9.11516
577,5,103,now,13.27430
578,5,104,.,2.53920


In [31]:
with aligned_path.open("r") as align_f:
    aligned_data = json.load(align_f)
aligned_df = pd.DataFrame.from_dict(aligned_data["words"])
aligned_df

Unnamed: 0,alignedWord,case,end,endOffset,phones,start,startOffset,word
0,i,success,17.170000,1,"[{'duration': 0.09, 'phone': 'ay_S'}]",17.080000,0,I
1,just,success,17.400000,6,"[{'duration': 0.07, 'phone': 'jh_B'}, {'durati...",17.170000,2,just
2,had,success,17.560000,10,"[{'duration': 0.05, 'phone': 'hh_B'}, {'durati...",17.400000,7,had
3,a,success,17.839999,12,"[{'duration': 0.28, 'phone': 'ah_S'}]",17.559999,11,a
4,really,success,18.359999,19,"[{'duration': 0.08, 'phone': 'r_B'}, {'duratio...",18.109999,13,really
...,...,...,...,...,...,...,...,...
510,and,success,254.620000,2682,"[{'duration': 0.09, 'phone': 'ah_B'}, {'durati...",254.270000,2679,And
511,that's,success,255.170000,2689,"[{'duration': 0.08, 'phone': 'dh_B'}, {'durati...",254.990000,2683,that's
512,it,success,255.270000,2692,"[{'duration': 0.05, 'phone': 'ih_B'}, {'durati...",255.170000,2690,it
513,for,success,255.520000,2696,"[{'duration': 0.04, 'phone': 'f_B'}, {'duratio...",255.270000,2693,for


In [34]:
PUNCT_RE = re.compile(r"[-,.!?]+")

# Harmonize the two dfs; prepare to join
harmonized = []
surp_cursor, aligned_cursor = 0, 0
while surp_cursor < len(surprisal_df) and aligned_cursor < len(aligned_df):
    surp_row = surprisal_df.iloc[surp_cursor]
    aligned_row = aligned_df.iloc[aligned_cursor]
    
    if surp_row.token == aligned_row.word.lower():
        harmonized.append((surp_row.name, aligned_row.name))
        
        surp_cursor += 1
        aligned_cursor += 1
    elif PUNCT_RE.match(surp_row.token) or surp_row.token == "</s>":
        surp_cursor += 1
    elif surp_row.token == "<unk>":
        # TODO dangerous ..
        harmonized.append((surp_row.name, aligned_row.name))
        
        surp_cursor += 1
        aligned_cursor += 1
    elif aligned_row.word.lower().startswith(surp_row.token):
        # Look for remaining parts of the aligned word in future surprisal rows
        aligned_rem = aligned_row.word.lower()[len(surp_row.token):]
        
        # TODO full case: iterate over infinite possible rows in surp
        next_surp_row = surprisal_df.iloc[surp_cursor + 1]
        if next_surp_row.token == aligned_rem:
            harmonized.append((surp_row.name, aligned_row.name))
            harmonized.append((next_surp_row.name, aligned_row.name))
            
            surp_cursor += 2
            aligned_cursor += 1
        else:
            print("BREAK 2", surp_row, aligned_row)
            break
    else:
        print("BREAK 1", surp_row, aligned_row)
        break

In [39]:
harmonized_df = pd.DataFrame(harmonized, columns=["surp_idx", "aligned_idx"])
harmonized_df = pd.merge(harmonized_df, surprisal_df[["token", "surprisal"]], left_on="surp_idx", right_index=True)
harmonized_df = pd.merge(harmonized_df, aligned_df, left_on="aligned_idx", right_index=True)
harmonized_df

Unnamed: 0,surp_idx,aligned_idx,token,surprisal,alignedWord,case,end,endOffset,phones,start,startOffset,word
0,0,0,i,8.92268,i,success,17.170000,1,"[{'duration': 0.09, 'phone': 'ay_S'}]",17.080000,0,I
1,1,1,just,5.75346,just,success,17.400000,6,"[{'duration': 0.07, 'phone': 'jh_B'}, {'durati...",17.170000,2,just
2,2,2,had,7.21849,had,success,17.560000,10,"[{'duration': 0.05, 'phone': 'hh_B'}, {'durati...",17.400000,7,had
3,3,3,a,4.03629,a,success,17.839999,12,"[{'duration': 0.28, 'phone': 'ah_S'}]",17.559999,11,a
4,4,4,really,4.65778,really,success,18.359999,19,"[{'duration': 0.08, 'phone': 'r_B'}, {'duratio...",18.109999,13,really
...,...,...,...,...,...,...,...,...,...,...,...,...
522,573,511,that,8.16496,that's,success,255.170000,2689,"[{'duration': 0.08, 'phone': 'dh_B'}, {'durati...",254.990000,2683,that's
523,574,511,'s,6.93816,that's,success,255.170000,2689,"[{'duration': 0.08, 'phone': 'dh_B'}, {'durati...",254.990000,2683,that's
524,575,512,it,5.55041,it,success,255.270000,2692,"[{'duration': 0.05, 'phone': 'ih_B'}, {'durati...",255.170000,2690,it
525,576,513,for,9.11516,for,success,255.520000,2696,"[{'duration': 0.04, 'phone': 'f_B'}, {'duratio...",255.270000,2693,for


In [40]:
harmonized_df.to_csv("../data/harmonized.csv")

----

In [63]:
# Resample every 100ms
harmonized_df["dt"] = pd.datetime(2021, 1, 1) + pd.to_timedelta(harmonized_df.startOffset, unit="sec")
harmonized_df = harmonized_df.set_index("dt")

harmonized_df.rolling('1000ms', min_periods=0).mean()#.to_csv("../data/resampled.csv")

  harmonized_df["dt"] = pd.datetime(2021, 1, 1) + pd.to_timedelta(harmonized_df.startOffset, unit="sec")


Unnamed: 0_level_0,surp_idx,aligned_idx,surprisal,end,endOffset,start,startOffset
dt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2021-01-01 00:00:00,0.0,0.0,8.92268,17.170000,1.0,17.080000,0.0
2021-01-01 00:00:02,1.0,1.0,5.75346,17.400000,6.0,17.170000,2.0
2021-01-01 00:00:07,2.0,2.0,7.21849,17.560000,10.0,17.400000,7.0
2021-01-01 00:00:11,3.0,3.0,4.03629,17.839999,12.0,17.559999,11.0
2021-01-01 00:00:13,4.0,4.0,4.65778,18.359999,19.0,18.109999,13.0
...,...,...,...,...,...,...,...
2021-01-01 00:44:43,573.0,511.0,8.16496,255.170000,2689.0,254.990000,2683.0
2021-01-01 00:44:43,573.5,511.0,7.55156,255.170000,2689.0,254.990000,2683.0
2021-01-01 00:44:50,575.0,512.0,5.55041,255.270000,2692.0,255.170000,2690.0
2021-01-01 00:44:53,576.0,513.0,9.11516,255.520000,2696.0,255.270000,2693.0
