In [1]:
from pathlib import Path

import pandas as pd
import seaborn as sns

import transformers

## Compute phoneme data representation

In [2]:
paths = list(Path("data/TRAIN").glob("**/*.PHN"))
phone_df = pd.concat([pd.read_csv(path, sep=" ", header=None) for path in paths],
                     names=["sentence_idx", "dialect", "speaker"],
                     keys=[(p.stem, p.parts[2], p.parts[3]) for p in paths])
phone_df.columns = ["onset", "offset", "phone"]
phone_df.index.rename("phone_idx", level=-1, inplace=True)
phone_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,onset,offset,phone
sentence_idx,dialect,speaker,phone_idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SI1311,DR4,MMDM0,0,0,2680,h#
SI1311,DR4,MMDM0,1,2680,5640,s
SI1311,DR4,MMDM0,2,5640,7853,ao
SI1311,DR4,MMDM0,3,7853,8647,l
SI1311,DR4,MMDM0,4,8647,10026,r
...,...,...,...,...,...,...
SI1044,DR8,MRDM0,60,59494,60974,zh
SI1044,DR8,MRDM0,61,60974,62208,en
SI1044,DR8,MRDM0,62,62208,63169,epi
SI1044,DR8,MRDM0,63,63169,64698,z


In [3]:
paths = list(Path("data/TRAIN").glob("**/*.WRD"))
word_df = pd.concat([pd.read_csv(path, sep=" ", header=None) for path in paths],
                    names=["sentence_idx", "dialect", "speaker"],
                    keys=[(p.stem, p.parts[2], p.parts[3]) for p in paths])
word_df.columns = ["onset", "offset", "word"]
word_df.index.rename("word_idx", level=-1, inplace=True)
word_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,onset,offset,word
sentence_idx,dialect,speaker,word_idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SI1311,DR4,MMDM0,0,2680,8647,soil
SI1311,DR4,MMDM0,1,8647,23227,redeposition
SI1311,DR4,MMDM0,2,31832,33420,is
SI1311,DR4,MMDM0,3,33420,46280,evaluated
SI1311,DR4,MMDM0,4,46280,48840,by
...,...,...,...,...,...,...
SI1044,DR8,MRDM0,3,33520,34855,to
SI1044,DR8,MRDM0,4,35569,41320,insulate
SI1044,DR8,MRDM0,5,41320,48806,themselves
SI1044,DR8,MRDM0,6,48806,54040,against


In [4]:
# asof merge word indices and word identities into phoneme representation
phone_df = pd.merge_asof(phone_df.sort_values("onset").reset_index(level="phone_idx"), word_df.sort_values("onset").reset_index(level="word_idx"),
                         by=["sentence_idx", "dialect", "speaker"], on="onset", suffixes=("", "_word"))
phone_df = phone_df.set_index(["dialect", "speaker", "sentence_idx", "phone_idx"]).sort_index()

In [5]:
phone_df.to_csv("timit_merged.csv")

In [6]:
phone_df.word.dropna().unique()

array(['she', 'had', 'your', ..., 'murderer', 'outset', 'reconnaissance'],
      dtype=object)

In [7]:
phone_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,onset,offset,phone,word_idx,offset_word,word
dialect,speaker,sentence_idx,phone_idx,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DR1,FCJF0,SA1,0,0,3050,h#,,,
DR1,FCJF0,SA1,1,3050,4559,sh,0.0,5723.0,she
DR1,FCJF0,SA1,2,4559,5723,ix,0.0,5723.0,she
DR1,FCJF0,SA1,3,5723,6642,hv,1.0,10337.0,had
DR1,FCJF0,SA1,4,6642,8772,eh,1.0,10337.0,had
...,...,...,...,...,...,...,...,...,...
DR8,MTCS0,SX82,29,34460,36386,ih,7.0,40870.0,tips
DR8,MTCS0,SX82,30,36386,38150,pcl,7.0,40870.0,tips
DR8,MTCS0,SX82,31,38150,38575,p,7.0,40870.0,tips
DR8,MTCS0,SX82,32,38575,40870,s,7.0,40870.0,tips


## Estimate surprisals

In [10]:
# model = transformers.AutoModelForCausalLM.from_pretrained("distilgpt2")
# tokenizer = transformers.AutoTokenizer.from_pretrained("distilgpt2")

from minicons import scorer
lm = scorer.IncrementalLMScorer("distilgpt2")

Using pad_token, but it is not set yet.


In [9]:
# NB not all speakers read all sentences, so we have to compute sentences across speakers, merge, and dedupe
sentences = phone_df.groupby(["speaker", "sentence_idx"]).apply(lambda xs: " ".join(xs.drop_duplicates("word_idx").dropna().word))
sentences = sentences.droplevel("speaker")
sentences = sentences.loc[~sentences.index.duplicated()]
sentences

sentence_idx
SA1       she had your dark suit in greasy wash water al...
SA2             don't ask me to carry an oily rag like that
SI1392    assume for example a situation where a farm ha...
SI2022                       what outfit does she drive for
SI762                     fill small hole in bowl with clay
                                ...                        
SI2266                        his manhood had been attacked
SI796     the filtered air benefits allergies asthma sin...
SI1166    in the course of its inquiry it took testimony...
SI1796                           his eyes burned feverishly
SI536     so we note approvingly a fresh sample of unani...
Length: 1718, dtype: object

In [16]:
lm.token_score(sentences.iloc[:5], surprisal=True, base_two=True)

[[('she', 0.0),
  ('had', 11.548450469970703),
  ('your', 10.531771659851074),
  ('dark', 12.932721138000488),
  ('suit', 10.313411712646484),
  ('in', 4.336450576782227),
  ('gre', 17.822790145874023),
  ('asy', 1.6554306745529175),
  ('wash', 10.515349388122559),
  ('water', 10.892115592956543),
  ('all', 8.622169494628906),
  ('year', 6.9613566398620605)],
 [('don', 0.0),
  ("'t", 9.131139755249023),
  ('ask', 8.216742515563965),
  ('me', 3.70967960357666),
  ('to', 2.596806049346924),
  ('carry', 11.230142593383789),
  ('an', 7.647350311279297),
  ('oily', 14.790067672729492),
  ('rag', 8.425741195678711),
  ('like', 7.161896705627441),
  ('that', 2.4186594486236572)],
 [('ass', 0.0),
  ('ume', 16.966228485107422),
  ('for', 8.030224800109863),
  ('example', 6.972435474395752),
  ('a', 7.833366870880127),
  ('situation', 11.589754104614258),
  ('where', 0.991115391254425),
  ('a', 3.085996150970459),
  ('farm', 12.526687622070312),
  ('has', 4.806395053863525),
  ('a', 2.9856684207

TODO: merge back into dataset representation so we can use this for sampling later