In [65]:
%load_ext autoreload
%autoreload 2

#%% Change working directory from the workspace root to the ipynb file location. Turn this addition off with the DataSciece.changeDirOnImportExport setting
import os
try:
    os.chdir(r'C:\Users\kevin\Documents\Workspace\psylit-experiments')
    print("Changed cwd:", os.getcwd())
except:
    print("cwd:", os.getcwd())

from collections import Counter, defaultdict
from statistics import mean
from os import linesep as EOL
import re

import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
import seaborn as sns
plt.rcParams['figure.figsize'] = 16,10
np.random.seed(0)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Changed cwd: C:\Users\kevin\Documents\Workspace\psylit-experiments


In [66]:
from tic import preprocess

In [67]:
import spacy
nlp = spacy.load('en_core_web_sm')   # 'Vanilla' spacy model: spacy.load('en_core_web_sm')

merge_ents = nlp.create_pipe("merge_entities")

nlp.add_pipe(merge_ents, after="ner")

In [None]:
import time

def maybe(f, default=None):
    try:
        return f()
    except:
        return default

class show_progress:
    
    def __init__(self, seq, fmt_progress="{i} / {n}", fmt_time=(" "*4+"{t:.5f}s")):
        self.n = maybe(lambda: len(seq))
        self.seq = iter(seq)
        self.i = 0
        self.fmt_progress = fmt_progress
        self.fmt_time = fmt_time
        self.t = time.perf_counter()
    
    def __iter__(self):
        #print(self.fmt_progress.format(i=self.i, n=self.n))
        #self.t = time.perf_counter()
        return self
    
    def __next__(self):
        self.t = time.perf_counter() - self.t
        if self.i:
            print(self.fmt_time.format(t=self.t))
        self.i += 1
        if self.n and self.i <= self.n:
            print(self.fmt_progress.format(i=self.i, n=self.n))
        if not self.n:
            print(self.i)
        self.t = time.perf_counter()
        return next(self.seq)
        
        
def human_print_dict(d, tab=' '*2, level=0):
    for k, v in d.items():
        print(f"{tab*level}{k}:", end=' ')
        if type(v) == dict:
            print(tab*level)
            human_print_dict(v, tab, level+1)
        else:
            print(v)

In [12]:
FUNCTIONAL_DEPS = ('det','poss','neg','aux','auxpass','ps','mark','ccomp','xcomp','acomp','prt') # TODO spacy equiv of ps

def deppaths(doc, skip=(lambda t: t.is_space or t.is_punct), include=(lambda t: not t.is_stop)):

    def deppaths_sent(sent):
        prev_path_ids = None
        for tok in sent:
            if tok.n_lefts or tok.n_rights:
                continue
                
            path = [tok] + list(tok.ancestors)
            path = list(filter(include, path))
            if not path:
                continue

            yield path

    for sent in doc.sents:
        sent = (t for t in sent if not skip(t))
        paths = list(deppaths_sent(sent))
        
        # paths are uniquely identified in the sent by their leaf token
        # the last one for a given id is the the complete one
        # assumes dicts are ordered (Py 3.6+)
        
        paths = {p[-1]: p for p in paths}
        yield from paths.values()
        
                
def format_sgrams(grams, 
                  merge_cond=(lambda t: t.dep_ in ('neg','prt')), 
                  process=(t.lemma_ if t.pos_ in ('VERB','NOUN') else t.text), 
                  join='-'.join): # merge might include preps
    for gram in grams:
        n = len(gram)
        gram_toks = []
        suffixes = []
        for t in reversed(gram):
            txt = process(t)
            if merge_cond(t):
                suffixes.insert(0, txt)
            else:
                tok_txt = join([txt] + suffixes)
                gram_toks.append(tok_txt)
                suffixes = []
                
        yield gram_toks
        
def format_rgrams(grams, process=(t.lemma_ if t.pos_ in ('VERB','NOUN') else t.text)):
    # equiv:
    # yield from format_sgrams(grams, merge_cond=(lambda t: False))
    for gram in grams:
        gram_toks = [process(t) for t in gram]
        yield gram_toks
        

class Grams(Sequence):
    
    def __init__(self, grams):
        """
        grams: Seq[Iterable]
        """
        self.grams = grams
    
    def map(self, f):
        self.grams = map(f, self.grams)
    
    def merge_from_right(self, match):
        def process():
            for gram in self.grams:
                suffixes = []
                for t in reversed(gram):
                    if match(t):
                        suffixes.insert(0, t)
                    else:
                        yield [t] + suffixes
                        suffixes = []
        
        return DisjointGrams(process())
        
        
class DisjointGrams(Sequence):
    
    def __init__(self, grams):
        """
        grams: Seq[Iterable[Iterable]]
        """
        self.grams = grams

    def join(self, join):
        return Grams(join(g) for g in self.grams)
    
def ngrams_from_paths(paths, n=2, is_counted=(lambda t: t.dep_ not in FUNCTIONAL_DEPS), step=1):
    for p in paths:
        yield from ngrams_from_words(p, n, is_counted, step)

def ngrams_from_words(toks, n=2, is_counted=(lambda t: t.dep_ not in FUNCTIONAL_DEPS), step=1):
    content_idx = [i for i,t in enumerate(toks) if is_counted(t)]
    m = len(content_idx)
    for j in range(n, m, step):
        gram_slice = slice(content_idx[j - n], content_idx[j])
        yield toks[gram_slice]


In [53]:

class process_rgrams:
    
    def __init__(self, doc, nmax=3):
        all_words = list(doc)
        words = [t for t in all_words if (not t.is_space) and (not t.is_punct) and (not t.is_stop)]
        self.reg_grams = [
            list(format_rgrams(ngrams_from_words(words, n)))
            for n in range(1, nmax+1)
        ]
        self.nmax = nmax
        self.reg_grams_count = [Counter(map(tuple, gs)) for gs in self.reg_grams]

        self.stats = {}
        
    def summary(self):
        sm = {}
        for n in range(self.nmax):
            sm[f'n_reg_{n+1}grams'] = len(self.reg_grams_count[n])
        return {**self.stats, **sm}

class process_frames:
    def __init__(self, doc, nmax=3):
        all_words = list(doc)
        words = [t for t in all_words if (not t.is_space) and (not t.is_punct) and (not t.is_stop)]
        paths = list(deppaths(doc))
        self.syn_grams = [
            list(format_sgrams(ngrams_from_paths(paths, n)))
            for n in range(1, nmax+1)
        ]
        self.syn_grams_count = [Counter(map(tuple, gs)) for gs in self.syn_grams]
        self.nmax = nmax

        self.stats = {
            'n_dep_paths': len(paths),
            'avg_path_length': mean(len(p) for p in paths),
        }
        
    def summary(self):
        sm = {}
        for n in range(self.nmax):
            sm[f'n_syn_{n+1}grams'] = len(self.syn_grams_count[n])
        return {**self.stats, **sm}
    
class process_doc:
    
    def __init__(self, doc, nmax=3):
        all_words = list(doc)
        all_sents = list(doc.sents)
        ng = process_rgrams(doc, nmax)
        sng = process_sngrams(doc, nmax)
        self.syn_grams_count = sng.syn_grams_count
        self.reg_grams_count = ng.reg_grams_count
        
        self.stats = {
            'n_tokens': len(all_words),
            'avg_sentence_length': mean(len(s) for s in all_sents),
            'n_sentences': len(all_sents),
            **ng.summary(),
            **sng.summary(),
        }
        
    def summary(self):
        return self.stats
    
class process_corpus:
    
    def __init__(self, texts, batch_size=8, disable=None, nmax=3, **kwargs):
        disable = disable or []
        self.reg_grams_count = [Counter() for _ in range(nmax)]
        self.syn_grams_count = [Counter() for _ in range(nmax)]
        self.summaries = []
        self.suspicious_docs = []
        pipeline = nlp.pipe(texts, batch_size=batch_size, disable=disable)
        #pipeline = (nlp(txt, disable=disable) for txt in texts)
        for doc in show_progress(pipeline):
            d = process_doc(doc, **kwargs)
            self.summaries.append(d.summary())
            
            if len(set(t.text for t in d.reg_grams_count[2].keys())) == 1:
                self.suspicious_docs.append(doc)
            if len(set(t.text for t in d.syn_grams_count[2].keys())) == 1:
                self.suspicious_docs.append(doc)
            for n in range(nmax):
                
                self.reg_grams_count[n] += d.reg_grams_count[n]
                self.syn_grams_count[n] += d.syn_grams_count[n]
        
        self.summary = pd.DataFrame(self.summaries)
        del self.summaries

In [64]:
data_root = r'..\datasets\2_txtalb_Novel450'
files = [os.path.join(data_root, f) for f in os.listdir(data_root) if f.startswith('EN_')][60:70]#110]
nfiles = len(files)

In [None]:
corpus = process_corpus(preprocess.read_pg(filename)[:nlp.max_length] for filename in files)

In [51]:
corpus.summary.describe()

Unnamed: 0,avg_path_length,avg_sentence_length,n_dep_paths,n_reg_0grams,n_reg_1grams,n_reg_2grams,n_sentences,n_syn_0grams,n_syn_1grams,n_syn_2grams,n_tokens
count,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0,50.0
mean,2.519644,17.066189,9994.14,11011.62,40614.58,44793.9,9124.12,4340.66,6227.3,3176.52,154311.8
std,0.172444,3.436188,4544.271438,4281.427193,18464.718003,20714.481981,4196.700816,1618.863366,2943.207678,1615.770673,71662.083293
min,2.226048,12.260053,1840.0,2939.0,7737.0,8596.0,1594.0,1023.0,899.0,392.0,31418.0
25%,2.362711,14.37867,5330.75,6774.25,22410.0,24318.5,5126.25,2911.25,3342.5,1612.75,79716.75
50%,2.532044,17.02979,11770.0,12594.5,48981.0,52931.5,9928.0,5148.0,7305.0,3663.0,181893.5
75%,2.618959,19.475784,13507.5,14615.0,55597.75,61393.0,12349.5,5609.75,8568.25,4357.0,217958.0
max,2.915258,28.908506,17227.0,16267.0,63207.0,69075.0,17791.0,6436.0,10846.0,6123.0,231944.0


In [52]:
corpus.syn_grams_count[2].most_common(50)

[(('Dunbar', 'Dunbar', 'Balderby'), 14),
 (('office', '-', 'post'), 13),
 (('bid', 'night', 'good'), 10),
 (('bid', 'bye', 'good'), 9),
 (('wish', 'night', 'good'), 8),
 (('house', 'Dunbar', 'Dunbar'), 8),
 (('look', 'round', 'room'), 7),
 (('tête', 'tête', 'à'), 7),
 (('door', 'room', 'drawing'), 6),
 (('open', 'door', 'room'), 6),
 (('till', 'time', 'dinner'), 6),
 (('window', 'room', 'drawing'), 6),
 (('look', 'like', 'man'), 6),
 (('door', 'room', 'dining'), 5),
 (('finger', 'hand', 'left'), 5),
 (('door', 'lead', 'room'), 5),
 (('look', 'shake', 'head'), 5),
 (('see', 'day', 'better'), 5),
 (('man', 'woman', 'child'), 5),
 (('burst', 'flood', 'tear'), 5),
 (('room', 'floor', 'ground'), 5),
 (('look', 'look', 'look'), 5),
 (('look', 'expression', 'face'), 5),
 (('turn', 'look', 'face'), 4),
 (('window', 'room', 'dining'), 4),
 (('like', 'friend', 'old'), 4),
 (('like', 'beast', 'wild'), 4),
 (('turn', 'walk', 'away'), 4),
 (('like', 'child', 'little'), 4),
 (('wish', 'bye', 'good')

In [17]:
data_root = r'..\datasets\2_txtalb_Novel450'
files = [os.path.join(data_root, f) for f in os.listdir(data_root) if f.startswith('EN_')][60:110]
        
def all_ngrams(files):
    texts = (preprocess.read_pg(filename)[:nlp.max_length] for filename in files)
    counter = Counter()
    disabled = ['tagger','parser','ner','entity_ruler','sentencizer','merge_entities']
    for doc in [nlp(txt, disable=disabled) for txt in texts]:#(nlp.pipe(texts, batch_size=2, disable=disabled)):
        c = process_rgrams(doc)
        counter += c.reg_grams_count[2]
    return counter

c = all_ngrams(files)
c.most_common(50)

[(('Mrs.', 'Dormer', 'Smith'), 245),
 (('said', 'St.', 'Clare'), 148),
 (('said', 'Miss', 'Ophelia'), 124),
 (('said', 'Lady', 'Laura'), 119),
 (('Mrs.', 'Orton', 'Beg'), 116),
 (('Mrs.', 'Le', 'Marchant'), 97),
 (('Madame', 'Max', 'Goesler'), 78),
 (('Mr.', 'Fane', 'Smith'), 78),
 (('said', 'Mrs.', 'Edmonstone'), 77),
 (('said', 'Mr.', 'Tulliver'), 75),
 (('said', 'Mr.', 'Lorry'), 73),
 (('Mrs.', 'Guthrie', 'Brimston'), 69),
 (('Sir', 'Percival', 'Glyde'), 67),
 (('Mr.', 'Dormer', 'Smith'), 66),
 (('Dr.', 'Van', 'Helsing'), 65),
 (('said', 'Mrs.', 'Tulliver'), 63),
 (('said', 'Mr.', 'Jaggers'), 62),
 (('replied', 'Dr.', 'Leete'), 57),
 (('said', 'Charles', 'Osmond'), 55),
 (('said', 'old', 'man'), 54),
 (('Mrs.', 'Fane', 'Smith'), 54),
 (('said', 'Mr.', 'Hale'), 52),
 (('tête', 'à', 'tête'), 52),
 (('Mr.', 'St.', 'John'), 52),
 (('said', 'young', 'man'), 49),
 (('said', 'Lord', 'Henry'), 47),
 (('said', 'low', 'voice'), 46),
 (('said', 'Mr', 'Slope'), 45),
 (('said', 'Mrs.', 'Glegg'),

In [None]:
[x[:20] for x in (preprocess.read_pg(filename)[:nlp.max_length] for filename in files)]

In [None]:
dep_cnt = Counter(t.dep_ for t in doc)
dep_cnt

In [None]:
content_dep_cnt = Counter(t.dep_ for t in doc if t.dep_ not in FUNCTIONAL_DEPS)
sum(content_dep_cnt.values()), sum((dep_cnt - content_dep_cnt).values()), sum((dep_cnt).values())

In [None]:
from itertools import islice

sents = {}
current_sent_start = None
for gram in islice(grams, 50):
    sent = gram[0].sent
    if sent.start != current_sent_start:
        current_sent_start = sent.start
        print(current_sent_start, " ".join(t.text for t in sent if not t.is_space))
        sents[current_sent_start] = sent
    gram_pos = [t.pos_ for t in gram]
    gram_dep = [t.dep_ for t in gram]
    print(gram)
    print(" "*6, gram_dep)
    print(" "*6, gram_pos)

In [None]:
from spacy import displacy
displacy.render(sents[3], style="dep")

In [None]:
"The family of Dashwood had long been settled in Sussex ."
['settle', 'family', 'of', 'Dashwood']
['settle', 'in', 'Sussex']

In [None]:
list(sents[3][7].ancestors), sents[3][7].dep_
list(sents[3][3].ancestors), sents[3][7].dep_

In [None]:
list(sents[3].root.subtree)

In [None]:
def ancestors(tok):
    while tok.head != tok:
        tok = tok.head
        yield tok
    if tok.head != tok:
        yield tok

print("--- custom:")
for t in sents[3]:
    print(list(ancestors(t))[::-1], t.dep_, t)

print("--- spacy:")
for t in sents[3]:
    print(list(t.ancestors)[::-1], t.dep_, t)

In [None]:
displacy.render(nlp("He didn't even look at her."), style="dep")

In [None]:
a = []
a.insert(0, 42)
a.insert(0, 43)
a

In [None]:
help(list)

In [None]:
help(range)

In [None]:
list(range(2,8))

In [None]:
sents = list(doc.sents)

In [None]:
list(sents[1])

In [None]:
help(set)

In [None]:
help(slice)

In [None]:
[(t.text, t.dep_, t.pos_) for t in nlp('Alice was going up the stairs.')]

In [None]:
x = 4
for i in range(2, x):
    print(i)
else:
    print('empty')

In [None]:
[f for f in os.listdir(r'..\datasets\2_txtalb_Novel450') if f.startswith('EN_')]

In [None]:
len([f for f in os.listdir(r'..\datasets\2_txtalb_Novel450') if f.startswith('EN_')])

In [None]:
"{:.6f}s".format(time.perf_counter() - time.perf_counter())

In [None]:
a = time.perf_counter()

In [None]:
help(nlp.pipe)

In [None]:
help(doc)

In [None]:
{k for k in globals().keys() if not k.startswith('_')}

In [41]:
class Toto:pass

t = Toto()
t.a = 42
t2 = Toto()
try:
    print(t.a, t2.a)
except Exception as e:
    print(e)
t2.a = 43
print(t.a, t2.a)

'Toto' object has no attribute 'a'
42 43


In [61]:
x=np.triu(np.random.random([6,6])*2-1)
x-x.T*(1+np.random.random([6,6])*.2-.1)

array([[-0.08667145,  0.96247106, -0.90505545, -0.67014953,  0.68932609,
        -0.08000127],
       [-0.9814764 , -0.01658201,  0.45627636,  0.13262529,  0.79202124,
        -0.55723361],
       [ 0.94989781, -0.47164804,  0.06555926, -0.21102938, -0.27916941,
         0.94720727],
       [ 0.68735926, -0.14219569,  0.20014977,  0.05008628,  0.39812897,
         0.89753264],
       [-0.74043738, -0.79034741,  0.2726128 , -0.38621711,  0.00357159,
         0.61631072],
       [ 0.0760054 ,  0.51077689, -1.01802278, -0.94650462, -0.63429054,
        -0.00322389]])

In [62]:
import networkx