# Generator of (inclusive) (extended) (skippy) n-grams out of words or sentences

developed by Kow Kuroda (kow.kuroda@gmail.com)

Limitations
- Cython enhancement is effective only on non-Apple-Silicon machines with Python 3.10, 3.11, and 3.12.

Creation
- 2025/08/19

Modifications
- None

# Set up data

In [1]:
## unit of analysis: True for words, False for sentential/phrasal objects
analyze_words: bool = True

In [2]:
import pathlib
## choose the data directory and the source file for the selected unit of analysis
if analyze_words:
    data_dir = 'data/words'
    file_pattern = 'buddhist-listed2.txt'
else:
    data_dir = 'data/phrases'
    file_pattern = 'austen-j-sample100.txt'
## sorted() keeps the choice of files[0] stable if a pattern ever matches several files
files = sorted(pathlib.Path(data_dir).glob(file_pattern))
print(files)

## fail with a clear message instead of a bare IndexError when nothing matches
if not files:
    raise FileNotFoundError(f"no file matching '{file_pattern}' found in '{data_dir}'")
file = files[0]
## the file name without its extension is used to name the saved results
source_name = file.stem
print(f"source_name: {source_name}")

[PosixPath('data/words/buddhist-listed2.txt')]
source_name: buddhist-listed2


In [3]:
## read the source file, one doc per line
raw_lines = file.read_text(encoding = 'utf-8').splitlines()

## discard empty lines and lowercase the remaining ones
docs = [ line.lower() for line in raw_lines if line ]
print(docs[:10])

['阿羅漢', '辟支仏', '転法輪', '十二因縁', '五蘊盛苦', '三法印', '四念処', '四神足', '五根五力', '七覚支']


# Set up Cython

In [4]:
#conda update -n base -c defaults conda -y

In [5]:
## Install Cython (if necessary)
#!conda uninstall cython -y # seems necessary in certain situations
#!conda install cython -y
## Try the following if the above fails
#!pip install cython --upgrade --force-reinstall
#!conda update -n base -c defaults conda -y

In [6]:
#!pip show cython

In [7]:
## whether to use Cython
use_Cython: bool = True

In [8]:
## Run the following only once before you use Cython.
## NOTE: the cleanup removes build/, dist/ and every *.c and *.so file in the
## current working directory, not only the files generated for gen_ngrams_cy.
## NOTE(review): `!python` runs whichever `python` is first on PATH — presumably
## the kernel's interpreter; confirm when several environments are installed.
if use_Cython:
    !rm -rf build dist *.c *.so
    !python setup.py clean build_ext --inplace

Compiling gen_ngrams_cy.py because it changed.
[1/1] Cythonizing gen_ngrams_cy.py
  tree = Parsing.p_module(s, pxd, full_module_name)
running clean
running build_ext
building 'gen_ngrams_cy' extension
creating build
creating build/temp.macosx-10.15-x86_64-cpython-311
clang -DNDEBUG -fwrapv -O2 -Wall -fPIC -O2 -isystem /Users/kowk/opt/anaconda3/include -fPIC -O2 -isystem /Users/kowk/opt/anaconda3/include -I/usr/local/include -I/usr/local/opt/llvm/include -I/Users/kowk/opt/anaconda3/include/python3.11 -c gen_ngrams_cy.c -o build/temp.macosx-10.15-x86_64-cpython-311/gen_ngrams_cy.o
creating build/lib.macosx-10.15-x86_64-cpython-311
clang -bundle -undefined dynamic_lookup -Wl,-rpath,/Users/kowk/opt/anaconda3/lib -L/Users/kowk/opt/anaconda3/lib -Wl,-rpath,/Users/kowk/opt/anaconda3/lib -L/Users/kowk/opt/anaconda3/lib -L/Library/Developer/CommandLineTools/SDKs/MacOSX.sdk/usr/lib -L/usr/local/lib -L/usr/local/opt/llvm/lib -I/usr/local/include -I/usr/local/opt/llvm/include build/temp.macosx-10.

In [9]:
## Select the n-gram generator module: gen_ngrams_cy (built by setup.py) when
## use_Cython is set, gen_ngrams otherwise. Either way the module is bound to the
## name gen_ngrams, which is the name the generation cells call.
if use_Cython:
    ## load the IPython Cython extension; %load_ext is the fallback on ImportError
    ## NOTE(review): the import below does not obviously need this extension — confirm
    try:
        %reload_ext Cython
    except ImportError:
        %load_ext Cython
    ## The following works with 3.10 on M1, but not with 3.11, 3.12 on Apple Silicons
    import gen_ngrams_cy as gen_ngrams
else:
    import gen_ngrams

# Generation of (extended) (skippy) n-grams

In [10]:
## parameters
## segmenter: pattern given to re.split to cut a doc into segments
##   (the empty pattern cuts between every character, Python 3.7+)
## sep_local: separator handed to the n-gram generators as sep
segmenter: str = r"" if analyze_words else r" "
sep_local: str = "" if analyze_words else " "

In [11]:
## flags
## check: print the generated n-grams while they are produced
## generated_as_string / generated_as_list: mutually exclusive output forms;
## generated_as_list is what gets passed to the generators as as_list
check: bool = False
generated_as_string: bool = True
generated_as_list: bool = not generated_as_string

In [12]:
## saving results
## save_results: write the result tables to CSV files when True
## save_dir: directory (relative to the working directory) that receives the CSV files
save_results: bool = False
save_dir: str = "saves"

In [13]:
#!conda install pandas -y

In [14]:
## largest n for which n-grams are generated (orders 1 .. max_n_for_ngram)
max_n_for_ngram: int = 5

In [15]:
import pandas as pd
## column layout of the result table: the doc itself, then one column per order
## for regular ("<n>g"), skippy ("sk<n>g") and extended skippy ("xsk<n>g") n-grams
ngram_orders = range(1, max_n_for_ngram + 1)
columns0 = ['doc']
columns1, columns2, columns3 = (
    [ f"{prefix}{n}g" for n in ngram_orders ] for prefix in ("", "sk", "xsk")
)

used_columns = [*columns0, *columns1, *columns2, *columns3]
df = pd.DataFrame(columns = used_columns)

In [16]:
## generate regular n-grams
import re
ngram_orders = range(1, max_n_for_ngram + 1)
for i, doc in enumerate(docs):
    ## register the doc itself in its own row
    df.loc[i,'doc'] = doc
    print(f"Processing word {i}: {doc} [use_Cython: {use_Cython}]")
    ## cut the doc into segments, discarding empty pieces
    word_segs = [ piece for piece in re.split(segmenter, doc) if piece ]
    ## docs with fewer than two segments get no n-grams
    if len(word_segs) < 2:
        continue
    for j in ngram_orders:
        print(f"generating {j}-grams ...")
        ngrams = gen_ngrams.gen_ngrams(word_segs, j, sep = sep_local, as_list = generated_as_list, check = False)
        if check:
            print(ngrams)
        ## store the n-grams of order j in the row of this doc
        df.loc[i, f'{j}g'] = ngrams

Processing word 0: 阿羅漢 [use_Cython: True]
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
generating 5-grams ...
Processing word 1: 辟支仏 [use_Cython: True]
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
generating 5-grams ...
Processing word 2: 転法輪 [use_Cython: True]
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
generating 5-grams ...
Processing word 3: 十二因縁 [use_Cython: True]
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
generating 5-grams ...
Processing word 4: 五蘊盛苦 [use_Cython: True]
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
generating 5-grams ...
Processing word 5: 三法印 [use_Cython: True]
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
generating 5-grams ...
Processing word 6: 四念処 [use_Cython: True]
generating 1-g

In [17]:
## table after the regular n-gram pass (sk*/xsk* columns are not filled yet)
df

Unnamed: 0,doc,1g,2g,3g,4g,5g,sk1g,sk2g,sk3g,sk4g,sk5g,xsk1g,xsk2g,xsk3g,xsk4g,xsk5g
0,阿羅漢,"[阿, 羅, 漢]","[阿羅, 羅漢]",[阿羅漢],[阿羅漢],[阿羅漢],,,,,,,,,,
1,辟支仏,"[辟, 支, 仏]","[辟支, 支仏]",[辟支仏],[辟支仏],[辟支仏],,,,,,,,,,
2,転法輪,"[転, 法, 輪]","[転法, 法輪]",[転法輪],[転法輪],[転法輪],,,,,,,,,,
3,十二因縁,"[十, 二, 因, 縁]","[十二, 二因, 因縁]","[十二因, 二因縁]",[十二因縁],[十二因縁],,,,,,,,,,
4,五蘊盛苦,"[五, 蘊, 盛, 苦]","[五蘊, 蘊盛, 盛苦]","[五蘊盛, 蘊盛苦]",[五蘊盛苦],[五蘊盛苦],,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,両祖忌法要,"[両, 祖, 忌, 法, 要]","[両祖, 祖忌, 忌法, 法要]","[両祖忌, 祖忌法, 忌法要]","[両祖忌法, 祖忌法要]",[両祖忌法要],,,,,,,,,,
196,宗祖忌法要,"[宗, 祖, 忌, 法, 要]","[宗祖, 祖忌, 忌法, 法要]","[宗祖忌, 祖忌法, 忌法要]","[宗祖忌法, 祖忌法要]",[宗祖忌法要],,,,,,,,,,
197,御会式法要,"[御, 会, 式, 法, 要]","[御会, 会式, 式法, 法要]","[御会式, 会式法, 式法要]","[御会式法, 会式法要]",[御会式法要],,,,,,,,,,
198,報恩講法要,"[報, 恩, 講, 法, 要]","[報恩, 恩講, 講法, 法要]","[報恩講, 恩講法, 講法要]","[報恩講法, 恩講法要]",[報恩講法要],,,,,,,,,,


In [18]:
## generate skippy n-grams
import re
ngram_orders = range(1, max_n_for_ngram + 1)
for i, doc in enumerate(docs):
    print(f"Processing word {i}: {doc} [use_Cython: {use_Cython}]")
    ## cut the doc into segments, discarding empty pieces
    word_segs = [ piece for piece in re.split(segmenter, doc) if piece ]
    ## docs with fewer than two segments get no skippy n-grams
    if len(word_segs) < 2:
        continue
    for j in ngram_orders:
        print(f"generating skippy {j}-grams ...")
        ngrams = gen_ngrams.gen_skippy_ngrams(word_segs, j, sep = sep_local, as_list = generated_as_list, check = False)
        if check:
            print(ngrams)
        ## store the skippy n-grams of order j in the row of this doc
        df.loc[i, f'sk{j}g'] = ngrams

Processing word 0: 阿羅漢 [use_Cython: True]
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
generating skippy 5-grams ...
Processing word 1: 辟支仏 [use_Cython: True]
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
generating skippy 5-grams ...
Processing word 2: 転法輪 [use_Cython: True]
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
generating skippy 5-grams ...
Processing word 3: 十二因縁 [use_Cython: True]
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
generating skippy 5-grams ...
Processing word 4: 五蘊盛苦 [use_Cython: True]
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
generating skippy 5-grams ...
Processing word 5: 三法印 [use_Cython: Tr

In [19]:
## table after the skippy n-gram pass (xsk* columns are not filled yet)
df

Unnamed: 0,doc,1g,2g,3g,4g,5g,sk1g,sk2g,sk3g,sk4g,sk5g,xsk1g,xsk2g,xsk3g,xsk4g,xsk5g
0,阿羅漢,"[阿, 羅, 漢]","[阿羅, 羅漢]",[阿羅漢],[阿羅漢],[阿羅漢],"[阿, 羅, 漢]","[阿羅, 阿…漢, 羅漢]",[阿羅漢],[阿羅漢],[阿羅漢],,,,,
1,辟支仏,"[辟, 支, 仏]","[辟支, 支仏]",[辟支仏],[辟支仏],[辟支仏],"[辟, 支, 仏]","[辟支, 辟…仏, 支仏]",[辟支仏],[辟支仏],[辟支仏],,,,,
2,転法輪,"[転, 法, 輪]","[転法, 法輪]",[転法輪],[転法輪],[転法輪],"[転, 法, 輪]","[転法, 転…輪, 法輪]",[転法輪],[転法輪],[転法輪],,,,,
3,十二因縁,"[十, 二, 因, 縁]","[十二, 二因, 因縁]","[十二因, 二因縁]",[十二因縁],[十二因縁],"[十, 二, 因, 縁]","[十二, 十…因, 十…縁, 二因, 二…縁, 因縁]","[十二因, 十二…縁, 十…因縁, 二因縁]",[十二因縁],[十二因縁],,,,,
4,五蘊盛苦,"[五, 蘊, 盛, 苦]","[五蘊, 蘊盛, 盛苦]","[五蘊盛, 蘊盛苦]",[五蘊盛苦],[五蘊盛苦],"[五, 蘊, 盛, 苦]","[五蘊, 五…盛, 五…苦, 蘊盛, 蘊…苦, 盛苦]","[五蘊盛, 五蘊…苦, 五…盛苦, 蘊盛苦]",[五蘊盛苦],[五蘊盛苦],,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,両祖忌法要,"[両, 祖, 忌, 法, 要]","[両祖, 祖忌, 忌法, 法要]","[両祖忌, 祖忌法, 忌法要]","[両祖忌法, 祖忌法要]",[両祖忌法要],"[両, 祖, 忌, 法, 要]","[両祖, 両…忌, 両…法, 両…要, 祖忌, 祖…法, 祖…要, 忌法, 忌…要, 法要]","[両祖忌, 両祖…法, 両祖…要, 両…忌法, 両…忌…要, 両…法要, 祖忌法, 祖忌…要...","[両祖忌法, 両祖忌…要, 両祖…法要, 両…忌法要, 祖忌法要]",[両祖忌法要],,,,,
196,宗祖忌法要,"[宗, 祖, 忌, 法, 要]","[宗祖, 祖忌, 忌法, 法要]","[宗祖忌, 祖忌法, 忌法要]","[宗祖忌法, 祖忌法要]",[宗祖忌法要],"[宗, 祖, 忌, 法, 要]","[宗祖, 宗…忌, 宗…法, 宗…要, 祖忌, 祖…法, 祖…要, 忌法, 忌…要, 法要]","[宗祖忌, 宗祖…法, 宗祖…要, 宗…忌法, 宗…忌…要, 宗…法要, 祖忌法, 祖忌…要...","[宗祖忌法, 宗祖忌…要, 宗祖…法要, 宗…忌法要, 祖忌法要]",[宗祖忌法要],,,,,
197,御会式法要,"[御, 会, 式, 法, 要]","[御会, 会式, 式法, 法要]","[御会式, 会式法, 式法要]","[御会式法, 会式法要]",[御会式法要],"[御, 会, 式, 法, 要]","[御会, 御…式, 御…法, 御…要, 会式, 会…法, 会…要, 式法, 式…要, 法要]","[御会式, 御会…法, 御会…要, 御…式法, 御…式…要, 御…法要, 会式法, 会式…要...","[御会式法, 御会式…要, 御会…法要, 御…式法要, 会式法要]",[御会式法要],,,,,
198,報恩講法要,"[報, 恩, 講, 法, 要]","[報恩, 恩講, 講法, 法要]","[報恩講, 恩講法, 講法要]","[報恩講法, 恩講法要]",[報恩講法要],"[報, 恩, 講, 法, 要]","[報恩, 報…講, 報…法, 報…要, 恩講, 恩…法, 恩…要, 講法, 講…要, 法要]","[報恩講, 報恩…法, 報恩…要, 報…講法, 報…講…要, 報…法要, 恩講法, 恩講…要...","[報恩講法, 報恩講…要, 報恩…法要, 報…講法要, 恩講法要]",[報恩講法要],,,,,


In [20]:
## generate extended skippy n-grams
import re
ngram_orders = range(1, max_n_for_ngram + 1)
for i, doc in enumerate(docs):
    print(f"Processing word {i}: {doc} [use_Cython: {use_Cython}]")
    ## cut the doc into segments, discarding empty pieces
    word_segs = [ piece for piece in re.split(segmenter, doc) if piece ]
    ## docs with fewer than two segments get no extended skippy n-grams
    if len(word_segs) < 2:
        continue
    for j in ngram_orders:
        print(f"generating extended skippy {j}-grams ...")
        ngrams = gen_ngrams.gen_extended_skippy_ngrams(word_segs, j, sep = sep_local, as_list = generated_as_list, check = False)
        if check:
            print(ngrams)
        ## store the extended skippy n-grams of order j in the row of this doc
        df.loc[i, f'xsk{j}g'] = ngrams

Processing word 0: 阿羅漢 [use_Cython: True]
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
generating extended skippy 5-grams ...
Processing word 1: 辟支仏 [use_Cython: True]
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
generating extended skippy 5-grams ...
Processing word 2: 転法輪 [use_Cython: True]
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
generating extended skippy 5-grams ...
Processing word 3: 十二因縁 [use_Cython: True]
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
generating extended skippy 5-grams ...
Processing word 4: 五蘊盛苦 [use_Cython: True]
generati

In [21]:
## result: table with regular, skippy and extended skippy n-grams
df

Unnamed: 0,doc,1g,2g,3g,4g,5g,sk1g,sk2g,sk3g,sk4g,sk5g,xsk1g,xsk2g,xsk3g,xsk4g,xsk5g
0,阿羅漢,"[阿, 羅, 漢]","[阿羅, 羅漢]",[阿羅漢],[阿羅漢],[阿羅漢],"[阿, 羅, 漢]","[阿羅, 阿…漢, 羅漢]",[阿羅漢],[阿羅漢],[阿羅漢],"[阿…, 羅…, 漢]","[阿羅…, 阿…漢, 羅漢]",[阿羅漢],[阿羅漢],[阿羅漢]
1,辟支仏,"[辟, 支, 仏]","[辟支, 支仏]",[辟支仏],[辟支仏],[辟支仏],"[辟, 支, 仏]","[辟支, 辟…仏, 支仏]",[辟支仏],[辟支仏],[辟支仏],"[辟…, 支…, 仏]","[辟支…, 辟…仏, 支仏]",[辟支仏],[辟支仏],[辟支仏]
2,転法輪,"[転, 法, 輪]","[転法, 法輪]",[転法輪],[転法輪],[転法輪],"[転, 法, 輪]","[転法, 転…輪, 法輪]",[転法輪],[転法輪],[転法輪],"[転…, 法…, 輪]","[転法…, 転…輪, 法輪]",[転法輪],[転法輪],[転法輪]
3,十二因縁,"[十, 二, 因, 縁]","[十二, 二因, 因縁]","[十二因, 二因縁]",[十二因縁],[十二因縁],"[十, 二, 因, 縁]","[十二, 十…因, 十…縁, 二因, 二…縁, 因縁]","[十二因, 十二…縁, 十…因縁, 二因縁]",[十二因縁],[十二因縁],"[十…, 二…, 因…, 縁]","[十二…, 十…因…, 十…縁, 二因…, 二…縁, 因縁]","[十…二因…, 十…二…縁, 十…因縁, 二…因縁]",[十二因縁],[十二因縁]
4,五蘊盛苦,"[五, 蘊, 盛, 苦]","[五蘊, 蘊盛, 盛苦]","[五蘊盛, 蘊盛苦]",[五蘊盛苦],[五蘊盛苦],"[五, 蘊, 盛, 苦]","[五蘊, 五…盛, 五…苦, 蘊盛, 蘊…苦, 盛苦]","[五蘊盛, 五蘊…苦, 五…盛苦, 蘊盛苦]",[五蘊盛苦],[五蘊盛苦],"[五…, 蘊…, 盛…, 苦]","[五蘊…, 五…盛…, 五…苦, 蘊盛…, 蘊…苦, 盛苦]","[五…蘊盛…, 五…蘊…苦, 五…盛苦, 蘊…盛苦]",[五蘊盛苦],[五蘊盛苦]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,両祖忌法要,"[両, 祖, 忌, 法, 要]","[両祖, 祖忌, 忌法, 法要]","[両祖忌, 祖忌法, 忌法要]","[両祖忌法, 祖忌法要]",[両祖忌法要],"[両, 祖, 忌, 法, 要]","[両祖, 両…忌, 両…法, 両…要, 祖忌, 祖…法, 祖…要, 忌法, 忌…要, 法要]","[両祖忌, 両祖…法, 両祖…要, 両…忌法, 両…忌…要, 両…法要, 祖忌法, 祖忌…要...","[両祖忌法, 両祖忌…要, 両祖…法要, 両…忌法要, 祖忌法要]",[両祖忌法要],"[両…, 祖…, 忌…, 法…, 要]","[両祖…, 両…忌…, 両…法…, 両…要, 祖忌…, 祖…法…, 祖…要, 忌法…, 忌…...","[両…祖忌…, 両…祖…法…, 両…祖…要, 両…忌法…, 両…忌…要, 両…法要, 祖…忌...","[両…祖…忌法…, 両…祖…忌…要, 両…祖…法要, 両…忌…法要, 祖…忌…法要]",[両祖忌法要]
196,宗祖忌法要,"[宗, 祖, 忌, 法, 要]","[宗祖, 祖忌, 忌法, 法要]","[宗祖忌, 祖忌法, 忌法要]","[宗祖忌法, 祖忌法要]",[宗祖忌法要],"[宗, 祖, 忌, 法, 要]","[宗祖, 宗…忌, 宗…法, 宗…要, 祖忌, 祖…法, 祖…要, 忌法, 忌…要, 法要]","[宗祖忌, 宗祖…法, 宗祖…要, 宗…忌法, 宗…忌…要, 宗…法要, 祖忌法, 祖忌…要...","[宗祖忌法, 宗祖忌…要, 宗祖…法要, 宗…忌法要, 祖忌法要]",[宗祖忌法要],"[宗…, 祖…, 忌…, 法…, 要]","[宗祖…, 宗…忌…, 宗…法…, 宗…要, 祖忌…, 祖…法…, 祖…要, 忌法…, 忌…...","[宗…祖忌…, 宗…祖…法…, 宗…祖…要, 宗…忌法…, 宗…忌…要, 宗…法要, 祖…忌...","[宗…祖…忌法…, 宗…祖…忌…要, 宗…祖…法要, 宗…忌…法要, 祖…忌…法要]",[宗祖忌法要]
197,御会式法要,"[御, 会, 式, 法, 要]","[御会, 会式, 式法, 法要]","[御会式, 会式法, 式法要]","[御会式法, 会式法要]",[御会式法要],"[御, 会, 式, 法, 要]","[御会, 御…式, 御…法, 御…要, 会式, 会…法, 会…要, 式法, 式…要, 法要]","[御会式, 御会…法, 御会…要, 御…式法, 御…式…要, 御…法要, 会式法, 会式…要...","[御会式法, 御会式…要, 御会…法要, 御…式法要, 会式法要]",[御会式法要],"[御…, 会…, 式…, 法…, 要]","[御会…, 御…式…, 御…法…, 御…要, 会式…, 会…法…, 会…要, 式法…, 式…...","[御…会式…, 御…会…法…, 御…会…要, 御…式法…, 御…式…要, 御…法要, 会…式...","[御…会…式法…, 御…会…式…要, 御…会…法要, 御…式…法要, 会…式…法要]",[御会式法要]
198,報恩講法要,"[報, 恩, 講, 法, 要]","[報恩, 恩講, 講法, 法要]","[報恩講, 恩講法, 講法要]","[報恩講法, 恩講法要]",[報恩講法要],"[報, 恩, 講, 法, 要]","[報恩, 報…講, 報…法, 報…要, 恩講, 恩…法, 恩…要, 講法, 講…要, 法要]","[報恩講, 報恩…法, 報恩…要, 報…講法, 報…講…要, 報…法要, 恩講法, 恩講…要...","[報恩講法, 報恩講…要, 報恩…法要, 報…講法要, 恩講法要]",[報恩講法要],"[報…, 恩…, 講…, 法…, 要]","[報恩…, 報…講…, 報…法…, 報…要, 恩講…, 恩…法…, 恩…要, 講法…, 講…...","[報…恩講…, 報…恩…法…, 報…恩…要, 報…講法…, 報…講…要, 報…法要, 恩…講...","[報…恩…講法…, 報…恩…講…要, 報…恩…法要, 報…講…法要, 恩…講…法要]",[報恩講法要]


In [22]:
## save the combined table (regular, skippy and extended skippy n-grams) as CSV
if save_results:
    ## to_csv does not create missing directories, so make sure save_dir exists
    pathlib.Path(save_dir).mkdir(parents = True, exist_ok = True)
    file_name = f"{save_dir}/{source_name}-simple-reg-sk-xsk-df.csv"
    df.to_csv(file_name, header = True)

# Creating inclusive versions

In [23]:
## create inclusive extended skippy ngrams
## The inclusive column of order i holds the extended skippy i-grams plus every
## item of the inclusive column of order i-1 that is not among them.
import pandas as pd
inclusive_xsk_df = pd.DataFrame()
## debug printing is off for this cell (this rebinds the notebook-wide check flag)
check = False
for i in range(2, max_n_for_ngram + 1):
    source_col, target_col = f"xsk{i-1}g", f"xsk{i}g"
    ## order 2 starts from the plain table; higher orders build on the inclusive
    ## column produced in the previous iteration
    if i == 2:
        S = df[source_col]
    else:
        S = inclusive_xsk_df[source_col]
    T = df[target_col]
    ##
    U = []
    for s, t in zip(S, T):
        try:
            ## lower-order items that are missing from the current order
            r = [ x for x in s if x not in t ]
            u = t + r if len(r) > 0 else t
        except TypeError:
            ## s or t is not a list (e.g., NaN for a doc that got no n-grams):
            ## keep the target value rather than reusing u from the previous row
            u = t
        else:
            if check:
                print(f"r: {sorted(r)}")
                print(f"u: {sorted(u)}")
        ##
        U.append(u)
    ##
    inclusive_xsk_df[target_col] = U

## save the inclusive table itself (not the plain table df)
if save_results:
    ## to_csv does not create missing directories, so make sure save_dir exists
    pathlib.Path(save_dir).mkdir(parents = True, exist_ok = True)
    file_name = f"{save_dir}/{source_name}-inclusive-xsk-df.csv"
    inclusive_xsk_df.to_csv(file_name, header = True, index = False)

## last expression of the cell, so that the table is displayed
inclusive_xsk_df

In [24]:
## create inclusive skippy ngrams
## The inclusive column of order i holds the skippy i-grams plus every item of
## the inclusive column of order i-1 that is not among them.
import pandas as pd
inclusive_sk_df = pd.DataFrame()
## debug printing is off for this cell (this rebinds the notebook-wide check flag)
check = False
for i in range(2, max_n_for_ngram + 1):
    source_col, target_col = f"sk{i-1}g", f"sk{i}g"
    ## order 2 starts from the plain table; higher orders build on the inclusive
    ## column produced in the previous iteration
    if i == 2:
        S = df[source_col]
    else:
        S = inclusive_sk_df[source_col]
    T = df[target_col]
    ##
    U = []
    for s, t in zip(S, T):
        try:
            ## lower-order items that are missing from the current order
            r = [ x for x in s if x not in t ]
            u = t + r if len(r) > 0 else t
        except TypeError:
            ## s or t is not a list (e.g., NaN for a doc that got no n-grams):
            ## keep the target value rather than reusing u from the previous row
            u = t
        else:
            if check:
                print(f"r: {sorted(r)}")
                print(f"u: {sorted(u)}")
        ##
        U.append(u)
    ##
    inclusive_sk_df[target_col] = U

## save the inclusive table itself (not the plain table df)
if save_results:
    ## to_csv does not create missing directories, so make sure save_dir exists
    pathlib.Path(save_dir).mkdir(parents = True, exist_ok = True)
    file_name = f"{save_dir}/{source_name}-inclusive-sk-df.csv"
    inclusive_sk_df.to_csv(file_name, header = True, index = False)

## last expression of the cell, so that the table is displayed
inclusive_sk_df

In [25]:
## create inclusive regular ngrams
## The inclusive column of order i holds the regular i-grams plus every item of
## the inclusive column of order i-1 that is not among them.
import pandas as pd
inclusive_df = pd.DataFrame()
## debug printing is off for this cell (this rebinds the notebook-wide check flag)
check = False
for i in range(2, max_n_for_ngram + 1):
    source_col, target_col = f"{i-1}g", f"{i}g"
    ## order 2 starts from the plain table; higher orders build on the inclusive
    ## column produced in the previous iteration
    if i == 2:
        S = df[source_col]
    else:
        S = inclusive_df[source_col]
    T = df[target_col]
    ##
    U = []
    for s, t in zip(S, T):
        try:
            ## lower-order items that are missing from the current order
            r = [ x for x in s if x not in t ]
            u = t + r if len(r) > 0 else t
        except TypeError:
            ## s or t is not a list (e.g., NaN for a doc that got no n-grams):
            ## keep the target value rather than reusing u from the previous row
            u = t
        else:
            if check:
                print(f"r: {sorted(r)}")
                print(f"u: {sorted(u)}")
        ##
        U.append(u)
    ##
    inclusive_df[target_col] = U

## save the inclusive table itself (not the plain table df)
if save_results:
    ## to_csv does not create missing directories, so make sure save_dir exists
    pathlib.Path(save_dir).mkdir(parents = True, exist_ok = True)
    file_name = f"{save_dir}/{source_name}-inclusive-reg-df.csv"
    inclusive_df.to_csv(file_name, header = True, index = False)

## last expression of the cell, so that the table is displayed
inclusive_df

# end of file