# Generator of (extended) (skippy) n-grams out of words or sentences

developed by Kow Kuroda (kow.kuroda@gmail.com)

This Jupyter notebook demonstrates how to use gen_ngrams.py.

Limitations
- Availability of the Cython enhancement is limited. Apple Silicon chips such as M1 and M2 (M3 has not been tested yet) generally do not accept it, although it does work under Python 3.10 on M1.

Creation
- 2025/08/23

Modifications


# Set up Cython

In [54]:
#conda update -n base -c defaults conda -y

In [55]:
## Installing Cython (only if needed)
#!conda uninstall cython -y # seems necessary in certain situations
#!conda install cython -y
## Try the following if the above fails
#!pip install cython --upgrade --force-reinstall
#!conda update -n base -c defaults conda -y

In [56]:
#!pip show cython

In [57]:
## Whether to use the Cython build of the generator (True) or the pure-Python module (False)
use_Cython: bool = False

In [58]:
## Cython extension の(再)構築が必要な場合は True に
build_Cython_extension = True
if use_Cython and build_Cython_extension:
    !python setup.py clean build_ext --inplace

In [59]:
## Load the generator module: the Cython build when use_Cython is True,
## otherwise the pure-Python module. Either way it is bound to the name
## `gen_ngrams`, which is the name the generation cells call.
## Apple Silicon chips such as M1 and M2 do not accept the following
if use_Cython:
    # (re)load the Cython IPython extension before importing the compiled module
    # NOTE(review): presumably %load_ext fails the same way when %reload_ext raises
    # ImportError — confirm that this fallback is actually needed.
    try:
        %reload_ext Cython
    except ImportError:
        %load_ext Cython
    import gen_ngrams_cy as gen_ngrams
else:
    import gen_ngrams

# Set up data

In [60]:
## Unit of analysis: words when True, sentential/phrasal objects when False
analyze_words = True

## parameters for analysis
## - segmenter: regex handed to re.split() to cut a doc into segments
##   (empty pattern for words, a single space for sentences/phrases)
## - sep_local: passed as `sep` to the n-gram generators
segmenter: str = r"" if analyze_words else r" "
sep_local: str = "" if analyze_words else " "

In [61]:
import pathlib

## pick the data directory and the source file according to the unit of analysis
if analyze_words:
    data_dir = 'data/words'
    file_pattern = 'buddhist-listed2.txt'
else:
    data_dir = 'data/phrases'
    file_pattern = 'austen-j-sample100.txt'
## sorted() keeps files[0] deterministic should the pattern ever match several files
files = sorted(pathlib.Path(data_dir).glob(file_pattern))
##
print(files)

## fail early with a readable message rather than an IndexError on files[0]
if not files:
    raise FileNotFoundError(
        f"no file matching {file_pattern!r} under {data_dir!r} "
        f"(current directory: {pathlib.Path.cwd()})"
    )

## the first match is the source; its stem is used to name the saved results
file = files[0]
source_name = file.stem
print(f"source_name: {source_name}")

[PosixPath('data/words/buddhist-listed2.txt')]
source_name: buddhist-listed2


In [62]:
## get data: one doc per line
## 'utf-8-sig' reads plain UTF-8 as well and drops a leading BOM if there is one
raw_lines = file.read_text(encoding = 'utf-8-sig').splitlines()

## strip surrounding whitespace, drop blank lines, lowercase, then sort by length
## (shortest first). Without the strip, stray spaces end up inside the n-grams.
docs = sorted(( line.strip().lower() for line in raw_lines if line.strip() ), key = len)
print(docs[:10])

['阿羅漢', '辟支仏', '転法輪', '三法印', '四念処', '四神足', '七覚支', '十善戒', '六神通', '不退転']


# Generation of (extended) (skippy) n-grams

In [63]:
## saving results: df is written to a CSV under `save_dir` only when `save_results` is True
save_dir: str = "saves"
save_results: bool = False

## flags: when `check` is True, the regular n-grams are printed as they are generated
check: bool = False

In [64]:
## maximum n: n-grams are generated for n = 1 .. max_n_for_ngram
max_n_for_ngram: int = 4

## max_gap_size: passed as `max_distance` to the skippy generators
max_gap_size: int = 3

## n-gram
ngram_is_inclusive: bool = True
#skippy_means_extended = True

## whether n-grams are generated as strings (True) or as lists (False)
generated_as_string: bool = True
generated_as_list: bool = not generated_as_string

In [65]:
#!conda install pandas -y

In [66]:
import pandas as pd

## Column layout of the result table:
##   columns0: the doc itself
##   columns1: regular n-grams           (1g, 2g, ...)
##   columns2: skippy n-grams            (sk1g, sk2g, ...)
##   columns3: extended skippy n-grams   (xsk1g, xsk2g, ...)
columns0 = ['doc']
columns1, columns2, columns3 = (
    [ f"{prefix}{n}g" for n in range(1, max_n_for_ngram + 1) ]
    for prefix in ("", "sk", "xsk")
)

## empty frame; the generation cells fill it in row by row
used_columns = [*columns0, *columns1, *columns2, *columns3]
df = pd.DataFrame(columns = used_columns)

## Normal (non-skippy) n-grams

In [67]:
## generate non-skippy n-grams
import re
import unicodedata as ud

for i, doc in enumerate(docs):
    ## register the doc itself in row i of df
    df.loc[i,'doc'] = doc
    ##
    print(f"Processing word {i} [use_Cython: {use_Cython}]: {doc}")
    ## NFKC-normalize, split on `segmenter`, and discard empty segments
    ## NOTE(review): the extended skippy cell normalizes with NFC, not NFKC;
    ## confirm which form is intended so that the columns stay comparable.
    normalized_doc = ud.normalize("NFKC", doc)
    word_segs = list(filter(None, re.split(segmenter, normalized_doc)))
    for j in range(1, max_n_for_ngram + 1):
        print(f"generating {j}-grams ...")
        ngrams = gen_ngrams.gen_ngrams(word_segs, j, sep = sep_local, as_list = generated_as_list, check = False)
        ## optional debug print, controlled by the notebook-level `check` flag
        if check:
            print(ngrams)
        ## store the j-grams of this doc in column '<j>g'
        df.loc[i, f'{j}g'] = ngrams

Processing word 0 [use_Cython: False]: 阿羅漢
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
Processing word 1 [use_Cython: False]: 辟支仏
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
Processing word 2 [use_Cython: False]: 転法輪
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
Processing word 3 [use_Cython: False]: 三法印
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
Processing word 4 [use_Cython: False]: 四念処
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
Processing word 5 [use_Cython: False]: 四神足
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
Processing word 6 [use_Cython: False]: 七覚支
generating 1-grams ...
generating 2-grams ...
generating 3-grams ...
generating 4-grams ...
Processing word 7 [use_Cython: False]: 十善戒
generating 1

In [68]:
## display the regular n-gram columns of df
df[columns1]

Unnamed: 0,1g,2g,3g,4g
0,"[阿, 羅, 漢]","[阿羅, 羅漢]",[阿羅漢],[阿羅漢]
1,"[辟, 支, 仏]","[辟支, 支仏]",[辟支仏],[辟支仏]
2,"[転, 法, 輪]","[転法, 法輪]",[転法輪],[転法輪]
3,"[三, 法, 印]","[三法, 法印]",[三法印],[三法印]
4,"[四, 念, 処]","[四念, 念処]",[四念処],[四念処]
...,...,...,...,...
195,"[不, 空, 羂, 索, 観, 音]","[不空, 空羂, 羂索, 索観, 観音]","[不空羂, 空羂索, 羂索観, 索観音]","[不空羂索, 空羂索観, 羂索観音]"
196,"[金, 剛, 界, 曼, 荼, 羅]","[金剛, 剛界, 界曼, 曼荼, 荼羅]","[金剛界, 剛界曼, 界曼荼, 曼荼羅]","[金剛界曼, 剛界曼荼, 界曼荼羅]"
197,"[胎, 蔵, 界, 曼, 荼, 羅]","[胎蔵, 蔵界, 界曼, 曼荼, 荼羅]","[胎蔵界, 蔵界曼, 界曼荼, 曼荼羅]","[胎蔵界曼, 蔵界曼荼, 界曼荼羅]"
198,"[伝, 燈, 大, 阿, 闍, 梨]","[伝燈, 燈大, 大阿, 阿闍, 闍梨]","[伝燈大, 燈大阿, 大阿闍, 阿闍梨]","[伝燈大阿, 燈大阿闍, 大阿闍梨]"


In [69]:
## make n-grams inclusive: for n = 2 .. max_n_for_ngram, add to each doc's n-gram list
## the (n-1)-grams it does not contain yet. Because n runs upward, the supplement is
## already inclusive, so the n-gram list ends up holding all lower orders.
## Skipped when `ngram_is_inclusive` is False.
if ngram_is_inclusive:
    for n in range(2, max_n_for_ngram + 1):
        ## target
        target_name = f"{n}g"
        print(f"target_name: {target_name}")
        ## supplement
        supplement_name = f"{n-1}g"
        print(f"supplement_name: {supplement_name}")
        ## inclusion: the cells of df hold list objects, so extending them in place
        ## updates df itself. The additions are computed before the extend, so
        ## duplicates inside the supplement are carried over as they are.
        for T, S in zip(df[target_name], df[supplement_name]):
            T.extend([ s for s in S if s not in T ])

target_name: 2g
supplement_name: 1g
target_name: 3g
supplement_name: 2g
target_name: 4g
supplement_name: 3g


In [70]:
## display the doc column together with the regular n-gram columns
df[columns0 +columns1]

Unnamed: 0,doc,1g,2g,3g,4g
0,阿羅漢,"[阿, 羅, 漢]","[阿羅, 羅漢, 阿, 羅, 漢]","[阿羅漢, 阿羅, 羅漢, 阿, 羅, 漢]","[阿羅漢, 阿羅, 羅漢, 阿, 羅, 漢]"
1,辟支仏,"[辟, 支, 仏]","[辟支, 支仏, 辟, 支, 仏]","[辟支仏, 辟支, 支仏, 辟, 支, 仏]","[辟支仏, 辟支, 支仏, 辟, 支, 仏]"
2,転法輪,"[転, 法, 輪]","[転法, 法輪, 転, 法, 輪]","[転法輪, 転法, 法輪, 転, 法, 輪]","[転法輪, 転法, 法輪, 転, 法, 輪]"
3,三法印,"[三, 法, 印]","[三法, 法印, 三, 法, 印]","[三法印, 三法, 法印, 三, 法, 印]","[三法印, 三法, 法印, 三, 法, 印]"
4,四念処,"[四, 念, 処]","[四念, 念処, 四, 念, 処]","[四念処, 四念, 念処, 四, 念, 処]","[四念処, 四念, 念処, 四, 念, 処]"
...,...,...,...,...,...
195,不空羂索観音,"[不, 空, 羂, 索, 観, 音]","[不空, 空羂, 羂索, 索観, 観音, 不, 空, 羂, 索, 観, 音]","[不空羂, 空羂索, 羂索観, 索観音, 不空, 空羂, 羂索, 索観, 観音, 不, 空,...","[不空羂索, 空羂索観, 羂索観音, 不空羂, 空羂索, 羂索観, 索観音, 不空, 空羂,..."
196,金剛界曼荼羅,"[金, 剛, 界, 曼, 荼, 羅]","[金剛, 剛界, 界曼, 曼荼, 荼羅, 金, 剛, 界, 曼, 荼, 羅]","[金剛界, 剛界曼, 界曼荼, 曼荼羅, 金剛, 剛界, 界曼, 曼荼, 荼羅, 金, 剛,...","[金剛界曼, 剛界曼荼, 界曼荼羅, 金剛界, 剛界曼, 界曼荼, 曼荼羅, 金剛, 剛界,..."
197,胎蔵界曼荼羅,"[胎, 蔵, 界, 曼, 荼, 羅]","[胎蔵, 蔵界, 界曼, 曼荼, 荼羅, 胎, 蔵, 界, 曼, 荼, 羅]","[胎蔵界, 蔵界曼, 界曼荼, 曼荼羅, 胎蔵, 蔵界, 界曼, 曼荼, 荼羅, 胎, 蔵,...","[胎蔵界曼, 蔵界曼荼, 界曼荼羅, 胎蔵界, 蔵界曼, 界曼荼, 曼荼羅, 胎蔵, 蔵界,..."
198,伝燈大阿闍梨,"[伝, 燈, 大, 阿, 闍, 梨]","[伝燈, 燈大, 大阿, 阿闍, 闍梨, 伝, 燈, 大, 阿, 闍, 梨]","[伝燈大, 燈大阿, 大阿闍, 阿闍梨, 伝燈, 燈大, 大阿, 阿闍, 闍梨, 伝, 燈,...","[伝燈大阿, 燈大阿闍, 大阿闍梨, 伝燈大, 燈大阿, 大阿闍, 阿闍梨, 伝燈, 燈大,..."


## Skippy n-grams

In [71]:
## generate regular skippy n-grams
import re, unicodedata
for i, doc in enumerate(docs):
    print(f"Processing word {i} [use_Cython: {use_Cython}]: {doc}")
    ## Normalize to NFC before splitting, as the extended skippy cell does: with an empty
    ## `segmenter`, a decomposed accent would otherwise be split off as a segment of its own.
    ## NOTE(review): the regular n-gram cell uses NFKC; confirm which form is intended.
    normalized_doc = unicodedata.normalize('NFC', doc)
    word_segs = [ seg for seg in re.split(segmenter, normalized_doc) if len(seg) > 0 ]
    for j in range(1, max_n_for_ngram + 1):
        print(f"generating skippy {j}-grams ...")
        ## max_gap_size is handed over as `max_distance`
        ngrams = gen_ngrams.gen_skippy_ngrams(word_segs, j, max_distance = max_gap_size, sep = sep_local, as_list = generated_as_list, check = False)
        ## update df: skippy j-grams go to column 'sk<j>g'
        df.loc[i, f'sk{j}g'] = ngrams

Processing word 0 [use_Cython: False]: 阿羅漢
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
Processing word 1 [use_Cython: False]: 辟支仏
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
Processing word 2 [use_Cython: False]: 転法輪
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
Processing word 3 [use_Cython: False]: 三法印
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
Processing word 4 [use_Cython: False]: 四念処
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
Processing word 5 [use_Cython: False]: 四神足
generating skippy 1-grams ...
generating skippy 2-grams ...
generating skippy 3-grams ...
generating skippy 4-grams ...
Processing word 6 [use

In [72]:
## display the doc column together with the skippy n-gram columns
df[columns0 + columns2]

Unnamed: 0,doc,sk1g,sk2g,sk3g,sk4g
0,阿羅漢,"[阿, 羅, 漢, 羅, 漢, 漢]","[阿羅, 阿…漢, 羅漢, 羅漢]",[阿羅漢],[阿羅漢]
1,辟支仏,"[辟, 支, 仏, 支, 仏, 仏]","[辟支, 辟…仏, 支仏, 支仏]",[辟支仏],[辟支仏]
2,転法輪,"[転, 法, 輪, 法, 輪, 輪]","[転法, 転…輪, 法輪, 法輪]",[転法輪],[転法輪]
3,三法印,"[三, 法, 印, 法, 印, 印]","[三法, 三…印, 法印, 法印]",[三法印],[三法印]
4,四念処,"[四, 念, 処, 念, 処, 処]","[四念, 四…処, 念処, 念処]",[四念処],[四念処]
...,...,...,...,...,...
195,不空羂索観音,"[不, 空, 羂, 索, 空, 羂, 索, 観, 羂, 索, 観, 音, 索, 観, 音, ...","[不空, 不…羂, 不…索, 空羂, 空…索, 羂索, 空羂, 空…索, 空…観, 羂索, ...","[不空羂, 不空…索, 不…羂索, 空羂索, 空羂索, 空羂…観, 空…索観, 羂索観, 羂...","[不空羂索, 空羂索観, 羂索観音]"
196,金剛界曼荼羅,"[金, 剛, 界, 曼, 剛, 界, 曼, 荼, 界, 曼, 荼, 羅, 曼, 荼, 羅, ...","[金剛, 金…界, 金…曼, 剛界, 剛…曼, 界曼, 剛界, 剛…曼, 剛…荼, 界曼, ...","[金剛界, 金剛…曼, 金…界曼, 剛界曼, 剛界曼, 剛界…荼, 剛…曼荼, 界曼荼, 界...","[金剛界曼, 剛界曼荼, 界曼荼羅]"
197,胎蔵界曼荼羅,"[胎, 蔵, 界, 曼, 蔵, 界, 曼, 荼, 界, 曼, 荼, 羅, 曼, 荼, 羅, ...","[胎蔵, 胎…界, 胎…曼, 蔵界, 蔵…曼, 界曼, 蔵界, 蔵…曼, 蔵…荼, 界曼, ...","[胎蔵界, 胎蔵…曼, 胎…界曼, 蔵界曼, 蔵界曼, 蔵界…荼, 蔵…曼荼, 界曼荼, 界...","[胎蔵界曼, 蔵界曼荼, 界曼荼羅]"
198,伝燈大阿闍梨,"[伝, 燈, 大, 阿, 燈, 大, 阿, 闍, 大, 阿, 闍, 梨, 阿, 闍, 梨, ...","[伝燈, 伝…大, 伝…阿, 燈大, 燈…阿, 大阿, 燈大, 燈…阿, 燈…闍, 大阿, ...","[伝燈大, 伝燈…阿, 伝…大阿, 燈大阿, 燈大阿, 燈大…闍, 燈…阿闍, 大阿闍, 大...","[伝燈大阿, 燈大阿闍, 大阿闍梨]"


In [73]:
## make skippy n-grams inclusive: for n = 2 .. max_n_for_ngram, add to each doc's
## sk<n>g list the sk<n-1>g items it does not contain yet.
## Skipped when `ngram_is_inclusive` is False.
if ngram_is_inclusive:
    for n in range(2, max_n_for_ngram + 1):
        ## target
        target_name = f"sk{n}g"
        print(f"target_name: {target_name}")
        ## supplement
        supplement_name = f"sk{n-1}g"
        print(f"supplement_name: {supplement_name}")
        ## inclusion: the cells of df hold list objects, so extending them in place
        ## updates df itself. The additions are computed before the extend, so
        ## duplicates inside the supplement are carried over as they are.
        for T, S in zip(df[target_name], df[supplement_name]):
            T.extend([ s for s in S if s not in T ])

target_name: sk2g
supplement_name: sk1g
target_name: sk3g
supplement_name: sk2g
target_name: sk4g
supplement_name: sk3g


In [74]:
## display the skippy n-gram columns of df
df[columns2]

Unnamed: 0,sk1g,sk2g,sk3g,sk4g
0,"[阿, 羅, 漢, 羅, 漢, 漢]","[阿羅, 阿…漢, 羅漢, 羅漢, 阿, 羅, 漢, 羅, 漢, 漢]","[阿羅漢, 阿羅, 阿…漢, 羅漢, 羅漢, 阿, 羅, 漢, 羅, 漢, 漢]","[阿羅漢, 阿羅, 阿…漢, 羅漢, 羅漢, 阿, 羅, 漢, 羅, 漢, 漢]"
1,"[辟, 支, 仏, 支, 仏, 仏]","[辟支, 辟…仏, 支仏, 支仏, 辟, 支, 仏, 支, 仏, 仏]","[辟支仏, 辟支, 辟…仏, 支仏, 支仏, 辟, 支, 仏, 支, 仏, 仏]","[辟支仏, 辟支, 辟…仏, 支仏, 支仏, 辟, 支, 仏, 支, 仏, 仏]"
2,"[転, 法, 輪, 法, 輪, 輪]","[転法, 転…輪, 法輪, 法輪, 転, 法, 輪, 法, 輪, 輪]","[転法輪, 転法, 転…輪, 法輪, 法輪, 転, 法, 輪, 法, 輪, 輪]","[転法輪, 転法, 転…輪, 法輪, 法輪, 転, 法, 輪, 法, 輪, 輪]"
3,"[三, 法, 印, 法, 印, 印]","[三法, 三…印, 法印, 法印, 三, 法, 印, 法, 印, 印]","[三法印, 三法, 三…印, 法印, 法印, 三, 法, 印, 法, 印, 印]","[三法印, 三法, 三…印, 法印, 法印, 三, 法, 印, 法, 印, 印]"
4,"[四, 念, 処, 念, 処, 処]","[四念, 四…処, 念処, 念処, 四, 念, 処, 念, 処, 処]","[四念処, 四念, 四…処, 念処, 念処, 四, 念, 処, 念, 処, 処]","[四念処, 四念, 四…処, 念処, 念処, 四, 念, 処, 念, 処, 処]"
...,...,...,...,...
195,"[不, 空, 羂, 索, 空, 羂, 索, 観, 羂, 索, 観, 音, 索, 観, 音, ...","[不空, 不…羂, 不…索, 空羂, 空…索, 羂索, 空羂, 空…索, 空…観, 羂索, ...","[不空羂, 不空…索, 不…羂索, 空羂索, 空羂索, 空羂…観, 空…索観, 羂索観, 羂...","[不空羂索, 空羂索観, 羂索観音, 不空羂, 不空…索, 不…羂索, 空羂索, 空羂索, ..."
196,"[金, 剛, 界, 曼, 剛, 界, 曼, 荼, 界, 曼, 荼, 羅, 曼, 荼, 羅, ...","[金剛, 金…界, 金…曼, 剛界, 剛…曼, 界曼, 剛界, 剛…曼, 剛…荼, 界曼, ...","[金剛界, 金剛…曼, 金…界曼, 剛界曼, 剛界曼, 剛界…荼, 剛…曼荼, 界曼荼, 界...","[金剛界曼, 剛界曼荼, 界曼荼羅, 金剛界, 金剛…曼, 金…界曼, 剛界曼, 剛界曼, ..."
197,"[胎, 蔵, 界, 曼, 蔵, 界, 曼, 荼, 界, 曼, 荼, 羅, 曼, 荼, 羅, ...","[胎蔵, 胎…界, 胎…曼, 蔵界, 蔵…曼, 界曼, 蔵界, 蔵…曼, 蔵…荼, 界曼, ...","[胎蔵界, 胎蔵…曼, 胎…界曼, 蔵界曼, 蔵界曼, 蔵界…荼, 蔵…曼荼, 界曼荼, 界...","[胎蔵界曼, 蔵界曼荼, 界曼荼羅, 胎蔵界, 胎蔵…曼, 胎…界曼, 蔵界曼, 蔵界曼, ..."
198,"[伝, 燈, 大, 阿, 燈, 大, 阿, 闍, 大, 阿, 闍, 梨, 阿, 闍, 梨, ...","[伝燈, 伝…大, 伝…阿, 燈大, 燈…阿, 大阿, 燈大, 燈…阿, 燈…闍, 大阿, ...","[伝燈大, 伝燈…阿, 伝…大阿, 燈大阿, 燈大阿, 燈大…闍, 燈…阿闍, 大阿闍, 大...","[伝燈大阿, 燈大阿闍, 大阿闍梨, 伝燈大, 伝燈…阿, 伝…大阿, 燈大阿, 燈大阿, ..."


## Extended skippy n-grams

In [75]:
## generate extended skippy n-grams
import re
import unicodedata

for i, doc in enumerate(docs):
    print(f"Processing word {i} [use_Cython: {use_Cython}]: {doc}")
    ## Unicode normalization is necessary for proper handling of accents in languages like Irish and Welsh
    normalized_doc = unicodedata.normalize('NFC', doc)
    ## split on `segmenter` and discard empty segments
    word_segs = list(filter(None, re.split(segmenter, normalized_doc)))
    for j in range(1, max_n_for_ngram + 1):
        print(f"generating extended skippy {j}-grams ...")
        ## max_gap_size is handed over as `max_distance`
        ngrams = gen_ngrams.gen_extended_skippy_ngrams(word_segs, j, max_distance = max_gap_size, sep = sep_local, as_list = generated_as_list, check = False)
        ## update df: extended skippy j-grams go to column 'xsk<j>g'
        df.loc[i, f'xsk{j}g'] = ngrams

Processing word 0 [use_Cython: False]: 阿羅漢
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
Processing word 1 [use_Cython: False]: 辟支仏
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
Processing word 2 [use_Cython: False]: 転法輪
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
Processing word 3 [use_Cython: False]: 三法印
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
Processing word 4 [use_Cython: False]: 四念処
generating extended skippy 1-grams ...
generating extended skippy 2-grams ...
generating extended skippy 3-grams ...
generating extended skippy 4-grams ...
Proce

In [76]:
## display the extended skippy n-gram columns of df
df[columns3]

Unnamed: 0,xsk1g,xsk2g,xsk3g,xsk4g
0,"[阿…, …羅…, …漢, …羅…, …漢, …漢]","[阿羅…, 阿…漢, …羅漢, …羅漢]",[阿羅漢],[阿羅漢]
1,"[辟…, …支…, …仏, …支…, …仏, …仏]","[辟支…, 辟…仏, …支仏, …支仏]",[辟支仏],[辟支仏]
2,"[転…, …法…, …輪, …法…, …輪, …輪]","[転法…, 転…輪, …法輪, …法輪]",[転法輪],[転法輪]
3,"[三…, …法…, …印, …法…, …印, …印]","[三法…, 三…印, …法印, …法印]",[三法印],[三法印]
4,"[四…, …念…, …処, …念…, …処, …処]","[四念…, 四…処, …念処, …念処]",[四念処],[四念処]
...,...,...,...,...
195,"[不…, …空…, …羂…, …索…, …空…, …羂…, …索…, …観…, …羂…, …...","[不空…, 不…羂…, 不…索…, …空羂…, …空…索…, …羂索…, …空羂…, …空…...","[不…空羂…, 不…空…索…, 不…羂索…, …空…羂索…, …空…羂索…, …空…羂…観…...","[不…空…羂索…, …空…羂…索観…, …羂…索…観音]"
196,"[金…, …剛…, …界…, …曼…, …剛…, …界…, …曼…, …荼…, …界…, …...","[金剛…, 金…界…, 金…曼…, …剛界…, …剛…曼…, …界曼…, …剛界…, …剛…...","[金…剛界…, 金…剛…曼…, 金…界曼…, …剛…界曼…, …剛…界曼…, …剛…界…荼…...","[金…剛…界曼…, …剛…界…曼荼…, …界…曼…荼羅]"
197,"[胎…, …蔵…, …界…, …曼…, …蔵…, …界…, …曼…, …荼…, …界…, …...","[胎蔵…, 胎…界…, 胎…曼…, …蔵界…, …蔵…曼…, …界曼…, …蔵界…, …蔵…...","[胎…蔵界…, 胎…蔵…曼…, 胎…界曼…, …蔵…界曼…, …蔵…界曼…, …蔵…界…荼…...","[胎…蔵…界曼…, …蔵…界…曼荼…, …界…曼…荼羅]"
198,"[伝…, …燈…, …大…, …阿…, …燈…, …大…, …阿…, …闍…, …大…, …...","[伝燈…, 伝…大…, 伝…阿…, …燈大…, …燈…阿…, …大阿…, …燈大…, …燈…...","[伝…燈大…, 伝…燈…阿…, 伝…大阿…, …燈…大阿…, …燈…大阿…, …燈…大…闍…...","[伝…燈…大阿…, …燈…大…阿闍…, …大…阿…闍梨]"


In [77]:
## make extended skippy n-grams inclusive: for n = 2 .. max_n_for_ngram, add to each
## doc's xsk<n>g list the xsk<n-1>g items it does not contain yet.
## Skipped when `ngram_is_inclusive` is False.
if ngram_is_inclusive:
    for n in range(2, max_n_for_ngram + 1):
        ## target
        target_name = f"xsk{n}g"
        print(f"target_name: {target_name}")
        ## supplement
        supplement_name = f"xsk{n-1}g"
        print(f"supplement_name: {supplement_name}")
        ## inclusion: the cells of df hold list objects, so extending them in place
        ## updates df itself. The additions are computed before the extend, so
        ## duplicates inside the supplement are carried over as they are.
        for T, S in zip(df[target_name], df[supplement_name]):
            T.extend([ s for s in S if s not in T ])

target_name: xsk2g
supplement_name: xsk1g
target_name: xsk3g
supplement_name: xsk2g
target_name: xsk4g
supplement_name: xsk3g


In [78]:
## display the doc column together with the extended skippy n-gram columns
df[columns0 +columns3]

Unnamed: 0,doc,xsk1g,xsk2g,xsk3g,xsk4g
0,阿羅漢,"[阿…, …羅…, …漢, …羅…, …漢, …漢]","[阿羅…, 阿…漢, …羅漢, …羅漢, 阿…, …羅…, …漢, …羅…, …漢, …漢]","[阿羅漢, 阿羅…, 阿…漢, …羅漢, …羅漢, 阿…, …羅…, …漢, …羅…, …漢...","[阿羅漢, 阿羅…, 阿…漢, …羅漢, …羅漢, 阿…, …羅…, …漢, …羅…, …漢..."
1,辟支仏,"[辟…, …支…, …仏, …支…, …仏, …仏]","[辟支…, 辟…仏, …支仏, …支仏, 辟…, …支…, …仏, …支…, …仏, …仏]","[辟支仏, 辟支…, 辟…仏, …支仏, …支仏, 辟…, …支…, …仏, …支…, …仏...","[辟支仏, 辟支…, 辟…仏, …支仏, …支仏, 辟…, …支…, …仏, …支…, …仏..."
2,転法輪,"[転…, …法…, …輪, …法…, …輪, …輪]","[転法…, 転…輪, …法輪, …法輪, 転…, …法…, …輪, …法…, …輪, …輪]","[転法輪, 転法…, 転…輪, …法輪, …法輪, 転…, …法…, …輪, …法…, …輪...","[転法輪, 転法…, 転…輪, …法輪, …法輪, 転…, …法…, …輪, …法…, …輪..."
3,三法印,"[三…, …法…, …印, …法…, …印, …印]","[三法…, 三…印, …法印, …法印, 三…, …法…, …印, …法…, …印, …印]","[三法印, 三法…, 三…印, …法印, …法印, 三…, …法…, …印, …法…, …印...","[三法印, 三法…, 三…印, …法印, …法印, 三…, …法…, …印, …法…, …印..."
4,四念処,"[四…, …念…, …処, …念…, …処, …処]","[四念…, 四…処, …念処, …念処, 四…, …念…, …処, …念…, …処, …処]","[四念処, 四念…, 四…処, …念処, …念処, 四…, …念…, …処, …念…, …処...","[四念処, 四念…, 四…処, …念処, …念処, 四…, …念…, …処, …念…, …処..."
...,...,...,...,...,...
195,不空羂索観音,"[不…, …空…, …羂…, …索…, …空…, …羂…, …索…, …観…, …羂…, …...","[不空…, 不…羂…, 不…索…, …空羂…, …空…索…, …羂索…, …空羂…, …空…...","[不…空羂…, 不…空…索…, 不…羂索…, …空…羂索…, …空…羂索…, …空…羂…観…...","[不…空…羂索…, …空…羂…索観…, …羂…索…観音, 不…空羂…, 不…空…索…, 不…..."
196,金剛界曼荼羅,"[金…, …剛…, …界…, …曼…, …剛…, …界…, …曼…, …荼…, …界…, …...","[金剛…, 金…界…, 金…曼…, …剛界…, …剛…曼…, …界曼…, …剛界…, …剛…...","[金…剛界…, 金…剛…曼…, 金…界曼…, …剛…界曼…, …剛…界曼…, …剛…界…荼…...","[金…剛…界曼…, …剛…界…曼荼…, …界…曼…荼羅, 金…剛界…, 金…剛…曼…, 金…..."
197,胎蔵界曼荼羅,"[胎…, …蔵…, …界…, …曼…, …蔵…, …界…, …曼…, …荼…, …界…, …...","[胎蔵…, 胎…界…, 胎…曼…, …蔵界…, …蔵…曼…, …界曼…, …蔵界…, …蔵…...","[胎…蔵界…, 胎…蔵…曼…, 胎…界曼…, …蔵…界曼…, …蔵…界曼…, …蔵…界…荼…...","[胎…蔵…界曼…, …蔵…界…曼荼…, …界…曼…荼羅, 胎…蔵界…, 胎…蔵…曼…, 胎…..."
198,伝燈大阿闍梨,"[伝…, …燈…, …大…, …阿…, …燈…, …大…, …阿…, …闍…, …大…, …...","[伝燈…, 伝…大…, 伝…阿…, …燈大…, …燈…阿…, …大阿…, …燈大…, …燈…...","[伝…燈大…, 伝…燈…阿…, 伝…大阿…, …燈…大阿…, …燈…大阿…, …燈…大…闍…...","[伝…燈…大阿…, …燈…大…阿闍…, …大…阿…闍梨, 伝…燈大…, 伝…燈…阿…, 伝…..."


## Check differences

In [79]:
## For every doc and every order j, print the regular (norm), skippy (sk) and extended
## skippy (xsk) n-gram lists, followed by their overlaps (C1-C4) and differences (D1-D5).
## Membership is tested on the lists themselves, so repeated items are reported repeatedly.
for i, row in df.iterrows():
    doc = row['doc']
    print("----------------------")
    print(f"doc: {doc}")
    for j in range(1, max_n_for_ngram + 1):
        print(" ----------------------")
        print(f"{j}g")
        norm = list(row[f"{j}g"])
        print(f"norm: {norm}")
        sk = list(row[f"sk{j}g"])
        print(f"sk: {sk}")
        xsk = list(row[f"xsk{j}g"])
        print(f"xsk: {xsk}")

        ## (label, items) pairs, printed in this order
        reports = [
            ## Commonalities
            ## (C1 used to be labelled "norm, sk and sk"; the test is on sk AND xsk)
            ("C1: x in norm, sk and xsk", [ x for x in norm if x in sk and x in xsk ]),
            ("C2: x in norm and sk", [ x for x in norm if x in sk ]),
            ("C3: x in norm and xsk", [ x for x in norm if x in xsk ]),
            ("C4: x in sk and xsk, not in norm", [ x for x in sk if x in xsk and x not in norm ]),
            ## Differences
            ("D1: x in norm, not in sk and xsk", [ x for x in norm if x not in sk and x not in xsk ]),
            ("D2: x in sk, not in norm", [ x for x in sk if x not in norm ]),
            ("D3: x in xsk, not in norm", [ x for x in xsk if x not in norm ]),
            ("D4: x in sk, not in xsk", [ x for x in sk if x not in xsk ]),
            ("D5: x in xsk, not in sk", [ x for x in xsk if x not in sk ]),
        ]
        for label, items in reports:
            print(f"{label}: {items}")

----------------------
doc: 阿羅漢
 ----------------------
1g
norm: ['阿', '羅', '漢']
sk: ['阿', '羅', '漢', '羅', '漢', '漢']
xsk: ['阿…', '…羅…', '…漢', '…羅…', '…漢', '…漢']
C1: x in norm, sk and sk: []
C2: x in norm and sk: ['阿', '羅', '漢']
C3: x in norm and xsk: []
C4: x in sk and xsk, not in norm: []
D1: x in norm, not in sk and xsk: []
D2: x in sk, not in norm: []
D3: x in xsk, not in norm: ['阿…', '…羅…', '…漢', '…羅…', '…漢', '…漢']
D4: x in sk, not in xsk: ['阿', '羅', '漢', '羅', '漢', '漢']
D5: x in xsk, not in sk: ['阿…', '…羅…', '…漢', '…羅…', '…漢', '…漢']
 ----------------------
2g
norm: ['阿羅', '羅漢', '阿', '羅', '漢']
sk: ['阿羅', '阿…漢', '羅漢', '羅漢', '阿', '羅', '漢', '羅', '漢', '漢']
xsk: ['阿羅…', '阿…漢', '…羅漢', '…羅漢', '阿…', '…羅…', '…漢', '…羅…', '…漢', '…漢']
C1: x in norm, sk and sk: []
C2: x in norm and sk: ['阿羅', '羅漢', '阿', '羅', '漢']
C3: x in norm and xsk: []
C4: x in sk and xsk, not in norm: ['阿…漢']
D1: x in norm, not in sk and xsk: []
D2: x in sk, not in norm: ['阿…漢']
D3: x in xsk, not in norm: ['阿羅…', '阿…漢', '…羅漢'

# Saving results

In [80]:
## Write df to <save_dir>/gen_<source_name>-df.csv when saving is enabled
if save_results:
    from pathlib import Path
    file_name = f"{save_dir}/gen_{source_name}-df.csv"
    ## to_csv() does not create missing directories, so make sure the target exists
    Path(file_name).parent.mkdir(parents = True, exist_ok = True)
    df.to_csv(file_name, header = True)

# end of file