# Generator of (extended) (skippy) n-grams out of words or sentences

developed by Kow Kuroda (kow.kuroda@gmail.com)

This Jupyter notebook demonstrates how to use gen2_ngrams.py (or gen2_ngrams_cy.pyx) developed to enhance the usability of its predecessor "gen_ngrams.py".

There are two main differences from its predecessor. First, gen_skippy_ngrams(..) generates extended skippy n-grams with the "extended = True" option. Second, gen_skippy_ngrams(..) can generate inclusive n-grams, thereby dispensing with the incremental generation of n-grams from 1-grams.

Limitations
- Availability of the Cython enhancement is limited. Apple Silicon chips such as M1 and M2 (M3 is not tested yet) do not accept it, though it is available under Python 3.10 on M1.

Creation
- 2025/08/19

Modifications
- 2025/08/21 minor changes;
- 2025/08/22 i) minor changes; ii) Cython-enhancement was implemented;

# Set up Cython

In [None]:
#conda update -n base -c defaults conda -y

In [None]:
## Install Cython (if needed)
#!conda uninstall cython -y # seems necessary in certain situations
#!conda install cython -y
## Try the following if the above fails
#!pip install cython --upgrade --force-reinstall
#!conda update -n base -c defaults conda -y

In [None]:
#!pip show cython

In [None]:
## Switch: True imports the Cython build (gen2_ngrams_cy), False the pure-Python module
use_Cython: bool = False

In [None]:
## Set to True when the Cython extension needs to be (re)built
build_Cython_extension = False
# The build is attempted only when the Cython version is actually in use.
if use_Cython and build_Cython_extension:
    # IPython shell escape: runs setup.py with the `clean` and `build_ext --inplace` commands
    !python setup.py clean build_ext --inplace

In [None]:
## Cython 版の読込み
if use_Cython:
    try:
        %reload_ext Cython
    except ImportError:
        %load_ext Cython
    ## Apple Silicons like M1, M2 do not accept the following
    import gen2_ngrams_cy as gen_ngrams
else:
    try:
        import gen2_ngrams as gen_ngrams # gen_ngrams is now obsolete
    except NameError:
        import gen_ngrams

# Set up data

In [None]:
## Unit of analysis: True -> words, False -> sentential/phrasal objects
analyze_words = True

## parameters for analysis
## segmenter: regex handed to re.split (empty pattern for words, one space for phrases)
segmenter: str = r"" if analyze_words else r" "
## sep_local: value passed as `sep` to the n-gram generators
sep_local: str = "" if analyze_words else " "

In [None]:
## Locate the input file for the selected analysis mode
import pathlib
if analyze_words:
    data_dir, file_pattern = 'data/words', 'buddhist-listed2.txt'
else:
    data_dir, file_pattern = 'data/phrases', 'austen-j-sample100.txt'
files = list(pathlib.Path(data_dir).glob(file_pattern))
##
print(files)

## Fail with a readable message instead of a bare IndexError from files[0]
if not files:
    raise FileNotFoundError(f"no file matching {file_pattern!r} found in {data_dir!r}")

## The first match is the source; its stem names the output later on
file = files[0]
source_name = file.stem
print(f"source_name: {source_name}")

In [None]:
## read the source file line by line, drop empty lines, and lowercase the rest
docs = [
    line.lower()
    for line in file.read_text(encoding = 'utf-8').splitlines()
    if line
]
## preview of the first ten items
print(docs[:10])

# Generation of (extended) (skippy) n-grams

In [None]:
## output settings: target directory, and whether the results are written at all
save_dir: str = "saves"
save_results: bool = False

## verbosity flag: when True, the generation cells print each n-gram result
check: bool = False

In [None]:
### n-gram settings
## largest n to generate
max_n_for_ngram: int = 5

## values forwarded to the generators as `inclusive` and `extended`
ngram_is_inclusive = True
skippy_means_extended = True

## whether n-grams are generated as strings; the list form is its negation
generated_as_string: bool = True
generated_as_list: bool = not generated_as_string

In [None]:
#!conda install pandas -y

In [None]:
import pandas as pd

## column names: the source item, then one column per n for each n-gram type
ngram_orders = range(1, max_n_for_ngram + 1)
columns0 = ['doc']
columns1 = [ f"xsk{n}g" for n in ngram_orders ]  # filled by the extended skippy cell
columns2 = [ f"sk{n}g" for n in ngram_orders ]   # filled by the regular skippy cell
columns3 = [ f"{n}g" for n in ngram_orders ]     # filled by the non-skippy cell

## empty table that the generation cells fill row by row
used_columns = [*columns0, *columns1, *columns2, *columns3]
df = pd.DataFrame(columns = used_columns)

In [None]:
## fill the xsk{n}g columns with skippy n-grams (extended = skippy_means_extended)
import re
for i, doc in enumerate(docs):
    print(f"Processing word {i} [use_Cython: {use_Cython}]: {doc}")
    ## split the item with `segmenter` and discard empty pieces
    word_segs = list(filter(None, re.split(segmenter, doc)))
    for j in range(1, max_n_for_ngram + 1):
        print(f"generating extended skippy {j}-grams ...")
        ngrams = gen_ngrams.gen_skippy_ngrams(
            word_segs,
            j,
            extended = skippy_means_extended,
            inclusive = ngram_is_inclusive,
            sep = sep_local,
            as_list = generated_as_list,
            check = False,
        )
        if check:
            print(ngrams)
        ## store the result in row i of the column for this n
        df.loc[i, f'xsk{j}g'] = ngrams

In [None]:
## show the xsk{n}g columns of df
df[columns1]

In [None]:
## generate regular (non-extended) skippy n-grams and fill the sk{n}g columns
import re
for i, doc in enumerate(docs):
    print(f"Processing word {i} [use_Cython: {use_Cython}]: {doc}")
    ## split the item with `segmenter` and discard empty pieces
    word_segs = [ seg for seg in re.split(segmenter, doc) if len(seg) > 0 ]
    for j in range(1, max_n_for_ngram + 1):
        print(f"generating skippy {j}-grams ...")
        ## extended must be False here: passing skippy_means_extended would make
        ## this call identical to the one that fills the xsk{n}g columns.
        ngrams = gen_ngrams.gen_skippy_ngrams(
            word_segs,
            j,
            extended = False,
            inclusive = ngram_is_inclusive,
            sep = sep_local,
            as_list = generated_as_list,
            check = False,
        )
        if check:
            print(ngrams)
        ## update df
        df.loc[i, f'sk{j}g'] = ngrams

In [None]:
## show the sk{n}g columns of df
df[columns2]

In [None]:
## fill the 'doc' column and the {n}g columns with plain (non-skippy) n-grams
import re
for i, doc in enumerate(docs):
    ## record the source item itself in row i
    df.loc[i,'doc'] = doc
    ##
    print(f"Processing word {i} [use_Cython: {use_Cython}]: {doc}")
    ## split the item with `segmenter` and discard empty pieces
    word_segs = list(filter(None, re.split(segmenter, doc)))
    for j in range(1, max_n_for_ngram + 1):
        print(f"generating {j}-grams ...")
        ngrams = gen_ngrams.gen_ngrams(
            word_segs,
            j,
            inclusive = ngram_is_inclusive,
            sep = sep_local,
            as_list = generated_as_list,
            check = False,
        )
        if check:
            print(ngrams)
        ## store the result in row i of the column for this n
        df.loc[i, f'{j}g'] = ngrams

In [None]:
## show the {n}g columns of df
df[columns3]

## 結果の保存

In [None]:
## write the n-gram table to a CSV file under save_dir (only when save_results is True)
if save_results:
    import pathlib
    ## DataFrame.to_csv does not create missing directories, so make sure
    ## the target directory exists before writing.
    pathlib.Path(save_dir).mkdir(parents = True, exist_ok = True)
    file_name = f"{save_dir}/{source_name}-reg-sk-xsk-df.csv"
    df.to_csv(file_name, header = True)

# end of file