In [1]:
from pathlib import Path
import json, math, collections, pickle
import pickle, pandas as pd
import matplotlib.pyplot as plt
import numpy as np

def find_root(markers=("config.json","corpus.json",".git")):
    p=Path.cwd()
    for c in (p,*p.parents):
        if any((c/m).exists() for m in markers): return c
    return Path.cwd()

ROOT=find_root()
CLEAN=ROOT/"data_clean"
OUT=ROOT/"outputs"
OUT.mkdir(exist_ok=True, parents=True)

cfg=json.loads((ROOT/"config.json").read_text(encoding="utf-8"))
N=cfg["ngram"]; langs=cfg["languages"]
print("N-gram:", N); print("langs:", langs)

N-gram: 3
langs: ['ilocano', 'kapampangan', 'maguindanao', 'ibanag', 'tausug', 'pangasinan', 'kankanaey', 'tagalog', 'cebuano', 'hiligaynon', 'bikol', 'maranao', 'waray', 'chavacano', 'spanish', 'english']


In [2]:
docs=[]
for l in langs:
    p=CLEAN/f"{l}.txt"
    if not p.exists() or p.stat().st_size==0:
        print(f"[skip] {l} missing in data_clean → dropping from analysis")
        continue
    docs.append((l, p.read_text(encoding="utf-8", errors="ignore")))
langs=[l for l,_ in docs]
texts=[t for _,t in docs]
D=len(texts); print("kept:", D, "languages")
assert D>=2, "Need ≥2 languages."

kept: 16 languages


### Step 2: Generate n-gram frequency vectors (Term Frequency - TF)

This section converts each text sample into a normalized vector of n-gram frequencies:

- The `ngrams()` function extracts all possible overlapping sequences of length `n` from each string.  
  For example, with `n=3`, the word `"hello"` becomes `["hel", "ell", "llo"]`.

- For every text in the dataset (`texts`), we:
  1. Count how many times each n-gram appears using `collections.Counter`.
  2. Store these counts in `counts`.

- Then, we apply **L1 normalization** (sum of all frequencies = 1) to obtain **term frequency (TF)** vectors.
  - This ensures that each vector represents relative frequencies instead of raw counts.
  - Helps compare texts of different lengths fairly.

The resulting list `tf` contains one normalized n-gram frequency vector per text.

In [3]:
def ngrams(s, n):
    L = len(s)
    return [s[i:i+n] for i in range(L-n+1)] if L >= n else []

counts = []
for t in texts:
    c = collections.Counter(ngrams(t, N))
    counts.append(c)

# L1-normalize each vector (sum to 1)
tf = []
for c in counts:
    total = sum(c.values()) or 1
    v = {g: ct/total for g, ct in c.items()}
    tf.append(v)

print("vectors built (TF):", len(tf))

vectors built (TF): 16


### Step 3: Show the top n-grams per language

This step builds a small summary table for each language:

- `k` controls how many of the most frequent character n-grams to keep for display.
- For each language (`name`) and its TF vector (`v`), we:
  1. Sort n-grams by their TF value in descending order and keep the top `k`.
  2. Create a DataFrame with columns for the n-gram and its TF.
  3. Add a rank column starting at 1 for readability.
  4. Replace spaces in the n-gram label with underscores (`ngram_`) to make plotting or saving easier.
- Each DataFrame is stored in `dfs[name]` so you can reuse or save it later.
- `display(...)` shows the styled table for quick inspection in the notebook.


In [4]:
k = 5  # top n-grams per language to show
dfs = {}  # keep a handle to each DataFrame if you want to reuse/save later

for name, v in zip(langs, tf):
    items = sorted(v.items(), key=lambda x: x[1], reverse=True)[:k]
    df = pd.DataFrame(items, columns=["ngram", "tf"])
    df.insert(0, "rank", range(1, len(df) + 1))
    df["ngram_"] = df["ngram"].str.replace(" ", "_")
    df = df[["ngram_", "tf"]]
    dfs[name] = df
    display(df.style.set_caption(f"{name} — top {k} char {N}-grams"))

Unnamed: 0,ngram_,tf
0,ti_,0.027571
1,iti,0.017235
2,_a_,0.012925
3,_ke,0.011797
4,_it,0.011325


Unnamed: 0,ngram_,tf
0,ng_,0.049965
1,ing,0.028887
2,ang,0.017432
3,an_,0.014161
4,_ka,0.013669


Unnamed: 0,ngram_,tf
0,_ka,0.021733
1,nu_,0.017049
2,an_,0.016567
3,_na,0.015002
4,na_,0.013881


Unnamed: 0,ngram_,tf
0,_ta,0.019584
1,_na,0.017117
2,nga,0.015714
3,a_n,0.015065
4,_ng,0.013881


Unnamed: 0,ngram_,tf
0,in_,0.0226
1,an_,0.016806
2,_ma,0.016583
3,_in,0.014132
4,iya,0.014068


Unnamed: 0,ngram_,tf
0,an_,0.022792
1,ay_,0.015485
2,_na,0.010155
3,_ma,0.009078
4,_sa,0.008588


Unnamed: 0,ngram_,tf
0,ay_,0.023566
1,_di,0.020024
2,in_,0.019662
3,_ay,0.0173
4,_si,0.015962


Unnamed: 0,ngram_,tf
0,ng_,0.041026
1,ang,0.025109
2,at_,0.017709
3,_at,0.01641
4,_na,0.014596


Unnamed: 0,ngram_,tf
0,ng_,0.022474
1,ang,0.019841
2,sa_,0.019597
3,_sa,0.018545
4,ga_,0.016752


Unnamed: 0,ngram_,tf
0,ng_,0.023142
1,ang,0.023003
2,_sa,0.021446
3,ga_,0.018224
4,_ka,0.013822


Unnamed: 0,ngram_,tf
0,an_,0.03013
1,_sa,0.017187
2,_na,0.016942
3,ng_,0.015263
4,_ka,0.013478


Unnamed: 0,ngram_,tf
0,an_,0.023703
1,iya,0.019376
2,_a_,0.018291
3,_ma,0.013597
4,na_,0.01207


Unnamed: 0,ngram_,tf
0,an_,0.036302
1,nga,0.020999
2,_ng,0.019384
3,_ha,0.018069
4,ga_,0.017858


Unnamed: 0,ngram_,tf
0,el_,0.015904
1,_el,0.013681
2,_co,0.012949
3,_ma,0.011791
4,ya_,0.010191


Unnamed: 0,ngram_,tf
0,_y_,0.017699
1,_de,0.015328
2,os_,0.014299
3,de_,0.012272
4,_la,0.006944


Unnamed: 0,ngram_,tf
0,_th,0.022139
1,the,0.021016
2,he_,0.016468
3,nd_,0.011058
4,and,0.010605


### Step 4: Compute cosine similarity and distance matrices

We build pairwise **cosine similarity** (`S`) and **cosine distance** (`Dist`) between the TF vectors:

- `dot(a, b)`: Efficient sparse dot product for two dict-vectors `{ngram: weight}`.
  - Swaps so the smaller dict is iterated to reduce lookups.
- `norm(a)`: Euclidean norm of a dict-vector with a tiny epsilon safeguard to avoid division by zero.

Procedure:
1. Initialize `S` as a `D × D` matrix (where `D = len(tf)`).
2. For each pair `(i, j)` with `j ≥ i`:
   - Compute `ni = ||tf[i]||` and `nj = ||tf[j]||`.
   - Cosine similarity: `s = dot(tf[i], tf[j]) / (ni * nj)`.
   - Fill both `S[i][j]` and `S[j][i]` (matrix is symmetric; diagonal is 1.0).
3. Build cosine **distance** matrix: `Dist[i][j] = 1 - S[i][j]`.

Notes:
- Cosine similarity ∈ [0, 1] for nonnegative TF vectors; higher means more similar.
- Cosine distance ∈ [0, 1]; lower means more similar.
- These matrices are ready for clustering, dendrograms, or nearest-neighbor queries.

In [5]:
def dot(a,b):
    if len(a)>len(b): a,b=b,a
    return sum(w*b.get(g,0.0) for g,w in a.items())
def norm(a): 
    return math.sqrt(sum(w*w for w in a.values())) or 1e-12

S=[[0.0]*D for _ in range(D)]
for i in range(D):
    ni=norm(tf[i])
    for j in range(i,D):
        nj=norm(tf[j])
        s=dot(tf[i], tf[j])/(ni*nj)
        S[i][j]=S[j][i]=s
Dist=[[1-S[i][j] for j in range(D)] for i in range(D)]
print("matrices ready.")

matrices ready.


### Step 5: Save similarity and distance matrices

This step serializes and saves the computed results for later use:

- Uses the `pickle` module to store Python objects in a binary file.
- The saved dictionary includes:
  - `langs`: list of language names (order matters for interpreting the matrices)
  - `S`: cosine similarity matrix
  - `Dist`: cosine distance matrix
- The file is saved as `similarity_distance.pkl` in the output directory (`OUT`).

Why this step matters:
- Pickle files allow fast reloading without recomputing the matrices.
- You can easily reuse them for clustering, dendrogram visualization, or additional analysis.
- To reload later:  
  ```python
  data = pickle.load(open(OUT / "similarity_distance.pkl", "rb"))
  langs, S, Dist = data["langs"], data["S"], data["Dist"]

In [6]:
pickle.dump({"langs":langs,"S":S,"Dist":Dist}, open(OUT/"similarity_distance.pkl","wb"))
print("Saved:", OUT/"similarity_distance.pkl")

Saved: d:\OneDrive\Documents\My Learning Resource\University Courses\DLSU\2025-26\T1\CSC715M\assignments\mc01\outputs\similarity_distance.pkl
