In [80]:
import json
import glob

# Collect every policies.json below the policies data directory.
# recursive=True is needed for "**" to span directory levels (without it "**"
# behaves like a plain "*"); sorted() makes the load order, and therefore the
# order of `policies`, reproducible across runs and machines.
paths = sorted(glob.glob("../../www/data/policies/**/policies.json", recursive=True))

# Flat list of policy records accumulated from every file found above.
policies = []
for json_path in paths:
  # Read as UTF-8 explicitly so non-ASCII text (e.g. French titles) does not
  # depend on the platform's default encoding.
  with open(json_path, encoding="utf-8") as f:
    # Each file is expected to hold a JSON array of policies -- TODO confirm.
    policies += json.load(f)


In [81]:
import pprint

def create_entry(prompt, policy):
  """Render a single policy as one line of text.

  Args:
    prompt: Text placed at the very start of the line, followed by a space.
    policy: Mapping that provides "topic", "party" and a "title" mapping
      containing an "EN" entry.

  Returns:
    The formatted string holding the prompt, topic, party and English title.

  Raises:
    KeyError: If any of the keys listed above is missing from `policy`.
  """
  template = '{prompt} TOPIC: {topic}, POLITICAL PARTY: {party}, POLICY DESCRIPTION: {policy_description}'
  fields = {
    "prompt": prompt,
    "topic": policy["topic"],
    "party": policy["party"],
    # Only the English title is used as the description.
    "policy_description": policy["title"]["EN"],
  }
  return template.format(**fields)

# Prefix prepended to every corpus entry (presumably the task prefix the
# embedding model expects for clustering -- confirm against the model docs).
prompt = "clustering:"

# One formatted text line per policy, in the same order as `policies`.
corpus = list(map(lambda entry: create_entry(prompt, entry), policies))

pprint.pp(corpus)

['clustering: TOPIC: economy, POLITICAL PARTY: Conservative, POLICY '
 "DESCRIPTION: Don't change TFSA contribution limit",
 'clustering: TOPIC: economy, POLITICAL PARTY: Liberal, POLICY DESCRIPTION: '
 'Reduce TFSA contribution limit',
 'clustering: TOPIC: economy, POLITICAL PARTY: NDP, POLICY DESCRIPTION: Reduce '
 'TFSA contribution limit',
 'clustering: TOPIC: child-care, POLITICAL PARTY: NDP, POLICY DESCRIPTION: '
 'National $15/day daycare',
 'clustering: TOPIC: child-care, POLITICAL PARTY: NDP, POLICY DESCRIPTION: '
 'Keep Universal Child Care Benefit',
 'clustering: TOPIC: child-care, POLITICAL PARTY: Conservative, POLICY '
 'DESCRIPTION: Universal Child Care Benefit',
 'clustering: TOPIC: child-care, POLITICAL PARTY: Liberal, POLICY DESCRIPTION: '
 'Canada Child Benefit',
 'clustering: TOPIC: health-and-safety, POLITICAL PARTY: NDP, POLICY '
 'DESCRIPTION: Repeal Bill C-51',
 'clustering: TOPIC: health-and-safety, POLITICAL PARTY: Conservative, POLICY '
 'DESCRIPTION: Keep Bil

In [82]:
import llama_cpp
import pprint
import numpy as np

import os

# Location of the GGUF embedding model. The NOMIC_EMBED_MODEL_PATH environment
# variable overrides it so the notebook can run on machines other than the
# original author's; the default keeps the previously hard-coded path.
model_path = os.environ.get(
  "NOMIC_EMBED_MODEL_PATH",
  "/Users/jahfer/src/models/nomic-embed-text-v1.5.f32.gguf",
)

# Embedding model handle used to embed the corpus.
bert = llama_cpp.Llama(
  model_path=model_path,
  embedding=True,   # run the model in embedding mode
  n_gpu_layers=0,   # no layers offloaded to the GPU
  n_ctx=8192,       # context window requested for the session
  n_batch=8192,     # batch size set to match the context window
  # YaRN RoPE scaling with a 0.75 frequency scale; presumably the context
  # extension settings recommended for this model -- TODO confirm against
  # the model card.
  rope_scaling_type=llama_cpp.LLAMA_ROPE_SCALING_TYPE_YARN,
  rope_freq_scale=0.75,
)

# Embed the corpus in small batches. Plain list slicing is used instead of
# np.array_split(corpus, len(corpus) // 5): that call raises ValueError when
# the corpus has fewer than 5 entries (it asks for zero sections), whereas
# slicing handles any corpus length, including an empty one.
EMBED_BATCH_SIZE = 5
embed_chunks = [
  corpus[start:start + EMBED_BATCH_SIZE]
  for start in range(0, len(corpus), EMBED_BATCH_SIZE)
]

# One embedding vector per corpus entry; assumes the response's 'data' items
# come back in input order -- TODO confirm.
embeddings = []
for chunk in embed_chunks:
  response = bert.create_embedding(chunk)
  embeddings += [e['embedding'] for e in response['data']]

llama_model_loader: loaded meta data with 22 key-value pairs and 112 tensors from /Users/jahfer/src/models/nomic-embed-text-v1.5.f32.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = nomic-bert
llama_model_loader: - kv   1:                               general.name str              = nomic-embed-text-v1.5
llama_model_loader: - kv   2:                     nomic-bert.block_count u32              = 12
llama_model_loader: - kv   3:                  nomic-bert.context_length u32              = 2048
llama_model_loader: - kv   4:                nomic-bert.embedding_length u32              = 768
llama_model_loader: - kv   5:             nomic-bert.feed_forward_length u32              = 3072
llama_model_loader: - kv   6:            nomic-bert.attention.head_count u32              = 12
llama_model_loader: - kv   7:    nomic-b

In [101]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity window for a policy to count as "related" to another one:
# scores above the upper bound are treated as the policy itself (or an exact
# duplicate), scores below the lower bound as unrelated.
MIN_SIMILARITY = 0.9
MAX_SIMILARITY = 0.999

# Each policy paired with its embedding vector (zip stops at the shorter list).
pairs = list(zip(policies, embeddings))
vectors = [vec for _, vec in pairs]

# For every policy: [policy, related policies sorted by descending similarity].
top_k = []
for policy, vec in pairs:
  # A single call scores this vector against every vector at once instead of
  # issuing one cosine_similarity call per pair.
  scores = cosine_similarity([vec], vectors)[0]
  # The original used `next` here, which is a no-op expression in Python, so
  # nothing was ever filtered out; the window is now actually enforced.
  neighbours = [
    (other_policy, score)
    for (other_policy, _), score in zip(pairs, scores)
    if MIN_SIMILARITY <= score <= MAX_SIMILARITY
  ]
  neighbours.sort(key=lambda item: item[1], reverse=True)
  top_k.append([policy, [other_policy for other_policy, _ in neighbours]])

# Show only the first few results to keep the cell output readable.
pprint.pp(top_k[:5])

[[{'topic': 'economy',
   'year': 2015,
   'party': 'Conservative',
   'title': {'EN': "Don't change TFSA contribution limit",
             'FR': 'Ne pas modifier la limite de contribution au CELI'},
   'references': [{'date': '2015-09-11',
                   'title': 'Trudeau, Mulcair pledge to pull back TFSA '
                            'contribution limit ',
                   'publisher': 'The Globe and Mail',
                   'url': 'http://www.theglobeandmail.com/globe-investor/personal-finance/household-finances/tfsas-surface-as-election-issue-as-opposition-vows-to-reverse-increased-limit/article26316436/'},
                  {'date': '2015-04-21',
                   'title': 'TFSA limit hiked to $10,000 as election budget '
                            'delivers few goodies',
                   'publisher': 'CBC',
                   'url': 'http://www.cbc.ca/news/business/budget-2015-tfsa-limit-hiked-to-10-000-as-election-budget-delivers-few-goodies-1.3040853'},
             