In [13]:
from collections import defaultdict
from IPython.display import display
import pickle

import numpy as np
import pandas as pd
from tqdm import tqdm

from wordfreq import zipf_frequency
import spacy

In [2]:
# Shared spaCy English pipeline; the filtering and lemma-grouping helpers below call it as `nlp(word)`.
nlp = spacy.load('en_core_web_sm')

In [3]:
def load_glove_embeddings(glove_path):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace: the first field is the
    token, the remaining fields are parsed as ``float32`` components.

    Args:
        glove_path: Path of the UTF-8 encoded embeddings file.

    Returns:
        dict mapping each token (str) to a 1-D ``np.float32`` array. If a
        token appears more than once, the last occurrence wins.

    Raises:
        ValueError: If a component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf-8') as f:
        for line in f:
            # split() with no argument already ignores leading/trailing whitespace.
            parts = line.split()
            if not parts:
                # Blank line (e.g. a stray newline at end of file): there is
                # no token to index, so skip it instead of raising IndexError.
                continue
            word, *components = parts
            embeddings[word] = np.array(components, dtype=np.float32)
    return embeddings

# Parse the GloVe text file into an in-memory {word: vector} dict.
glove = load_glove_embeddings('data/glove.6B.100d.txt')

In [4]:
# Vocabulary size before any filtering.
print('Number of keywords:', len(glove))

Number of keywords: 400000


In [5]:
def filter_glove_vocab(glove_dict):
    """Return the subset of ``glove_dict`` whose keys pass a spaCy-based filter.

    Each key is run through ``nlp`` on its own and only the first resulting
    token is inspected. An entry is kept when that token is alphabetic, is
    not a stop word, and is not tagged PROPN, DET or ADP.

    Args:
        glove_dict: Mapping of word -> embedding vector.

    Returns:
        A new dict holding the surviving ``word -> vector`` pairs, in the
        original iteration order. ``glove_dict`` itself is not modified.
    """
    excluded_pos = {"PROPN", "DET", "ADP"}
    kept = {}

    for word, vector in tqdm(glove_dict.items()):
        first_token = nlp(word)[0]

        # Checks run in the same order as a chain of early exits would:
        # alphabetic first, then stop-word status, then part of speech.
        keep = (
            first_token.is_alpha
            and not first_token.is_stop
            and first_token.pos_ not in excluded_pos
        )
        if keep:
            kept[word] = vector

    return kept

# Slow step: makes one spaCy call per vocabulary entry.
filtered_glove = filter_glove_vocab(glove)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400000/400000 [15:44<00:00, 423.71it/s]


In [6]:
# Vocabulary size after the spaCy-based filter.
print('Number of keywords:', len(filtered_glove))

Number of keywords: 211719


In [20]:
def save_to_pickle(item, name, directory='data'):
    """Pickle ``item`` to ``<directory>/<name>.pkl``.

    Args:
        item: Any picklable object.
        name: File stem; ``.pkl`` is appended.
        directory: Folder to write into. Defaults to ``'data'``, so existing
            two-argument calls write to the same location as before.

    Returns:
        None. An existing file at the target path is overwritten.
    """
    # Local import: pathlib is not among the notebook's top-level imports.
    from pathlib import Path

    target = Path(directory) / f'{name}.pkl'
    # Create the output folder on demand so a fresh checkout without it
    # does not fail with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'wb') as f:
        pickle.dump(item, f)

In [None]:
# save_to_pickle(filtered_glove, 'filtered_glove')

In [7]:
def group_and_average_by_lemma(glove_dict):
    """Collapse words sharing a lemma into one averaged embedding.

    Each key is passed to ``nlp`` on its own and the lemma of the first
    resulting token is used as the grouping key. All vectors that land in the
    same group are averaged element-wise.

    Args:
        glove_dict: Mapping of word -> embedding vector.

    Returns:
        dict mapping lemma -> mean vector, ordered by first appearance of
        each lemma.
    """
    vectors_by_lemma = {}
    for word, vector in tqdm(glove_dict.items()):
        lemma = nlp(word)[0].lemma_
        vectors_by_lemma.setdefault(lemma, []).append(vector)

    return {
        lemma: np.mean(vectors, axis=0)
        for lemma, vectors in vectors_by_lemma.items()
    }

# Merge filtered words that share a lemma into one averaged vector per lemma.
averaged_glove = group_and_average_by_lemma(filtered_glove)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 211719/211719 [07:59<00:00, 441.32it/s]


In [8]:
# Number of distinct lemma keys after grouping.
print('Number of keywords:', len(averaged_glove))

Number of keywords: 168791


In [None]:
# save_to_pickle(averaged_glove, 'averaged_glove')

In [9]:
def build_zipf_ranking(glove_dict, lang="en"):
    """Rank the keys of ``glove_dict`` by their Zipf frequency score.

    Args:
        glove_dict: Iterable of words (iterating a dict yields its keys).
        lang: Language code forwarded to ``zipf_frequency``.

    Returns:
        List of ``(word, score)`` tuples sorted by descending score. Words
        with equal scores keep their original relative order.
    """
    scored = [
        (word, zipf_frequency(word, lang, wordlist='best'))
        for word in glove_dict
    ]
    # Highest frequency first; sorted() is stable, so ties stay in input order.
    return sorted(scored, key=lambda pair: pair[1], reverse=True)

# (word, Zipf score) pairs for the lemma vocabulary, highest score first.
ranked_glove = build_zipf_ranking(averaged_glove)

In [None]:
# save_to_pickle(ranked_glove, 'ranked_glove')

In [10]:
def print_words_at_percentile(ranked_words, percentiles=(10, 25, 50, 75, 90), leeway=0):
    """Display a table of the words sitting at given positions of a ranking.

    For each percentile ``p`` the entry at position ``total * p / 100`` of
    ``ranked_words`` is looked up, together with up to ``leeway`` neighbours
    on either side. The row shows those words joined by ", " and the mean of
    their scores rounded to two decimals.

    Args:
        ranked_words: Sequence of ``(word, score)`` tuples, already ordered.
        percentiles: Iterable of percentile values (0-100).
        leeway: Number of neighbouring entries to include on each side.

    Returns:
        None. The table is rendered with ``display``.

    Note:
        ``display.max_colwidth`` is set to ``None`` through ``pd.set_option``
        and therefore stays changed for the rest of the session.
    """
    total = len(ranked_words)
    print_df = {'Percentile': [], 'Word(s)': [], 'Zipf': []}

    for p in percentiles:
        # Integer arithmetic: int(total * (p / 100)) can land one slot low
        # through float rounding (e.g. 100 * 0.29 == 28.999999999999996).
        index = int(total * p // 100)
        # Clamp to the last valid position so p == 100 selects the final
        # entry instead of an empty slice.
        index = max(0, min(index, total - 1))

        window = ranked_words[max(0, index - leeway):min(total, index + leeway + 1)]
        words = [entry[0] for entry in window]
        scores = [entry[1] for entry in window]
        # An empty ranking has nothing to average; report NaN without
        # triggering numpy's "mean of empty slice" warning.
        score = np.mean(scores) if scores else float('nan')

        print_df['Percentile'].append(f'{p}th')
        print_df['Word(s)'].append(', '.join(words))
        print_df['Zipf'].append(round(score, 2))

    # Show the full word list instead of truncating long cells.
    pd.set_option('display.max_colwidth', None)

    df = pd.DataFrame.from_dict(print_df)
    df = df.set_index('Percentile')

    display(df)

# Sample the ranking every 100 // n_percentiles percent (0th, 10th, ..., 90th),
# showing up to 3 neighbouring entries on each side of every position.
n_percentiles = 10
print_words_at_percentile(ranked_glove, percentiles=range(0, 100, 100 // n_percentiles), leeway=3)

Unnamed: 0_level_0,Word(s),Zipf
Percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
0th,"the, and, of, in",7.45
10th,"loon, incomparable, mush, swivel, livid, fryer, phosphorylation",2.99
20th,"groupthink, taurine, hsp, lek, lindt, mongoloid, ecp",2.33
30th,"brendel, briquette, knud, rawal, amortisation, crooke, shippen",1.92
40th,"foxwell, tardigrade, thermoluminescence, neurofibrillary, cowlick, healthfulness, husbandman",1.64
50th,"imagism, bohinj, snowblind, amylopectin, augier, waterdown, wymore",1.4
60th,"chernihiv, mérite, nunciature, emplace, muridae, mozarteum, nazmul",1.17
70th,"hombach, byggmark, soysal, sjoland, toujour, bacai, laridae",0.0
80th,"trbovlje, leatherstocke, donop, primor, guttorm, niedere, bushism",0.0
90th,"akbarābād, mindorashvili, bubenik, armary, blessig, gunasegaran, mandisi",0.0


In [None]:
# Keep only entries whose Zipf score is at least `k`, then sample the same percentiles again.
k = 4
ranked_glove_above_k = [entry for entry in ranked_glove if entry[1] >= k]

print_words_at_percentile(
    ranked_glove_above_k,
    percentiles=range(0, 100, 100 // n_percentiles),
    leeway=3,
)

Unnamed: 0_level_0,Word(s),Zipf
Percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
0th,"the, and, of, in",7.45
10th,"st, won, hold, industry, player, return, private",5.19
20th,"worst, cancer, direction, dance, master, wonder, trump",4.93
30th,"plane, mail, emergency, session, fuel, festival, zone",4.72
40th,"cheese, weekly, adam, describe, tournament, constitution, victim",4.56
50th,"assault, arrival, concrete, deeply, lieutenant, badly, fiction",4.44
60th,"billy, trap, wooden, di, greatly, uniform, transmission",4.33
70th,"substance, unexpected, aaron, legendary, syndrome, suggestion, recipe",4.23
80th,"cc, morris, collins, southeast, norway, belgium, bacon",4.15
90th,"burger, charming, curse, miserable, submission, monkey, span",4.07


In [26]:
def get_words(word_tuple, range=[4.0, 5.0]):
    """Predicate deciding whether a ``(word, zipf_score)`` pair is kept.

    Args:
        word_tuple: Indexable whose item 0 is the word and item 1 its score.
        range: Two-item ``[low, high]`` inclusive score bounds. The name
            shadows the builtin ``range`` inside this function only.

    Returns:
        True when the word has more than three characters and its score lies
        within ``[low, high]``; False otherwise.
    """
    token = word_tuple[0]
    score = word_tuple[1]

    # Short words are rejected before the score bounds are consulted.
    if len(token) <= 3:
        return False
    return range[0] <= score <= range[1]

# Apply the get_words predicate (default bounds) and inspect the surviving ranking.
filtered_ranked_glove = [entry for entry in ranked_glove if get_words(entry)]
print_words_at_percentile(
    filtered_ranked_glove,
    percentiles=range(0, 100, 100 // n_percentiles),
    leeway=3,
)

Unnamed: 0_level_0,Word(s),Zipf
Percentile,Unnamed: 1_level_1,Unnamed: 2_level_1
0th,"region, campaign, despite, nearly",5.0
10th,"pull, metal, fashion, faith, hill, profile, hearing",4.82
20th,"boston, strike, candidate, concern, sector, ireland, discuss",4.66
30th,"catholic, mate, circle, surely, amazon, manner, noise",4.55
40th,"jeff, covering, alan, luke, rome, islamic, producer",4.45
50th,"stretch, exit, luxury, tear, exception, preferred, knock",4.35
60th,"fiscal, venture, mutual, controversy, virtually, requirement, offense",4.26
70th,"taxi, glasgow, nerve, sheep, bacteria, sandy, emotion",4.19
80th,"tribe, adjacent, pete, pursuit, journalism, excessive, brad",4.12
90th,"chile, password, smartphone, adorable, pokemon, sierra, spectacular",4.06


In [29]:
# Peek at the two highest-ranked entries that survived the filter.
filtered_ranked_glove[0], filtered_ranked_glove[1] 

(('region', 5.0), ('campaign', 5.0))

In [27]:
# Number of (word, score) pairs remaining after the get_words filter.
len(filtered_ranked_glove)

3572