In [None]:
import numpy as np
import pandas as pd
import gensim
import gensim.downloader
import typing
import pprint

In [None]:
# Load the example titles; the CSV must provide a "Title" column.
df = pd.read_csv("example_titles.csv")
print(df)

# Lower-case every title. Missing values (NaN) become empty strings and
# non-string values are converted with str(), so a blank or numeric cell no
# longer raises AttributeError. No row is dropped: `titles` must stay aligned
# row-for-row with `df`, because later cells write per-title scores back into it.
titles = [("" if pd.isna(title) else str(title)).lower() for title in df["Title"]]

print(titles)

In [None]:
# Category name -> comparison words. A title is scored against a category by
# the word-vector similarity between its words and these comparison words.
categories = {
    "engine": ["engine"],
    "pharma": ["pharmaceutical", "medical"],
    "electronic": ["electronic"],
    "military": ["military"],
    "software": ["software"],
}
print(categories)

In [None]:
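# One-time setup, kept commented out: list the models gensim can download,
# then fetch the pretrained Google News word2vec vectors. The cell that
# follows loads vectors from a local file instead (presumably a saved copy
# of this model -- confirm).
# print(list(gensim.downloader.info()['models'].keys()))
# w2v = gensim.downloader.load('word2vec-google-news-300')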


In [None]:
# Path (relative to the working directory) of the locally saved word vectors.
temporary_filepath = "w2v_model.d2v"
# NOTE(review): gensim's save/load format relies on pickle, so only load
# files that come from a trusted source.
w2v = gensim.models.KeyedVectors.load(temporary_filepath)
print(w2v)
# Quick sanity check of the loaded vectors (kept disabled):
# w2v.most_similar('twitter')

In [None]:
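# One-time step, kept commented out: write the in-memory vectors to
# `temporary_filepath` so that later sessions can load them from disk.
# w2v.save(temporary_filepath)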

In [None]:
def check_categories():
    """Report every comparison word in `categories` that `w2v` does not contain.

    Prints one error line per missing word; returns nothing and raises nothing.
    """
    missing_words = (
        word
        for words in categories.values()
        for word in words
        if word not in w2v
    )
    for word in missing_words:
        print("Error, comp not in word dictionary: ", word)

check_categories()

In [None]:
def calc_sim(title: str, comps: typing.List[str], n_avg: int, vectors=None) -> float:
    """Score how strongly a title relates to a set of comparison words.

    Every whitespace-separated word of `title` that the vector model contains
    gets the mean of its similarities to all words in `comps`. The result is
    the mean of the `n_avg` highest per-word scores, or of all of them when
    fewer than `n_avg` words are known.

    Args:
        title: Text to score.
        comps: Comparison words to measure each title word against.
        n_avg: How many of the best per-word scores to average; must be >= 1.
        vectors: Object supporting `word in vectors` and
            `vectors.similarity(a, b)`. Defaults to the notebook-level `w2v`.

    Returns:
        The averaged score as a float, or 0.0 when no word of `title` is
        contained in the vector model.

    Raises:
        ValueError: If `n_avg` is smaller than 1 or `comps` is empty.
    """
    # A slice of [-0:] would select *every* score, so reject n_avg < 1 up front
    # instead of silently averaging over the whole title.
    if n_avg < 1:
        raise ValueError("n_avg must be at least 1")
    if not comps:
        raise ValueError("comps must contain at least one word")
    if vectors is None:
        vectors = w2v

    # split() without an argument also copes with tabs and repeated spaces.
    word_scores = [
        sum(vectors.similarity(word, comp) for comp in comps) / len(comps)
        for word in title.split()
        if word in vectors
    ]
    if not word_scores:
        # No usable word: report "no similarity" rather than the NaN (and
        # RuntimeWarning) that the mean of an empty array would produce.
        return 0.0

    # Ascending sort, so the best scores sit at the end; the slice simply
    # returns everything when fewer than n_avg scores exist.
    best_scores = np.sort(np.array(word_scores))[-n_avg:]
    return float(best_scores.mean())

In [None]:
def calc_category_sims(
    title: str,
    categories: typing.Dict[str, typing.List[str]],
    n_avg: int = 2,
) -> np.ndarray:
    """Score one title against every category.

    Args:
        title: Text to score.
        categories: Mapping of category name to its list of comparison words.
        n_avg: Number of best per-word scores `calc_sim` averages for each
            category. Defaults to 2.

    Returns:
        1-D array with one `calc_sim` score per category, in the iteration
        order of `categories`.
    """
    return np.array(
        [calc_sim(title, comps, n_avg) for comps in categories.values()]
    )

In [None]:
def calc_all_sims():
    """Build the title-by-category similarity matrix.

    Reads the notebook-level `titles` and `categories`. Row i holds the scores
    that `calc_category_sims` returns for `titles[i]`, one column per category.

    Returns:
        Array of shape (len(titles), len(categories)).
    """
    scores = np.zeros((len(titles), len(categories)))
    for row, title in enumerate(titles):
        scores[row, :] = calc_category_sims(title, categories)

    return scores

In [None]:
# Score every title against every category, then mark with 1 each
# (title, category) pair whose score is strictly above the cut-off, else 0.
threshold = 0.2
sim_table = calc_all_sims()
bin_sim_table = (sim_table > threshold).astype(int)
# print(bin_sim_table)


In [None]:
def categorize():
    """Group the titles by the categories flagged in `bin_sim_table`.

    Reads the notebook-level `categories`, `titles` and `bin_sim_table`.
    A title is listed under every category whose entry in its row is truthy;
    a title whose row contains no 1 is listed under "other" only.

    Returns:
        Dict mapping each category name, plus "other", to a list of titles.
    """
    names = list(categories.keys())
    buckets = {name: [] for name in names}
    buckets["other"] = []

    for row, title in enumerate(titles):
        flags = bin_sim_table[row, :]
        if 1 in flags:
            for col, name in enumerate(names):
                if flags[col]:
                    buckets[name].append(title)
        else:
            buckets["other"].append(title)

    return buckets


In [None]:
# Group the titles by category and pretty-print the resulting dict.
categorization = categorize()
pprint.pprint(categorization)

In [None]:
# Add one similarity-score column per category to the titles frame, then
# export the enriched frame.
cats = list(categories.keys())
for cat_idx, cat in enumerate(cats):
    df[cat] = sim_table[:, cat_idx]

print(df)
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if that column is not wanted.
df.to_csv("example_titles_sorted.csv")
