In [None]:
import orjson
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties

In [None]:
import matplotlib.font_manager

# List every (name, variant) pair in matplotlib's font manager `ttflist`,
# sorted, one pair per line. Useful for checking which font names can be
# requested in the plots.
available_fonts = sorted(
    (entry.name, entry.variant)
    for entry in matplotlib.font_manager.fontManager.ttflist
)
for font_pair in available_fonts:
    print(*font_pair)

In [None]:
def count(filepath, field=None, value=None):
    """Count the records stored in a JSON file, optionally filtered by a field value.

    Args:
        filepath: Path to a JSON file whose top-level value is a sequence of
            records (dict-like objects).
        field: Name of the record key to filter on.
        value: Value ``field`` must equal for a record to be counted.

    Returns:
        The number of records whose ``field`` equals ``value`` when BOTH
        ``field`` and ``value`` are given; otherwise the total number of
        records. Records that lack ``field`` never match and are not counted.
    """
    with open(filepath, 'rb') as f:
        dataset = orjson.loads(f.read())

    # The filter is only active when both parts are supplied; a lone `field`
    # or `value` falls back to the unfiltered total.
    if field is None or value is None:
        return len(dataset)

    # `.get` keeps records without the field from raising KeyError, and the
    # generator avoids building a list just to measure it.
    return sum(1 for record in dataset if record.get(field) == value)

In [None]:
def count_words(filepath, field):
    """Count word occurrences in one text field across all records of a JSON file.

    Each field value is split on whitespace; every token is lower-cased and
    has all '.' and ',' characters removed (wherever they appear in the
    token). Tokens that are empty after this clean-up, such as a lone '.',
    are not counted.

    Args:
        filepath: Path to a JSON file whose top-level value is a sequence of
            records (dict-like objects).
        field: Name of the record key holding the text to tokenize.

    Returns:
        A plain ``dict`` mapping word -> occurrence count, ordered from most
        to least frequent (ties keep first-seen order). Records where the
        field is missing or ``None`` are skipped.
    """
    with open(filepath, 'rb') as f:
        dataset = orjson.loads(f.read())

    words = defaultdict(int)
    for record in dataset:
        text = record.get(field)
        if text is None:
            continue
        for token in text.split():
            word = token.lower().replace('.', '').replace(',', '')
            # Punctuation-only tokens collapse to '' and must not become a word.
            if word:
                words[word] += 1

    # sorted() is stable, so equally frequent words stay in first-seen order.
    return dict(sorted(words.items(), key=lambda item: item[1], reverse=True))

In [None]:
def count_unique_labels(filepath, field):
    """Count how often each distinct value of a field occurs in a JSON file.

    Args:
        filepath: Path to a JSON file whose top-level value is a sequence of
            records (dict-like objects).
        field: Name of the record key whose values are tallied. The values
            are used as dictionary keys, so they must be hashable.

    Returns:
        A ``defaultdict`` (with no default factory, so it behaves like a
        plain dict) mapping each value to its number of occurrences, in
        first-seen order. Records where the field is missing or ``None`` are
        skipped. ``len()`` of the result is the number of unique labels.
    """
    with open(filepath, 'rb') as f:
        dataset = orjson.loads(f.read())

    labels = defaultdict(int)
    for record in dataset:
        label = record.get(field)
        if label is None:
            continue
        labels[label] += 1

    # Drop the factory before handing the mapping out so that looking up an
    # unseen label raises KeyError instead of silently inserting a 0 entry.
    labels.default_factory = None
    return labels

In [None]:
# Plot palette: each entry is written as an 8-bit (R, G, B) triple and scaled
# to the 0-1 range, giving a list of 3-tuples of floats.
colors = [
    tuple(np.array(rgb) / 255.0)
    for rgb in (
        (23, 131, 232),
        (2, 38, 110),
        (185, 208, 241),
        (118, 205, 3),
        (162, 0, 0),
    )
]

In [None]:
# Total number of records in the dataset file.
total_records = count("data/ilid.json")
print(f"Number of records in the dataset {total_records:d}.")

In [None]:
# Number of records whose short label equals one specific label. The label is
# named once so the query and the message cannot drift apart.
target_label = "grid plate clamping system"
matching_records = count("data/ilid.json", "label_short", target_label)
print(f"Number of records {matching_records:d} for {target_label}.")

In [None]:
# Word statistics for the long labels: vocabulary size, total word count, and
# the count of one probe word.
words_dict = count_words("data/ilid.json", "label_long")
print(f"Number of unique words in the dataset {len(words_dict):d}.")
print(f"Number of words in the dataset {sum(words_dict.values()):d}.")
# .get reports 0 instead of raising KeyError when the probe word never occurs.
print(f"Number of word x in the dataset {words_dict.get('collet', 0):d}.")

In [None]:
# Report the number of distinct values for each descriptive field, in a
# fixed order.
for label_field in ("label_short", "label_long", "material", "material_finish", "description"):
    unique_labels = count_unique_labels("data/ilid.json", label_field)
    print(f"Number of unique labels in the dataset {len(unique_labels):d} under label {label_field}.")

In [None]:
# Bar chart of the most frequent words in the "material" field.
# count_words returns the words ordered from most to least frequent, so the
# first `top_n` items are the top words.
words_dict = count_words("data/ilid.json", "material")
top_n = 40
top_items = list(words_dict.items())[:top_n]
words = [word for word, _ in top_items]
counts = [occurrences for _, occurrences in top_items]

font = FontProperties(family='Arial')

fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(words, counts, color=colors[1], zorder=3)
ax.set_yscale('log')

# Faint dashed horizontal guide lines at fixed counts; their zorder (1) is
# below the bars' (3), so they are drawn underneath.
lines = [2000, 1000, 900, 800, 700, 600, 500, 400, 300, 200]
for line in lines:
    ax.axhline(y=line, color='lightgrey', linewidth=0.4, zorder=1, linestyle='--')

# NOTE(review): no axis labels are set on this figure, and the margins below
# (left=0.04) leave little room for a y-label - confirm this is intended.
plt.setp(ax.get_xticklabels(), rotation=60, ha="right", fontproperties=font)

fig.subplots_adjust(bottom=0.17, top=0.98, left=0.04, right=0.99)
fig.savefig('word_occurrences.png', dpi=300)