# Sandbox: QbE keyword lists

Herman Kamper, Stellenbosch University, 2018-2019.

## Preliminaries

In [11]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from collections import Counter
from os import path
import codecs
import matplotlib.pyplot as plt
import numpy as np
import random

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Keywords

In [2]:
# MFCC archive holding the dev word segments that serve as keyword candidates.
dev_keywords_fn = "../features/mfcc/HA/ha.dev.gt_words.npz"
dev_keywords_features = np.load(dev_keywords_fn)

# MFCC archive for the eval set.
test_fn = "../features/mfcc/HA/ha.eval.npz"
test_features = np.load(test_fn)

In [3]:
def read_forced_alignment(globalphone_fa_fn):
    """
    Return a dictionary of transcriptions obtained from a GlobalPhone forced alignment file.

    Every non-blank line is split on whitespace. Field 0 is taken as the
    utterance key and field 4 as the label, which is lowercased. Blank lines
    are skipped, and columns may be separated by any run of spaces or tabs.

    Parameters
    ----------
    globalphone_fa_fn : str
        Path to the alignment file, read as UTF-8.

    Returns
    -------
    dict
        Maps each utterance key to the list of its lowercased labels, in the
        order in which they appear in the file.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than five whitespace-separated fields.
    """
    transcription_dict = {}
    with codecs.open(globalphone_fa_fn, "r", "utf-8") as f:
        for line in f:
            # split() without an argument collapses repeated whitespace and
            # yields an empty list for blank lines, so stray empty lines or
            # tab-separated columns do not shift or break the field indices.
            fields = line.split()
            if not fields:
                continue
            utterance_key = fields[0]
            label = fields[4].lower()
            transcription_dict.setdefault(utterance_key, []).append(label)
    return transcription_dict

# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
test_alignment_fn = "/home/kamperh/endgame/datasets/globalphone_alignments/HA/eval.ctm"
test_transcription = read_forced_alignment(test_alignment_fn)

In [4]:
# Count how often each word occurs across all test utterances.
test_counter = Counter(
    label
    for utterance_labels in test_transcription.values()
    for label in utterance_labels
)

In [43]:
# Minimum number of occurrences in the test transcriptions for a word to be kept.
n = 9
# NOTE: despite its name, this set holds the words occurring *at least* n
# times (count >= n). The printed label states that threshold accurately.
# The result is an unordered set, so there is no need to sort by frequency.
more_than_n = {word for word, count in test_counter.items() if count >= n}
print("No. words occurring at least {} times: {}".format(n, len(more_than_n)))

No. words more than 9: 111


In [44]:
# Each segment key starts with the word label, followed by "_"; count the
# lowercased labels and collect the distinct dev words.
dev_counter = Counter(
    key.split("_")[0].lower() for key in dev_keywords_features
)
dev_words = set(dev_counter)

In [45]:
# Words that pass the test-set frequency threshold and also occur in dev.
overlap = more_than_n & dev_words
print("No. words overlap:", len(overlap))
overlap_alphabetical = sorted(overlap)
for shared_word in overlap_alphabetical:
    dev_count = dev_counter[shared_word]
    print("{}: {} times in dev".format(shared_word, dev_count))

No. words overlap: 33
aikin: 6 times in dev
amfani: 8 times in dev
amurka: 6 times in dev
arziki: 2 times in dev
babban: 2 times in dev
bayan: 6 times in dev
bayyana: 5 times in dev
birnin: 7 times in dev
cikin: 3 times in dev
daban: 1 times in dev
daular: 6 times in dev
domin: 2 times in dev
duniya: 8 times in dev
hankali: 7 times in dev
hanyar: 5 times in dev
harkokin: 12 times in dev
kasance: 6 times in dev
kasar: 14 times in dev
kasashe: 4 times in dev
kasashen: 13 times in dev
lokacin: 12 times in dev
majalisar: 8 times in dev
mutane: 18 times in dev
samun: 5 times in dev
sarki: 7 times in dev
sosai: 25 times in dev
tattalin: 4 times in dev
tsakanin: 11 times in dev
wajen: 2 times in dev
wanda: 1 times in dev
wannan: 5 times in dev
zaman: 1 times in dev
zamanin: 9 times in dev


In [48]:
# Number of keywords to draw from the overlapping words.
n_keywords = 30
# Sort before shuffling: the iteration order of a set of strings can differ
# between interpreter sessions (string hashing is randomised per process), so
# seeding the RNG alone does not give the same selection after a kernel
# restart. Starting from a sorted list makes the seeded shuffle deterministic.
keywords = sorted(overlap)
random.seed(1)
random.shuffle(keywords)
keywords = keywords[:n_keywords]

In [49]:
# Show the selected keywords alphabetically, followed by how many there are.
keywords_alphabetical = list(keywords)
keywords_alphabetical.sort()
print("Keywords:", keywords_alphabetical)
print("No. keywords:", len(keywords_alphabetical))

Keywords: ['amfani', 'amurka', 'arziki', 'babban', 'bayan', 'bayyana', 'birnin', 'daban', 'daular', 'domin', 'duniya', 'hankali', 'hanyar', 'harkokin', 'kasar', 'kasashe', 'kasashen', 'lokacin', 'majalisar', 'mutane', 'samun', 'sarki', 'sosai', 'tattalin', 'tsakanin', 'wajen', 'wanda', 'wannan', 'zaman', 'zamanin']
No. keywords: 30


In [50]:
# Write the selected keywords to disk, one per line, in alphabetical order.
keyword_lines = [keyword + "\n" for keyword in sorted(keywords)]
with codecs.open("keywords.txt", "w", "utf-8") as f:
    f.write("".join(keyword_lines))