In [44]:
import collections
import random

In [1]:
def get_words_alpha(path='words_alpha.txt'):
    with open(path) as f:
        vocab = {w.strip():1 for w in f.read().split()}
    return vocab

def get_unigram_freq(path='unigram_freq.csv'):
    with open(path) as f:
        vocab = {}
        for l in f.read().split():
            w, c = l.split(',', 2)
            vocab[w] = c
    return vocab

In [3]:
    unigram_freq = get_unigram_freq()
    words_alpha = get_words_alpha()

    unigram_freq_s = set(unigram_freq)
    words_alpha_s = set(words_alpha)

    only_unigram_freq = unigram_freq_s.difference(words_alpha_s)
    only_words_alpha = words_alpha_s.difference(unigram_freq_s)
    common_words = unigram_freq_s.intersection(words_alpha_s)


In [14]:
def print_basic_stats():
    print(f'# of unigram_freq is {len(unigram_freq)}')
    print(f'# of words unique to unigram_freq is {len(only_unigram_freq)} ({len(only_unigram_freq)/len(unigram_freq):.02f}%)')
    print(f'# of words_alpha is {len(words_alpha)}')
    print(f'# of words unique to words_alpha is {len(only_words_alpha)} ({len(only_words_alpha)/len(words_alpha):.02f}%)')
    print(f'# of common words is {len(common_words)}')
print_basic_stats()


# of unigram_freq is 333333
# of words unique to unigram_freq is 237031 (0.71%)
# of words_alpha is 370103
# of words unique to words_alpha is 273801 (0.74%)
# of common words is 96302


In [38]:
def random_sample_and_show(L, n=10):
    samples = random.sample(L, n)
    for s in samples:
        print(s)
        
print("Random samples from only unigram_freq")
#random_sample_and_show(only_unigram_freq)
def random_sample_and_show_with_freq(L, D, n=10):
    samples = random.sample(L, n)
    maxlen = max(len(w) for w in samples)
    for s in samples:
        print(f'{s:{maxlen}s}:{D.get(s, 0)}')
random_sample_and_show_with_freq(only_unigram_freq, unigram_freq)
print("\nRandom samples from only words_alpha")
random_sample_and_show(only_words_alpha)
print("\nRandom samples from common subset")
random_sample_and_show(common_words)

Random samples from only unigram_freq
sirmans          :15448
liapunov         :18447
seiberg          :53445
ukt              :24621
giordana         :62234
adcalls          :91478
schulbuch        :22905
mxz              :42551
promotioncustomer:16022
kwang            :215409

Random samples from only words_alpha
clubwoman
curets
tripaschal
gappy
befouled
agriotypus
neuropsychical
purled
howffs
overloose

Random samples from common subset
firehouse
typifies
safeguard
myrica
grammatical
tuples
subshrub
damages
minimised
unicellular


## Looking at freq distribution
Going to take a look at what the frequency distribution is for the words in the common subset and see if we can have a slightly more useful weighting function

In [49]:
unigram_freq_common_subset = {k:v for k, v in unigram_freq.items() if k in common_words}
freqs = {v for k, v in unigram_freq_common_subset.items()}
def get_distribution(iterable):
    freq_counter = collections.Counter()
    for x in iterable:
        freq_counter[x] += 1
    return freq_counter
freq_distribution = get_distribution(freqs)

print(len(freq_distribution))
print(f'Min freq: {min(freqs)}')
print(f'Max freq: {max(freqs)}')
print(f'Max/Min ratio: {int(max(freqs))/int(min(freqs))}')

84846
Min freq: 100002
Max freq: 999899654
Max/Min ratio: 9998.796564068718


Next steps:
- [ ] Look at the words in a few different segments of the distribution
- [ ] See if there is a sane cutoff point / if there is useful filtering to be done
- [ ] See if there is useful rescaling I can do of the counts. Probably want to keep counts instead of weights b/c it allows re-scaling of subsets (ie of just the 5-letter words)