In [60]:
import nltk

In [59]:
from nltk.corpus import gutenberg
from nltk.corpus.reader import bnc
import nltk
import numpy as np
from tqdm import tqdm
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [61]:
bncreader = bnc.BNCCorpusReader(root='english/2554/download/Texts', fileids=r'[A-K]/\w*/\w*\.xml')
len(bncreader.fileids())

Read in language corpus and calculate word frequency statistics

In [111]:
def containsalpha(s):
    return any(c.isalpha() for c in s)

bigrams = []
words = []
first_words = []
nwords = 0
for f in tqdm(bncreader.fileids()[:1000]):
    for s in bncreader.sents(f):
        s_clean = [s[i].lower() for i in range(len(s)) if containsalpha(s[i])]
        try:
            for i in np.where(["'" in w for w in s_clean])[0]:
                if i<len(s_clean):
                    s_clean[i-1:i+1] = [s_clean[i-1] + s_clean[i]]
        except:
            print i, len(s_clean)
        if len(s_clean)>0:
            nwords += len(s_clean)
            words.extend(s_clean)
            first_words.append(s_clean[0])
            bigrams.extend(nltk.bigrams(s_clean))

100%|██████████| 1000/1000 [07:47<00:00,  1.62it/s]


In [112]:
len(bigrams), nwords, len(bncreader.fileids())

(23410476, 24794226, 4049)

In [113]:
first_word_frequencies = nltk.FreqDist(first_words)
first_word_frequencies.most_common(10)

[('the', 162102),
 ('it', 51398),
 ('i', 48312),
 ('in', 42650),
 ('he', 41169),
 ('but', 32926),
 ('this', 30527),
 ('a', 28790),
 ('there', 20713),
 ('they', 20298)]

In [114]:
word_frequencies = nltk.FreqDist(words)
conditional_word_frequencies = nltk.ConditionalFreqDist(bigrams)

In [115]:
import pickle
pickle.dump(word_frequencies,open('word_frequencies.pkl','w'))
pickle.dump(first_word_frequencies,open('first_word_frequencies.pkl','w'))
pickle.dump(conditional_word_frequencies,open('conditional_word_frequencies_full.pkl','w'))

In [91]:
import pickle
word_frequencies = pickle.load(open('word_frequencies.pkl'))
first_word_frequencies = pickle.load(open('first_word_frequencies.pkl'))
conditional_word_frequencies = pickle.load(open('conditional_word_frequencies.pkl'))

Method used in the web app for predictive text suggestions

In [4]:
def suggest(prev_word, current_letters, n_suggestions=4):
    suggestions = []
    
    if len(prev_word)==0:
        for c in first_word_frequencies.most_common():
            if c[0].startswith(current_letters):
                suggestions.append(c[0].title())
            if len(suggestions)==n_suggestions:
                break       
    else:
        if prev_word and len(prev_word)>0:
            common = conditional_word_frequencies[prev_word].most_common()
            for c in common:
                if c[0].startswith(current_letters):
                    suggestions.append(c[0])
                if len(suggestions)==n_suggestions:
                    break

    if len(suggestions)<n_suggestions:    
        for c in word_frequencies.most_common():
            if c[0].startswith(current_letters):
                suggestions.append(c[0])
            if len(suggestions)==n_suggestions:
                break
            
    if len(suggestions)<n_suggestions:
        suggestions.extend(['']*(n_suggestions-len(suggestions)))

    return suggestions

Trim down the bigram model, to only pairs of words observed 3 or more times (compresses file to around 10%)

In [3]:
import pickle
conditional_word_frequencies = pickle.load(open('conditional_word_frequencies_full.pkl'))

In [35]:
freqs = []
for k1 in conditional_word_frequencies.keys():
    for k2 in conditional_word_frequencies[k1].keys():
        if conditional_word_frequencies[k1][k2]<4:
            del(conditional_word_frequencies[k1][k2])
    if len(conditional_word_frequencies[k1])==0:
        del(conditional_word_frequencies[k1])

pickle.dump(conditional_word_frequencies,open('conditional_word_frequencies.pkl','w'))

Count in normal usage which words tend to be capitalised

In [85]:
def containsalpha(s):
    return any(c.isalpha() for c in s)

capitalised_count = {}

nwords = 0
for f in tqdm(bncreader.fileids()):
    for s in bncreader.sents(f):
        s_clean = [s[i] for i in range(len(s)) if containsalpha(s[i])]
        try:
            for i in np.where(["'" in w for w in s_clean])[0]:
                if i<len(s_clean):
                    s_clean[i-1:i+1] = [s_clean[i-1] + s_clean[i]]
        except:
            print i, len(s_clean)
        if len(s_clean)>1:
            for w in s_clean:
                if w.lower() not in capitalised_count:
                    capitalised_count[w.lower()] = [0,0]
                if w.istitle():
                    capitalised_count[w.lower()][1] += 1
                else:
                    capitalised_count[w.lower()][0] += 1

100%|██████████| 4049/4049 [43:36<00:00,  1.78it/s]


Create a list of words which should be capitalised.

In [105]:
capitalised_words = {}
for w in word_frequencies.keys():
    if w in capitalised_count and capitalised_count[w.lower()][1] > capitalised_count[w.lower()][0]:
                capitalised_words[w.lower()] = True
pickle.dump(capitalised_words,open('capitalised_words.pkl','w'))