In [63]:
from string import maketrans
import collections
import copy
import random
import glob
import pickle

class TextGenerator(object):
    """Word-level text generator driven by follower counts.

    Attributes:
        text: flat list of tokens collected by LoadFile (lower-cased words,
            plus ',' and '.' as separate tokens).
        pairs: "w1 w2" string for every adjacent token pair in ``text``.
        stat_prime: collections.Counter of token frequencies.
        stat_after_one: dict mapping a token to {following token: count}.
        stat_after_pair: dict mapping "w1 w2" to {following token: count}.
    """

    # Characters that LoadFile replaces with a space before tokenising.
    _SPLIT_SYMBOLS = ':;!?#%^"&*()|/-_=+@[]{}<>' + chr(0)

    # (file name, attribute name) pairs used by SaveStatistic/LoadStatistic.
    _PICKLE_FILES = (
        ('text.pickle', 'text'),
        ('prime.pickle', 'stat_prime'),
        ('after_one.pickle', 'stat_after_one'),
        ('after_pair.pickle', 'stat_after_pair'),
    )

    def __init__(self):
        self.text = []
        self.pairs = []
        # Start with empty tables so Display/Choose* work on a fresh instance
        # instead of raising AttributeError before CountStatistic is called.
        self.stat_prime = collections.Counter()
        self.stat_after_one = {}
        self.stat_after_pair = {}

    def LoadFile(self, filename):
        """Tokenise ``filename`` and append its tokens to ``self.text``.

        Every character in ``_SPLIT_SYMBOLS`` becomes a space, the line is
        lower-cased, and ',' and '.' are padded with spaces so that they
        survive ``split()`` as tokens of their own.
        """
        make_table = getattr(str, 'maketrans', None)
        if make_table is None:
            # Python 2: str has no maketrans; the string module provides it.
            import string
            make_table = string.maketrans
        symbols = self._SPLIT_SYMBOLS
        # Derive the replacement from the symbol count rather than relying on
        # a hand-counted run of spaces of the same length.
        table = make_table(symbols, ' ' * len(symbols))
        with open(filename) as f:
            for line in f:
                cleaned = line.translate(table).lower()
                cleaned = cleaned.replace(',', ' , ').replace('.', ' . ')
                self.text.extend(cleaned.split())

    def CountStatistic(self):
        """Rebuild ``pairs`` and all count tables from ``self.text``.

        Everything is recomputed from scratch, so the method can be called
        again after more files were loaded. (Appending to the previous
        ``pairs`` list would duplicate entries and index past the end of
        ``text``.)
        """
        pairs = []
        after_one = {}
        after_pair = {}
        prev_word = None
        prev_pair = None
        for index, word in enumerate(self.text):
            if index >= 2:
                # prev_pair is text[index-2] + ' ' + text[index-1].
                followers = after_pair.setdefault(prev_pair, {})
                followers[word] = followers.get(word, 0) + 1
            if index >= 1:
                followers = after_one.setdefault(prev_word, {})
                followers[word] = followers.get(word, 0) + 1
                prev_pair = prev_word + ' ' + word
                pairs.append(prev_pair)
            prev_word = word
        self.pairs = pairs
        self.stat_prime = collections.Counter(self.text)
        self.stat_after_one = after_one
        self.stat_after_pair = after_pair

    @staticmethod
    def _pick_among_top(followers, n):
        """Return a random key among the ``n`` highest-count keys of ``followers``."""
        # Ascending sort by count; the tail holds the most frequent followers.
        ranked = sorted(followers, key=followers.get)[-n:]
        limit = min(len(ranked), n)
        return ranked[random.randint(0, limit - 1)]

    def ChooseFirstWord(self, n = 100):
        """Return a random token among the ``n`` most frequent tokens.

        Raises:
            ValueError: if no candidate exists (empty statistics or n <= 0).
        """
        candidates = self.stat_prime.most_common(n)
        if not candidates:
            raise ValueError('no candidate words: load a corpus and call '
                             'CountStatistic() first')
        return candidates[random.randint(0, len(candidates) - 1)][0]

    def ChooseSecondWord(self, word, n = 100):
        """Return a random follower of ``word`` among its ``n`` most frequent.

        Returns None when ``word`` has no recorded follower.
        """
        if word not in self.stat_after_one:
            return None
        return self._pick_among_top(self.stat_after_one[word], n)

    def ChooseNextWord(self, pair, n = 100):
        """Return a random follower of the "w1 w2" string ``pair``.

        The choice is made among the ``n`` most frequent followers; None is
        returned when ``pair`` has no recorded follower.
        """
        if pair not in self.stat_after_pair:
            return None
        return self._pick_among_top(self.stat_after_pair[pair], n)

    def SaveStatistic(self):
        """Pickle ``text`` and the three count tables into the working directory."""
        for filename, attribute in self._PICKLE_FILES:
            with open(filename, 'wb') as f:
                pickle.dump(getattr(self, attribute), f, pickle.HIGHEST_PROTOCOL)

    def LoadStatistic(self):
        """Restore ``text`` and the three count tables written by SaveStatistic.

        ``pairs`` is not stored; CountStatistic rebuilds it from ``text``.
        """
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file. Only load pickle files that this program wrote itself.
        for filename, attribute in self._PICKLE_FILES:
            with open(filename, 'rb') as f:
                setattr(self, attribute, pickle.load(f))

    def Display(self):
        """Print the token count and the sizes of the two follower tables."""
        # The call form prints the same single string under Python 2 and is a
        # regular function call under Python 3.
        print('Number of seen words {}'.format(len(self.text)))
        # "prime words" is the number of distinct tokens that have a follower.
        print('Number of prime words {}'.format(len(self.stat_after_one)))
        print('Number of pairs {}'.format(len(self.stat_after_pair)))

In [64]:
def GetCapitalIfFirst(word, first):
    """Return ``word`` with its first character upper-cased when ``first`` is true.

    The rest of the word is left untouched. An empty string is returned
    unchanged.
    """
    if not first:
        return word
    # Slice instead of indexing so that '' does not raise IndexError.
    return word[:1].upper() + word[1:]


def Generate(Generator, n, first_n=100, second_n=1, next_n=10):
    """Generate text by running ``n`` word-selection steps.

    Args:
        Generator: object providing ChooseFirstWord(n), ChooseSecondWord(word, n)
            and ChooseNextWord(pair, n).
        n: number of selection steps; each step emits one word, ',' or '.'.
        first_n: candidate pool size passed to ChooseFirstWord.
        second_n: candidate pool size passed to ChooseSecondWord.
        next_n: candidate pool size passed to ChooseNextWord.

    Returns:
        The generated text. Sentence-opening words are capitalised, ',' and
        '.' are attached to the preceding word, and a sentence is closed with
        '.' when the generator has no follower to offer (returns None).
    """
    pieces = []
    word = ''
    next_word = ''
    at_sentence_start = True
    at_second_word = True
    for _ in range(n):
        if at_sentence_start:
            # Redraw until the sentence opens with a real word.
            word = Generator.ChooseFirstWord(first_n)
            while word in (',', '.'):
                word = Generator.ChooseFirstWord(first_n)
            pieces.append(' ' + GetCapitalIfFirst(word, True))
            at_sentence_start = False
            at_second_word = True
            continue
        if at_second_word:
            # Only one word of context exists, so use the single-word table.
            next_word = Generator.ChooseSecondWord(word, second_n)
            at_second_word = False
        else:
            # Slide the two-word window forward by one token.
            pair = word + ' ' + next_word
            word = next_word
            next_word = Generator.ChooseNextWord(pair, next_n)
        if next_word is None or next_word == '.':
            at_sentence_start = True
            pieces.append('.')
        elif next_word == ',':
            pieces.append(',')
        else:
            pieces.append(' ' + next_word)
    # Every sentence-opening piece starts with a space; drop the leading one.
    return ''.join(pieces)[1:]

In [None]:
if __name__ == "__main__":
    # Build the model from every corpus directory and print a 500-step sample.
    s = TextGenerator()
    # The first three patterns are '*txt' (no dot); the last one is '*.txt'.
    files = (glob.glob('dickens/*txt') + glob.glob('asimov/*txt') +
             glob.glob('pratchett/*txt') +
             glob.glob('shakespeare_and_holmes/*.txt'))
    for f in files:
        s.LoadFile(f)
    s.CountStatistic()
    # The call form prints the same single string under Python 2 and is a
    # regular function call under Python 3.
    print(Generate(s, 500))

In [65]:
# Start the interactive session with a generator that holds no tokens yet.
s = TextGenerator()

In [66]:
# Collect corpus paths from the three author directories. The pattern is
# '*txt' (no dot), so any name ending in "txt" matches.
files = glob.glob('dickens/*txt') + glob.glob('asimov/*txt') + glob.glob('pratchett/*txt')

In [67]:
# Tokenise every collected file; tokens accumulate in s.text.
for f in files:
    s.LoadFile(f)

In [68]:
# Build the frequency and follower tables from the loaded tokens, then print
# the token count and table sizes.
s.CountStatistic()
s.Display()

Number of seen words 20126400
Number of prime words 153664
Number of pairs 2565729


In [None]:
# Pickle the tokens and the three count tables to text.pickle, prime.pickle,
# after_one.pickle and after_pair.pickle in the working directory.
s.SaveStatistic()

In [47]:
# Replace the tokens and count tables with the pickled ones, then print their
# sizes.
s.LoadStatistic()
s.Display()

Number of seen words 794
Number of prime words 34
Number of pairs 296


In [55]:
# Append the tokens of Ceres.txt to s.text. LoadFile only extends the token
# list; the count tables stay as they are until CountStatistic() runs again.
s.LoadFile('Ceres.txt')

26
26


In [70]:
# Print a 500-step sample. The call form prints the same single string under
# Python 2 and is a regular function call under Python 3.
print(Generate(s, 500))

That the whole thing to do. Them. That the two young children i hope so ' the young men in black velvet mask, of whom i love. ' said ralph impatiently. All the same place twice while i write. Know, said the gentleman. Have been to school with her. This is an extremely rare species totally extinct. Any of that and you, mr nickleby had never thought to his feet in hot regions like djelibeybi or in other ways. Did not look at me i had a chance of survival may be able to…sense her, she felt the wind had died when you first come when they have to, said avalon stiffly. Be a very long before you get a few times as bright a face, but it had never been on this, i have had it in. With a great number, the only one of my way. Now, the same, and, as if they can make you, sir ’ the girl, ' cried tom 'yes, my dear ' and with her, with her. As he was a long way. Back to their home. At the time of her life was so. Even if he had no choice in his eyes on him with a great variety of shapes and colors are