# Sentence Generation Project

In [7]:
import numpy as np
from nltk.corpus import BracketParseCorpusReader
from nltk import *
import re


# Constants
PER_SEED = 30  # number of candidate sentences generated for each seed word
NUM_SEED = 10  # number of seed words sampled from the vocabulary


# Define the functions to be used

def get_next(distributions, token, order):
    """Sample the next word for a context, backing off to shorter contexts.

    Args:
        distributions: Sequence of conditional probability distributions in
            which index ``k - 1`` is conditioned on the previous ``k`` words
            (a plain word for ``k == 1``, a ``k``-tuple of words otherwise).
        token: The context to condition on, shaped to match ``order``.
        order: Number of context words contained in ``token``.

    Returns:
        A sample generated by the first distribution -- starting at
        ``order`` and moving towards shorter contexts -- that has at least
        one sample for the context.

    Raises:
        LookupError: If even the shortest context has no samples.
    """
    context = token
    dist = distributions[order - 1]

    while not list(dist[context].samples()):
        order -= 1
        if order < 1:
            # Previously this case wrapped around to distributions[-1] and
            # kept looping until an unrelated IndexError; fail explicitly.
            raise LookupError(
                "no distribution has samples for token {!r}".format(token)
            )
        dist = distributions[order - 1]
        # Back off by dropping the oldest context word; the lowest order is
        # keyed by a bare word rather than a 1-tuple.
        context = tuple(context[1:]) if order > 1 else context[-1]

    return dist[context].generate()


def _pick_likelier(distributions, context, order):
    """Draw two candidate next words and keep the more probable one.

    Both candidates come from ``get_next``; they are then scored under the
    ``order`` distribution for ``context``. Ties go to the second draw.
    """
    first = get_next(distributions, context, order)
    second = get_next(distributions, context, order)
    scorer = distributions[order - 1][context]
    return first if scorer.prob(first) > scorer.prob(second) else second


def make_ngram_sentence(distributions, seed, order=1):
    """Generate a sentence from ``seed`` with an n-gram model.

    The context is widened one word at a time (one word, then two, ...)
    until it holds ``order`` words; after that every new word is drawn with
    a full ``order``-word context. Generation stops once the most recent
    word is the empty string ``""``, but at least one word is always drawn
    after the warm-up, so the result has at least ``order + 2`` tokens.

    Args:
        distributions: Sequence whose index ``k - 1`` is the conditional
            distribution for a ``k``-word context.
        seed: First word of the sentence.
        order: Number of context words to use; 1, 2 or 3.

    Returns:
        The generated tokens as a list, starting with ``seed``.

    Raises:
        ValueError: If ``order`` is not 1, 2 or 3 (this used to return
            ``None`` silently after generating several words).
    """
    if order not in (1, 2, 3):
        raise ValueError("order must be 1, 2 or 3, got {!r}".format(order))

    sentence = [seed]

    def last_words(width):
        # Single-word contexts are bare strings; longer ones are tuples.
        return sentence[-1] if width == 1 else tuple(sentence[-width:])

    # Warm-up: there are not yet ``order`` words to condition on.
    for width in range(1, order + 1):
        sentence.append(_pick_likelier(distributions, last_words(width), width))

    # Steady state: always condition on the last ``order`` words.
    while sentence[-1] != "" or len(sentence) <= order + 1:
        sentence.append(_pick_likelier(distributions, last_words(order), order))

    return sentence

    
    

# Edit this one
def make_pos_sentence(dist, seed, is_tgram=False):
    """Generate a token sequence from ``seed`` until ``""`` is produced.

    NOTE(review): the previous body called ``get_next`` without its
    required ``order`` argument, so every call raised ``TypeError``. This
    version assumes ``dist`` is the sequence of distributions that
    ``get_next`` expects (index 0 keyed by one token, index 1 keyed by a
    pair of tokens) -- confirm against the intended caller.

    Args:
        dist: Distributions passed through to ``get_next``.
        seed: First token of the sequence.
        is_tgram: When true, condition each new token on the previous two
            tokens instead of only the previous one.

    Returns:
        The generated tokens as a list, starting with ``seed``.
    """
    sentence = [seed]

    if is_tgram:
        # A two-token context needs a second token first; draw it from the
        # single-token distribution.
        sentence.append(get_next(dist, sentence[0], 1))
        # Always look at the two most recent tokens. The old index was not
        # advanced after the bootstrap step, which reversed the first pair.
        while sentence[-1] != "":
            sentence.append(get_next(dist, (sentence[-2], sentence[-1]), 2))
    else:
        while sentence[-1] != "":
            sentence.append(get_next(dist, sentence[-1], 1))

    return sentence


def get_ngram_probability(distributions, ngram_input, sentence, order=1):
    """Return the length-normalised log-probability of ``sentence``.

    The first token is scored by its relative frequency in ``ngram_input``.
    Every later token is scored by the conditional distribution for the
    widest context available at that position, capped at ``order`` words:
    the second token uses a one-word context, the third a two-word context
    when ``order >= 2``, and so on.

    Args:
        distributions: Sequence whose index ``k - 1`` is the conditional
            distribution for a ``k``-word context (keyed by a bare word for
            ``k == 1`` and by a ``k``-tuple otherwise).
        ngram_input: Token list used for the first token's relative
            frequency.
        sentence: Non-empty list of tokens to score.
        order: Maximum number of context words to use.

    Returns:
        The sum of natural-log probabilities divided by ``len(sentence)``.
        A zero probability contributes ``-inf`` via ``np.log``.

    Raises:
        ValueError: If ``order`` is not between 1 and ``len(distributions)``
            (this used to return ``None`` silently).
    """
    if not 1 <= order <= len(distributions):
        raise ValueError(
            "order must be between 1 and {}, got {!r}".format(
                len(distributions), order
            )
        )

    log_prob = np.log(ngram_input.count(sentence[0]) / len(ngram_input))

    for position in range(1, len(sentence)):
        # Near the start of the sentence fewer than ``order`` words precede
        # the current one, so fall back to the widest context that exists.
        width = min(position, order)
        if width == 1:
            context = sentence[position - 1]
        else:
            context = tuple(sentence[position - width:position])
        log_prob += np.log(
            distributions[width - 1][context].prob(sentence[position])
        )

    return log_prob / len(sentence)

def get_pos_probability(dist, pos_input, tag_sentence):
    """Return the length-normalised log-probability of a tag sequence.

    The first tag is scored by its relative frequency in ``pos_input``;
    each later tag is scored by ``dist`` conditioned on the tag before it.

    Args:
        dist: Conditional distribution keyed by the previous tag.
        pos_input: List of tags used for the first tag's relative frequency.
        tag_sentence: Non-empty list of tags to score.

    Returns:
        The sum of natural-log probabilities divided by
        ``len(tag_sentence)``. A zero probability contributes ``-inf`` via
        ``np.log``.
    """
    # Count the whole first tag; indexing it again would only look at its
    # first character.
    log_prob = np.log(pos_input.count(tag_sentence[0]) / len(pos_input))

    # Score against tag_sentence itself rather than an unrelated global.
    for previous_tag, tag in zip(tag_sentence, tag_sentence[1:]):
        log_prob += np.log(dist[previous_tag].prob(tag))

    return log_prob / len(tag_sentence)



In [2]:
# Import and parse the corpus

corpus_root = './corpus_clean/'
corpus = BracketParseCorpusReader(corpus_root, ".*")

tagged_sentences = corpus.tagged_sents()
ngram_input = []
pos_input = []
legal_tags = ["EOS","$","#", "GW", "CC", "CD", "DT", "EX", "FW", "IN", "JJ","JJR","JJS","LS","MD",
             "NN","NNS","NNP",'NNPS','PDT','POS','PRP','PRP$','RB','RBR','RBS','RP','TO', "UH",'VB',
             'VBD',"VBG","VBN","VBP","VBZ","WDT","WP","WP$","WRB", "\"", "\'", ",", ".", "AFX"]

single_letter_words = ["a", "i", ",", ".", "!", "?", "\'", "\"", ":", ';', '0', '1', '2', "3", '4',
                       '5', "6", '7', '8', "9", "=", "&", "#", '/', '>', "$", '<', '+', '%',]

# tags_removed = ["-NONE-","SYM", "CODE", "ADD", "HYPH","-LSB-", "-RSB-",":", "NFP", "XX", "-LRB-", "-RRB-"]

# Sets give constant-time membership tests inside the per-token loop.
legal_tag_set = frozenset(legal_tags)
single_letter_set = frozenset(single_letter_words)

# Build parallel lists of lower-cased words and their tags. Tokens whose tag
# is not in legal_tags, and one-character tokens not in single_letter_words,
# are skipped. Each sentence is closed with an "" word and an "EOS" tag.
for sentence in tagged_sentences:
    for token in sentence:
        word = token[0].lower()
        tag = token[1]

        # "NP" is not in legal_tags; it is relabelled as "NNS" so these
        # tokens are kept.
        if tag == "NP":
            tag = "NNS"

        if tag not in legal_tag_set:
            continue

        if len(word) == 1 and word not in single_letter_set:
            continue

        # Words starting with the literal text "rsquo" get that prefix
        # replaced by an apostrophe.
        if word.startswith("rsquo"):
            word = "\'" + word[5:]

        ngram_input.append(word)
        pos_input.append(tag)

    ngram_input.append("")
    pos_input.append("EOS")

# Distinct tokens whose first character is a letter; the [0:1] slice keeps
# the empty end-of-sentence token from raising.
unique_alphas = [string for string in set(ngram_input) if string[0:1].isalpha()]

print("There are",len(ngram_input),"tokens in the corpus.")
print("There are",len(unique_alphas),"unique tokens that start with a letter.")


tag_set = set(pos_input)
print("There are",len(tag_set),"unique tags in the corpus.")




There are 543356 tokens in the corpus.
There are 25836 unique tokens that start with a letter.
There are 44 unique tags in the corpus.


In [3]:
# Sanity checks on the filtered data: print whether a URL token is present
# among the words and whether the "ADD" tag is present among the tags.
url_token_present = "http:$$$$demo.businesslayers.com" in ngram_input
add_tag_present = "ADD" in pos_input
print(url_token_present)
print(add_tag_present)


False
False


In [4]:
# Slide windows of 2, 3 and 4 tokens over the word list, and of 2 over the
# tag list.
bgram = list(ngrams(ngram_input, 2))
tgram = list(ngrams(ngram_input, 3))
fgram = list(ngrams(ngram_input, 4))

pos_bgram = list(ngrams(pos_input, 2))


# Conditional frequency distributions. Each is built from (condition, sample)
# pairs: the condition is everything but the last token of the n-gram, the
# sample is the last token.
cfd_b = ConditionalFreqDist(bgram)
cfd_t = ConditionalFreqDist(
    ((first, second), third) for first, second, third in tgram
)
cfd_f = ConditionalFreqDist(
    ((first, second, third), fourth) for first, second, third, fourth in fgram
)

cfd_pos = ConditionalFreqDist(pos_bgram)

# Tag -> word and word -> tag counts from the aligned tag/word lists.
cfd_t2w = ConditionalFreqDist(zip(pos_input, ngram_input))
cfd_w2t = ConditionalFreqDist(zip(ngram_input, pos_input))

# Turn the counts into maximum-likelihood conditional probabilities.
cpd_b = ConditionalProbDist(cfd_b, MLEProbDist)
cpd_t = ConditionalProbDist(cfd_t, MLEProbDist)
cpd_f = ConditionalProbDist(cfd_f, MLEProbDist)

cpd_pos = ConditionalProbDist(cfd_pos, MLEProbDist)
cpd_t2w = ConditionalProbDist(cfd_t2w, MLEProbDist)
cpd_w2t = ConditionalProbDist(cfd_w2t, MLEProbDist)


# Index k - 1 holds the distribution conditioned on k preceding words.
distributions = [cpd_b, cpd_t, cpd_f]



print("There are",len(bgram),"bigrams.")
print("There are",len(tgram),"trigrams.")
print("There are",len(fgram),"fourgrams.")



There are 543355 bigrams.
There are 543354 trigrams.
There are 543353 fourgrams.


In [8]:

# Draw the seed words at random from the alphabetic vocabulary.
seed_list = np.random.choice(unique_alphas,NUM_SEED)

print(seed_list,"\n\n")

# For every seed, generate PER_SEED candidate sentences with three words of
# context, score each one, and print the highest-scoring candidate.
for seed in seed_list:
    sentences = [
        make_ngram_sentence(distributions, seed, 3) for _ in range(PER_SEED)
    ]
    prob = [
        get_ngram_probability(distributions, ngram_input, candidate, 3)
        for candidate in sentences
    ]
    full_sentences = [" ".join(candidate) for candidate in sentences]

    best_index = np.argmax(prob)
    print(full_sentences[best_index],"\n",prob[best_index],"\n")


['celli' 'urgently' 'positivity' 'shriek' 'ferried' 'pantyhose' 'sadly'
 'coughed' 'systems' 'da-de-da-de-da-de-dat-dat-dah'] 


celli and basses  example 4 transitions from c♯ to f♯ , and adds pitch class to the pentatonic set of example 3 , outlines a pentatonic set and strongly presents c♯ as the tonal center for the section .  
 -0.494018898592 

urgently furnish this office with your detailed information to enable us process your payment are as follows ; 1 your full names 2 contact address 3 phone numbers 4 qualification educational level 5 age :  
 -0.455099273094 

positivity for ama .  
 -2.64110400021 

shriek pitifully , breath coming in shallow sobs .  
 -1.3205520001 

ferried home to houseboats , and the second greatest terrorist threat comes from several countries , mainly israel , who sometimes resort to terrorism themselves , claiming " self defense , " as some israeli neocons would try to say ? " first year florida coach ron zook said with a sly tone .  
 -0.4369424884

In [9]:
def get_next_tag(pos_dist, tag):
    """Generate a sample from the distribution ``pos_dist`` holds for ``tag``."""
    successor_dist = pos_dist[tag]
    return successor_dist.generate()

def get_next_word(t2w_dist, tag):
    """Generate a sample from the distribution ``t2w_dist`` holds for ``tag``."""
    word_dist = t2w_dist[tag]
    return word_dist.generate()


def make_pos_sentence(pos_dist, t2w_dist, w2t_dist, seed):
    """Generate a sentence by sampling a tag chain and a word for each tag.

    A tag is first sampled for ``seed`` from ``w2t_dist``. Then, repeatedly,
    the next tag is sampled from ``pos_dist`` given the latest tag and a
    word is sampled from ``t2w_dist`` for that new tag. Generation stops
    once the latest word is the empty string ``""``, but never before the
    sentence holds at least three tokens.

    Args:
        pos_dist: Conditional distribution of the next tag given a tag.
        t2w_dist: Conditional distribution of words given a tag.
        w2t_dist: Conditional distribution of tags given a word.
        seed: First word of the sentence.

    Returns:
        The generated words as a list, starting with ``seed``.
    """
    tags = [w2t_dist[seed].generate()]
    sentence = [seed]

    while sentence[-1] != "" or len(sentence) <= 2:
        next_tag = get_next_tag(pos_dist, tags[-1])
        tags.append(next_tag)
        # Emit the word from the tag just generated. Using the previous tag
        # here made every word lag its tag by one position.
        sentence.append(get_next_word(t2w_dist, next_tag))

    return sentence
    
# Demo: generate and print one tag-driven sentence seeded with "farming".
pos_sentence = make_pos_sentence(cpd_pos, cpd_t2w, cpd_w2t, "farming")
print(pos_sentence)


['farming', 'everyone', 'good', 'oneself', 'the', 'network', ';', 'and', 'one', 'tourists', 'with', 'my', 'regulations', 'of', 'forces', 'heard', 'up', 'for', 'confident', 'of', 'mirror', 'held', 'the', 'simple', 'victim', 'and', 'one', '18', 'options', ',', 'long', 'to', 'be', 'at', 'senator', '"', 'when', 'them', ',', 'according', 'myself', 'use', 'benefit', ',', 'building', 'exposition', 'into', 'your', 'lack', 'to', 'do', 'him', ',', 'you', "'re", 'up', ',', 'mg', 'in', 'this', 'netscape', 'but', ',', 'the', 'bars', 'including', 'over', 'the', 'time', 'engaged', 'that', 'the', 'has', 'ago', 'of', 'i', 'with', 'me', "'s", 'at', 'alongside', 'homes', 'as', 'the', 'laws', 'as', 'gen.', 'enron', 'new', 'ian', ',', 'indicadores', 'hall', 'chung', "'s", 'abandoning', ',', 'be', 'a', 'downplayed', 'facility', ',', 'the', 'farmer', 'of', 'their', 'respect', 'to', "n't", '.', '']
