In [None]:
import csv
import math
import re
import sqlite3
import time
import unicodedata
from collections import Counter, defaultdict
from random import choice, randint, shuffle

import numpy as np


In [None]:
# Open the SQLite dump and issue the query for (subreddit, body) pairs.
# NOTE: execute() hands back a cursor; rows are fetched when a later cell
# iterates over `the_data`, so the connection is left open here.
print('Querying DB...\n')
sql_conn = sqlite3.connect("database.sqlite")
the_data = sql_conn.execute("SELECT subreddit, body FROM May2015 limit 500000")
print('Done querying DB...\n')

In [None]:
print('Building Corpora...\n')
# subreddit name -> list of comment bodies posted there
corpus_dict = defaultdict(list)

for post in the_data:
    # post[0] is the subreddit column, post[1] the body column of the query
    corpus_dict[post[0]].append(post[1])
print('Done building Corpora...\n')

In [None]:
#for k in sorted(corpus_dict, key=lambda k: len(corpus_dict[k]), reverse=True):
#    print (k, len(corpus_dict[k]))

In [None]:
#subreds = ['AskReddit', 'news','csgobetting', 'nfl', 'nba', 'hockey', 'chicagobulls']
# Subreddits whose comments form the working corpus for every later cell.
subreds = ['nfl']

# One list of comments per selected subreddit ...
temp = [corpus_dict[a] for a in subreds]
# ... flattened into a single list of comment strings.
corpus = [item for sublist in temp for item in sublist]


In [None]:

# decode the HTML entities &gt; / &lt; (newline and tab stripping is currently disabled)
# drop every comment whose exact text appears more than once, e.g. moderator boilerplate:
#    '*Please refer to our [detailed rules and posting guidelines.]'
#    'feel free to make a new post that fits within the rules'
# remove links? 
def text_clean(inputlist):
    """Drop repeated comments and split the remaining ones into tokens.

    Any comment whose exact text occurs more than once in ``inputlist`` is
    discarded entirely (repeated text is treated as boilerplate). Each
    surviving comment has the HTML entities ``&gt;`` and ``&lt;`` decoded and
    is split on single spaces; empty tokens produced by consecutive spaces
    are dropped. Newlines and tabs are not stripped, so they remain inside
    tokens.

    Args:
        inputlist: list of comment strings.

    Returns:
        A list of token lists, one per surviving comment, in the order the
        comments appear in ``inputlist``.
    """
    occurrences = Counter(inputlist)

    cleaned = []
    for comment in inputlist:
        # Only comments seen exactly once survive; iterating the input
        # (rather than a set) keeps the output order deterministic.
        if occurrences[comment] != 1:
            continue
        # '&lt;' is the entity for '<' (it used to be decoded as '>').
        decoded = comment.replace('&gt;', '>').replace('&lt;', '<')
        cleaned.append([token for token in decoded.split(' ') if token != ''])

    return cleaned
# Report how many comments survive cleaning / de-duplication.
print ("original length: " , len(corpus))
cleaned = text_clean(corpus)
print ("cleaned length: " , len(cleaned))


In [None]:
def make_ngrams(n, inputlist):
    """Make ngrams of every n consecutive words of each comment, AS LIST.

    N-grams never span two comments: each comment is windowed on its own,
    and comments shorter than ``n`` tokens contribute nothing.

    Args:
        n: number of consecutive tokens per n-gram.
        inputlist: iterable of comments, each an indexable sequence of tokens.

    Returns:
        A list of n-grams, each a list of ``n`` consecutive tokens.
    """
    ngrams = []
    for comment in inputlist:
        # A comment of length L has L - n + 1 windows. The "+ 1" keeps the
        # final window, which is also the only window when L == n. The range
        # is empty for comments shorter than n.
        for start in range(len(comment) - n + 1):
            ngrams.append([comment[start + offset] for offset in range(n)])
    return ngrams

# n-gram order for the generator: make_dictionary() uses the first N-1 words
# of each n-gram as the key and the last word as the successor.
N = 5
ngrams = make_ngrams(N, cleaned)

In [None]:
# Spot-check: display one randomly chosen n-gram.
choice(list(ngrams))

In [None]:
def make_dictionary(n, ngram):
    """Map each (n-1)-word prefix tuple to the list of words seen right after it.

    Successors are appended in input order and duplicates are kept, so a word
    that follows a prefix more often appears more often in its list.
    """
    successors = defaultdict(list)
    split_at = n - 1

    for gram in ngram:
        prefix = tuple(gram[:split_at])
        successors[prefix].append(gram[split_at])

    return successors

# Successor table: (N-1)-word prefix -> every word observed after that prefix.
ngram_dict = make_dictionary(N, ngrams)

In [None]:
# Histogram: number of recorded successors -> how many prefixes have that many.
length_cnt = defaultdict(int)

for k, v in ngram_dict.items():
    length_cnt[len(v)] += 1

#length_cnt

In [None]:
def filter_dict(threshold, d):
    """Return a new plain dict with the entries of ``d`` whose value holds more than ``threshold`` items."""
    return {key: values for key, values in d.items() if len(values) > threshold}

print('Dictionary length before threhold: ', len(ngram_dict))

# Threshold 1 keeps only prefixes with at least two recorded successors
# (filter_dict tests len(value) > threshold).
filtered = filter_dict(1, ngram_dict)

print('Dictionary length after threhold: ', len(filtered))


In [None]:
def generate(n, ngram_dict, length):
    """Make random text by walking the n-gram successor table.

    Starts from a randomly chosen key (n-1 words) and repeatedly appends a
    random successor of the last n-1 words. When those words are not a key
    of ``ngram_dict``, a successor of a randomly chosen key is appended
    instead. Generation stops once more than ``length`` words have been
    produced AND the last word ends with '.', '!' or '?', so the result can
    be longer than ``length`` words.

    Args:
        n: order of the n-grams ``ngram_dict`` was built from.
        ngram_dict: mapping from (n-1)-word tuples to lists of next words.
        length: word count that must be exceeded before stopping.

    Returns:
        The generated words joined by single spaces.

    Raises:
        IndexError: if ``ngram_dict`` is empty (nothing to choose from).
    """
    # Materialise the keys once: rebuilding this list on every restart is
    # linear in the size of the dictionary.
    keys = list(ngram_dict)

    output = list(choice(keys))  # start off with randomly chosen n-1 words
    word_count = n - 1

    while True:
        # Only stop on a sentence boundary, and only once long enough.
        if output[-1][-1] in ('.', '!', '?') and word_count > length:
            break

        next_key = tuple(output[-(n - 1):])
        if next_key not in ngram_dict:
            # Dead end: continue from a random prefix instead.
            next_key = choice(keys)

        output.append(choice(ngram_dict[next_key]))
        word_count += 1

    return " ".join(output)


## PERPLEXITY

In [None]:
# BUILDING BIGRAM CORPUS

def to_bigram_words(text):
    """Return the adjacent pairs ``(text[i], text[i+1])`` of a sequence, in order.

    Sequences with fewer than two items yield an empty list.
    """
    return list(zip(text, text[1:]))

# input: list of comments, each a sequence of tokens [['this', 'is', 'comment', '1'], ...]
# output: one flat list of adjacent-token pairs [('this', 'is'), ('is', 'comment'), ('comment', '1'), ...]
def get_corpus_char(all_text):
    """Concatenate the bigrams of every comment, in comment order, into one list."""
    return [pair for comment in all_text for pair in to_bigram_words(comment)]

# Every adjacent word pair of the cleaned corpus (pairs never span two comments).
all_tokens = get_corpus_char(cleaned)

In [None]:
def fivegrams(tokens, unseen_prob=2.5e-06):
    """Estimate the relative frequency of every distinct item in ``tokens``.

    Despite the name, no n-gram size is assumed: each distinct hashable item
    is counted and its count is divided by the total number of items.

    Args:
        tokens: iterable of hashable items.
        unseen_prob: baseline returned when the model is indexed with an item
            that never occurred in ``tokens`` (crude smoothing). Because the
            model is a defaultdict, such a lookup also stores the item.

    Returns:
        A defaultdict mapping each seen item to ``count / total``, with
        ``unseen_prob`` as the default for anything else. Empty input gives
        an empty model.
    """
    # Count with exact integers first so the smoothing baseline does not leak
    # into the counts of seen items.
    counts = Counter(tokens)
    total = float(sum(counts.values()))

    model = defaultdict(lambda: unseen_prob)
    for token, count in counts.items():
        model[token] = count / total

    return model

# NOTE: despite the name, all_tokens holds word pairs, so this is a bigram model.
fivegram_prob = fivegrams(all_tokens)

In [None]:
# checking lowest probability word to decide smoothing value above
# NOTE(review): `i` and `aa` are not used anywhere in the visible notebook.
i = 0
aa = []
lowest = 1
lowest_word = ''
# Linear scan for the smallest probability; on ties the first entry found is kept.
for k, v in fivegram_prob.items():
    if v < lowest:
        lowest = v
        lowest_word = k
        
lowest_word, lowest

In [None]:
# computes the perplexity of a space-separated sentence under a bigram model
def perplexity(testset, model):
    """Return the per-bigram perplexity of ``testset`` under ``model``.

    The sentence is split on single spaces, adjacent words are paired into
    bigrams, and the result is the geometric mean of ``1 / model[bigram]``.

    Args:
        testset: sentence as one string of space-separated words.
        model: mapping from ``(word, next_word)`` tuples to probabilities.
            Lookups use ``model[...]``, so a defaultdict model supplies (and
            stores) its default for bigrams it has not seen.

    Returns:
        The perplexity as a float (``inf`` if it exceeds the float range).

    Raises:
        ZeroDivisionError: if ``testset`` has fewer than two words (there is
            no bigram to score) or a bigram has probability 0.
    """
    words = testset.split(" ")
    bigrams = list(zip(words, words[1:]))
    if not bigrams:
        raise ZeroDivisionError("perplexity is undefined for a test set without bigrams")

    # Accumulate in log space: the plain product of inverse probabilities
    # overflows to inf for long sentences or very small probabilities.
    log_total = 0.0
    for bigram in bigrams:
        log_total += math.log(1 / model[bigram])

    try:
        return math.exp(log_total / len(bigrams))
    except OverflowError:
        return float('inf')

In [None]:
# Rejection-sample 2000 generated sentences whose perplexity is below 999999.
# NOTE(review): `filtered` is built from the subreddits listed in `subreds`;
# presumably that list was switched to 'nba' before running this cell — confirm.
NBA_toeval = []
while(len(NBA_toeval) < 2000):
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        NBA_toeval.append(sent)
    #else:
        #print ('aa')

In [None]:
# Score every accepted n-gram sentence; the cell displays the mean perplexity.
NBA_scores_ng = []
for sent in NBA_toeval:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    NBA_scores_ng.append(sc)
    
np.mean(NBA_scores_ng)

In [None]:
# Load one sentence per line from hmm/nbaResults.txt
# (presumably the output of a separate HMM generator — confirm).
NBA_hmm = []
with open('hmm/nbaResults.txt', 'r') as f:
    
    #for line in f:
    text = f.read()
    NBA_hmm = text.split('\n')
    
# Keep only sentences with exactly 12 whitespace-separated words.
NBA_hmm = [ sent for sent in NBA_hmm if len(sent.split()) == 12]

In [None]:
# Score the sentences loaded from file with the same model; displays the mean perplexity.
NBA_scores_hmm = []
for sent in NBA_hmm:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    NBA_scores_hmm.append(sc)
    
np.mean(NBA_scores_hmm)

In [None]:
# Rejection-sample 2000 generated sentences whose perplexity is below 999999.
# The list must be initialised here: with the initialiser commented out the
# loop condition raised NameError on a fresh kernel (Restart & Run All).
NFL_toeval = []
while len(NFL_toeval) < 2000:
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        NFL_toeval.append(sent)

In [None]:
# Score every accepted n-gram sentence; the cell displays the mean perplexity.
NFL_scores_ng = []
for sent in NFL_toeval:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    NFL_scores_ng.append(sc)
    
np.mean(NFL_scores_ng)

In [None]:
# Load one sentence per line from hmm/nflResults.txt
# (presumably the output of a separate HMM generator — confirm).
NFL_hmm = []
with open('hmm/nflResults.txt', 'r') as f:
    
    #for line in f:
    text = f.read()
        #print (unicode(line, errors='ignore'))
    NFL_hmm = text.split('\n')
    
# Keep only sentences with exactly 12 whitespace-separated words.
NFL_hmm = [ sent for sent in NFL_hmm if len(sent.split()) == 12]

In [None]:
# Score the sentences loaded from file with the same model; displays the mean perplexity.
NFL_scores_hmm = []
for sent in NFL_hmm:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    NFL_scores_hmm.append(sc)
    
np.mean(NFL_scores_hmm)

In [None]:
# Rejection-sample 2000 generated sentences whose perplexity is below 999999.
# NOTE(review): `filtered` is built from the subreddits listed in `subreds`;
# presumably that list was switched to 'AskReddit' before running this cell — confirm.
AR_toeval = []
while(len(AR_toeval) < 2000):
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        AR_toeval.append(sent)
    else:
        # prints a marker line for every rejected sentence
        print ('aa')

In [None]:
# Score every accepted n-gram sentence; the cell displays the mean perplexity.
AR_scores_ng = []
for sent in AR_toeval:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    AR_scores_ng.append(sc)
    
np.mean(AR_scores_ng)

In [None]:
# Load one sentence per line from hmm/AskRedditResults.txt
# (presumably the output of a separate HMM generator — confirm).
AR_hmm = []
with open('hmm/AskRedditResults.txt', 'r') as f:
    
    #for line in f:
    text = f.read()
        #print (unicode(line, errors='ignore'))
    AR_hmm = text.split('\n')
    
# Keep only sentences with exactly 12 whitespace-separated words.
AR_hmm = [ sent for sent in AR_hmm if len(sent.split()) == 12]

In [None]:
# Score the sentences loaded from file with the same model; displays the mean perplexity.
AR_scores_hmm = []
for sent in AR_hmm:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    AR_scores_hmm.append(sc)
    
np.mean(AR_scores_hmm)