In [None]:
import numpy as np
import sqlite3, time, csv, re
from collections import defaultdict, Counter
from random import choice, randint, shuffle


In [None]:
# Load (subreddit, body) pairs for the first 500,000 rows of the May2015 table.
print('Querying DB...\n')
sql_conn = sqlite3.connect("database.sqlite")
# fetchall() materializes the rows into a list so `the_data` can be iterated
# more than once. A bare cursor is exhausted after a single pass, which would
# leave the corpus empty whenever the corpus-building cell is re-run.
the_data = sql_conn.execute("SELECT subreddit, body FROM May2015 limit 500000").fetchall()
print('Done querying DB...\n')

In [None]:
# Group comment bodies by the subreddit they were posted in:
# corpus_dict[subreddit] -> [body, body, ...]
print('Building Corpora...\n')
corpus_dict = defaultdict(list)

for subreddit, body in the_data:
    corpus_dict[subreddit].append(body)
print('Done building Corpora...\n')

In [None]:
# separating comments by their subreddits
# 'subreddit1' : ['comment1', 'comment2', 'comment3', etc]
#for k in sorted(corpus_dict, key=lambda k: len(corpus_dict[k]), reverse=True):
#    print (k, len(corpus_dict[k]))

In [None]:
# Choose which subreddits contribute comments to the working corpus.

subreds = ['nba']
#subreds = ['leagueoflegends']


# One list of comments per selected subreddit: [[comment, ...], ...]
temp = [corpus_dict[name] for name in subreds]

# All selected comments in one flat list: [comment1, comment2, ...]
sentences = [comment for per_subreddit in temp for comment in per_subreddit]

# Every character of every comment, in order: [c, o, m, m, e, n, t, ...]
characters = [ch for comment in sentences for ch in comment]

In [None]:
# remove tabs/newline??
# remove if exact same comments appear 5+ times
#    '*Please refer to our [detailed rules and posting guidelines.]'
#    'feel free to make a new post that fits within the rules'
# remove links? 

def text_clean(inputlist):
    """Drop boilerplate comments, de-duplicate, and normalize the rest.

    A comment that occurs more than 5 times verbatim in ``inputlist`` is
    treated as boilerplate and removed entirely. Every other distinct comment
    is kept exactly once, in order of first appearance. Kept comments have
    runs of spaces collapsed to a single space and the HTML entities
    ``&gt;``, ``&lt;`` and ``&amp;`` decoded.

    Args:
        inputlist: list of comment strings.

    Returns:
        list of cleaned comment strings.
    """
    cnt = Counter(inputlist)

    cleaned = []
    # Counter keeps keys in first-seen order, so the result is reproducible
    # from run to run (a set difference would yield an arbitrary order).
    for comment, occurrences in cnt.items():
        if occurrences > 5:
            continue
        c = re.sub(' +', ' ', comment)
        # '&amp;' is decoded last so that '&amp;gt;' becomes the literal
        # '&gt;' instead of being unescaped twice.
        cleaned.append(c.replace('&gt;', '>').replace('&lt;', '<').replace('&amp;', '&'))

    return cleaned

# Report how many comments go into and come out of the cleaning step.
print ("original length: " , len(sentences))

cleaned = text_clean(sentences)

print ("cleaned length: " , len(cleaned))

In [None]:
# Peek at the first ten cleaned comments.
cleaned[:10]

In [None]:
## combine with dictionary?
def make_ngrams(n, inputlist):
    """Build a character-level n-gram lookup table from a list of comments.

    Every window of ``n`` consecutive characters in a comment is split into a
    context (its first ``n - 1`` characters, as a tuple) and the character
    that follows that context.

    Args:
        n: window size in characters; expected to be at least 1.
        inputlist: list of comment strings. Comments shorter than ``n``
            contribute nothing.

    Returns:
        defaultdict(list) mapping each context tuple to the list of characters
        observed after it. Repeated observations are kept, so picking
        uniformly from a list samples in proportion to observed frequency.
    """
    temp_dict = defaultdict(list)

    for comment in inputlist:
        # range() is empty for comments shorter than n. The "+ 1" makes the
        # final window count too, so a comment of exactly n characters
        # contributes its one n-gram.
        for x in range(len(comment) - n + 1):
            keys = tuple(comment[x:x + n - 1])
            final_char = comment[x + n - 1]
            temp_dict[keys].append(final_char)
    return temp_dict

# Window size of the character model: each context is N - 1 = 4 characters.
N = 5
ngrams = make_ngrams(N, cleaned)

In [None]:
# Histogram: number of observed continuations -> how many contexts have that many.
length_cnt = defaultdict(int)

for continuations in ngrams.values():
    length_cnt[len(continuations)] += 1

#length_cnt

In [None]:
def filter_dict(threshold, d):
    """Return a new dict holding the entries of ``d`` whose value is longer than ``threshold``.

    Args:
        threshold: entries are kept only when ``len(value) > threshold``.
        d: mapping whose values support ``len()``.

    Returns:
        A plain dict with the surviving entries, in their original order.
    """
    return {key: values for key, values in d.items() if len(values) > threshold}

# Keep only contexts with more than 10 observed continuations and report
# how many contexts exist before and after the cut.
print('Dictionary length before threhold: ', len(ngrams))

filtered = filter_dict(10, ngrams)

print('Dictionary length after threhold: ', len(filtered))


## Generate random string

In [None]:
def generate(n, ngram_dict, length):
    """Generate random text from a character n-gram table.

    The text starts from a randomly chosen context, preferring one whose first
    character is upper case, and grows one character at a time. Generation
    stops at the first sentence-ending character ('.', '!' or '?') reached
    once more than ``length`` characters have been produced, so the result is
    normally somewhat longer than ``length``.

    Args:
        n: n-gram size the table was built with (contexts hold ``n - 1``
            characters); expected to be at least 2.
        ngram_dict: mapping from context tuple to the list of characters seen
            after that context.
        length: minimum number of characters to produce before a
            sentence-ending character is allowed to stop generation.

    Returns:
        The generated text as a single string.

    Raises:
        IndexError: if ``ngram_dict`` is empty.

    Note:
        There is no upper bound on the output: if the table can never yield
        '.', '!' or '?', this function does not return.
    """
    # Built once; the table is not modified while generating.
    contexts = list(ngram_dict)

    # Prefer a context that starts with an upper-case letter, but fall back to
    # any context when the table has none rather than failing.
    capitalized = [ctx for ctx in contexts if ctx[0].isupper()]
    output = list(choice(capitalized or contexts))

    letter_count = n - 1

    while True:
        last_char = output[-1][-1]

        # Stop only at the end of a sentence, and only once long enough.
        if last_char in ('.', '!', '?') and letter_count > length:
            break

        next_key = tuple(output[-(n - 1):])

        if next_key not in ngram_dict:
            # Unseen context: borrow the continuations of a random context so
            # generation can carry on.
            next_key = choice(contexts)

        output.append(choice(ngram_dict[next_key]))
        letter_count += 1

    return "".join(output)


## Input starting string

In [None]:
def generate_with_start(startstr, n, ngram_dict, length):
    """Generate random text that begins with ``startstr``.

    When ``startstr`` has at least ``n`` characters, generation continues from
    its last ``n - 1`` characters. Otherwise a random context that begins with
    ``startstr`` is chosen as the opening. If the table holds no matching
    context, ``startstr`` is kept verbatim and generation continues from a
    random context, exactly as it does for any unseen context later on.

    Generation stops at the first sentence-ending character ('.', '!' or '?')
    reached once the internal counter exceeds ``length``. The counter starts
    at ``n - 1`` regardless of how long ``startstr`` is.

    Args:
        startstr: text the output must start with.
        n: n-gram size the table was built with (contexts hold ``n - 1``
            characters); expected to be at least 2.
        ngram_dict: mapping from context tuple to the list of characters seen
            after that context.
        length: minimum counter value before a sentence-ending character is
            allowed to stop generation.

    Returns:
        The generated text as a single string.

    Raises:
        IndexError: if ``ngram_dict`` is empty.

    Note:
        There is no upper bound on the output: if the table can never yield
        '.', '!' or '?', this function does not return.
    """
    # Built once; the table is not modified while generating.
    contexts = list(ngram_dict)

    if len(startstr) >= n:
        # Long prompt: continue from its last n - 1 characters and keep
        # whatever precedes them as a fixed head.
        prefix = tuple(startstr[-(n - 1):])
        head = list(startstr[:len(startstr) - len(prefix)])
    else:
        # Short prompt: any context that begins with the prompt will do.
        prefix = tuple(startstr)
        head = []

    candidates = [ng for ng in contexts if ng[:len(prefix)] == prefix]

    if candidates:
        output = head + list(choice(candidates))
    else:
        # The prompt never occurs in the table. Keep it as written; the loop
        # below falls back to a random context for the next character.
        output = list(startstr)

    letter_count = n - 1

    while True:
        last_char = output[-1][-1]

        # Stop only at the end of a sentence, and only once long enough.
        if last_char in ('.', '!', '?') and letter_count > length:
            break

        next_key = tuple(output[-(n - 1):])

        if next_key not in ngram_dict:
            # Unseen context: borrow the continuations of a random context so
            # generation can carry on.
            next_key = choice(contexts)

        output.append(choice(ngram_dict[next_key]))
        letter_count += 1

    return "".join(output)

In [None]:
# Example: generate text starting with "A", stopping at the first sentence end
# after roughly 100 characters.
startstr = "A"
generate_with_start(startstr, N, filtered, 100)

## Perplexity -- lower is better

In [None]:
# Should the corpus be character ngram? or unigram words???

# wrote functions for getting both character level corpus & unigram corpus
# with the character-level corpus, the probability of any given character 5-gram is so low
# that the -log probability calculated later comes out as infinity for almost everything

# switching to word level is a bit better

# when using below functions, use either get_corpus_char, or get_corpus_uni to get all possible tokens from corpus

In [None]:
len(cleaned) # number of comments remaining after text_clean

In [None]:
# BUILDING CHARACTER LEVEL CORPUS

def char_ngrams(text, n=5):
    """Return every run of ``n`` consecutive characters of ``text``, in order.

    Args:
        text: string (or other sliceable sequence) to window over.
        n: window length; defaults to 5.

    Returns:
        List of slices of length ``n``; empty when ``text`` is shorter than ``n``.
    """
    windows = []
    last_start = len(text) - n
    for start in range(last_start + 1):
        windows.append(text[start:start + n])
    return windows
   
def get_corpus_char(all_text):
    """Collect the character n-grams of every comment into one flat list.

    Args:
        all_text: list of comments, e.g.
            ['this is comment 1', 'this is comment 2', ...].

    Returns:
        List of the windows produced by ``char_ngrams`` with its default
        size (5 characters), e.g.
        ['this ', 'his i', 'is is', 's is ', ' is c', 'is co', ...].
    """
    return [gram for comment in all_text for gram in char_ngrams(comment)]

# Character-level token list: every 5-character window of every cleaned comment.
all_tokens = get_corpus_char(cleaned)

In [None]:
# Number of character-level tokens.
len(all_tokens)

In [None]:
# BUILDING WORD LEVEL CORPUS

# should this be case-insensitive??
def unigram_words(text):
    """Split ``text`` into word tokens on any run of whitespace.

    Uses ``str.split()`` with no separator, the same tokenization that
    ``get_corpus_uni`` and ``perplexity`` apply, so tabs and newlines also
    separate words and no empty tokens are produced.

    Args:
        text: string to tokenize.

    Returns:
        List of word strings; empty for empty or whitespace-only input.
    """
    return text.split()

def get_corpus_uni(all_text):
    """Flatten a list of comments into one list of whitespace-separated words.

    Args:
        all_text: list of comments, e.g.
            ['this is comment 1', 'this is comment 2', ...].

    Returns:
        List of individual words in corpus order, e.g.
        ['this', 'is', 'comment', '1', ...].
    """
    return [word for comment in all_text for word in comment.split()]

# Word-level token list; this replaces the character-level `all_tokens` built earlier.
all_tokens = get_corpus_uni(cleaned)

In [None]:
# Number of word-level tokens.
len(all_tokens)

In [None]:
def fivegrams(tokens, unseen_prob=4.5e-06):
    """Estimate relative-frequency probabilities for a list of tokens.

    Despite the name, this counts whatever tokens it is given (words,
    character n-grams, ...) and divides each count by the total.

    Args:
        tokens: iterable of hashable tokens.
        unseen_prob: probability reported for tokens that never occurred in
            ``tokens`` (a crude smoothing floor).

    Returns:
        defaultdict mapping token -> count / total. Looking up a token that
        was never seen returns ``unseen_prob`` and, as with any defaultdict,
        stores that entry in the mapping.
    """
    counts = Counter(tokens)
    total = float(sum(counts.values()))

    # The default applies only to tokens that were never counted. Counting
    # directly into the defaultdict (`model[f] += 1`) would add the smoothing
    # constant to every count and skew the probabilities.
    model = defaultdict(lambda: unseen_prob)
    for token, count in counts.items():
        model[token] = count / total

    return model

# Probability model over the current `all_tokens` list.
fivegram_prob = fivegrams(all_tokens)

In [None]:
# checking lowest probability word to decide smoothing value above
i = 0  # NOTE(review): not used in this cell
aa = []  # NOTE(review): not used in this cell
lowest = 1
lowest_word = ''
# Linear scan for the smallest probability; on ties the first token seen wins.
for k, v in fivegram_prob.items():
    if v < lowest:
        lowest = v
        lowest_word = k

# Displayed as the cell output: (token, probability).
lowest_word, lowest

In [None]:
# again, might make everything lowercase when calculating perplexity
# (tokens are not lower-cased anywhere, so these are two separate entries)
print(fivegram_prob['reddit'])
print(fivegram_prob['Reddit'])

In [None]:
#computes perplexity of the unigram model on a testset  
def perplexity(testset, model):
    """Compute the per-word perplexity of ``testset`` under a unigram model.

    Perplexity is ``(prod_i 1 / p(w_i)) ** (1 / N)`` over the ``N`` words of
    ``testset``. It is evaluated in log space as
    ``exp(-(1 / N) * sum_i log p(w_i))`` because the direct product overflows
    to infinity after a few dozen low-probability words.

    Args:
        testset: text to score; it is split on whitespace.
        model: mapping word -> probability, read with ``model[word]`` (for
            example the defaultdict returned by ``fivegrams``).

    Returns:
        The perplexity as a float (lower is better). ``float('inf')`` is
        returned when ``testset`` contains no words or when any word has a
        probability of zero or less.
    """
    import math  # local import: the notebook's import cell does not provide math

    words = testset.split()
    if not words:
        # Nothing to score; report it as unscorable instead of dividing by zero.
        return float('inf')

    log_prob_sum = 0.0
    for word in words:
        prob = model[word]
        if prob <= 0:
            return float('inf')
        log_prob_sum += math.log(prob)

    try:
        return math.exp(-log_prob_sum / len(words))
    except OverflowError:
        return float('inf')

In [None]:
# Collect 2000 n-gram generated sentences whose perplexity is below 999999.
# NOTE(review): `filtered` and `fivegram_prob` come from whatever `subreds`
# was selected earlier — presumably this cell is meant to run with the
# AskReddit corpus loaded; confirm.
AR_toeval = []  # must be initialized here, otherwise the loop raises NameError on a fresh kernel
while len(AR_toeval) < 2000:
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        AR_toeval.append(sent)

In [None]:
# Mean perplexity of the n-gram generated sentences in AR_toeval.
AR_scores_ng = [perplexity(sent, fivegram_prob) for sent in AR_toeval]

np.mean(AR_scores_ng)

In [None]:
# Read the RNN output for AskReddit and cut it into comment-sized chunks.
AR_rnn = []
with open('rnn/AskReddit_rnn.txt', 'r') as f:
    text = f.read()

# A chunk is closed at the first space seen once it holds at least 150
# characters; that space is dropped. Text left over after the last cut is
# not appended.
comment = ''
for ch in text:
    if len(comment) >= 150 and ch == ' ':
        AR_rnn.append(comment)
        comment = ''
    else:
        comment += ch

In [None]:
# Number of RNN chunks available for scoring.
len(AR_rnn)

In [None]:
# Mean perplexity of the RNN-generated AskReddit chunks.
AR_scores_rnn = []
# Score AR_rnn (read from rnn/AskReddit_rnn.txt), not NBA_rnn, which belongs
# to the NBA experiment and is only defined in a later cell.
for sent in AR_rnn:
    AR_scores_rnn.append(perplexity(sent, fivegram_prob))

np.mean(AR_scores_rnn)

In [None]:
# Collect 2000 n-gram generated sentences whose perplexity is below 999999.
NBA_toeval = []  # must be initialized here, otherwise the loop raises NameError on a fresh kernel
while len(NBA_toeval) < 2000:
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        NBA_toeval.append(sent)

In [None]:
# Mean perplexity of the n-gram generated sentences in NBA_toeval.
NBA_scores_ng = [perplexity(sent, fivegram_prob) for sent in NBA_toeval]

np.mean(NBA_scores_ng)

In [None]:
# Read the RNN output for r/nba and cut it into comment-sized chunks.
NBA_rnn = []
with open('rnn/nba_rnn.txt', 'r') as f:
    text = f.read()

# A chunk is closed at the first space seen once it holds at least 150
# characters; that space is dropped. Text left over after the last cut is
# not appended.
comment = ''
for ch in text:
    if len(comment) >= 150 and ch == ' ':
        NBA_rnn.append(comment)
        comment = ''
    else:
        comment += ch

In [None]:
# Mean perplexity of the RNN-generated chunks in NBA_rnn.
NBA_scores_rnn = [perplexity(sent, fivegram_prob) for sent in NBA_rnn]

np.mean(NBA_scores_rnn)

In [None]:
# Collect 2000 n-gram generated sentences (length argument 200) whose
# perplexity is below 999999.
NFL_toeval = []
while len(NFL_toeval) < 2000:
    sent = generate(N, filtered, 200)
    score = perplexity(sent, fivegram_prob)
    if score < 999999:
        NFL_toeval.append(sent)

In [None]:
# Mean perplexity of the n-gram generated NFL sentences.
NFL_scores_ng = []
# Score NFL_toeval (the sentences collected for this experiment), not NBA_toeval.
for sent in NFL_toeval:
    NFL_scores_ng.append(perplexity(sent, fivegram_prob))

np.mean(NFL_scores_ng)

In [None]:
# Read the RNN output for r/nfl and cut it into comment-sized chunks.
NFL_rnn = []
with open('rnn/nfl_rnn.txt', 'r') as f:
    text = f.read()

# A chunk is closed at the first space seen once it holds at least 150
# characters; that space is dropped. Text left over after the last cut is
# not appended.
comment = ''
for ch in text:
    if len(comment) >= 150 and ch == ' ':
        NFL_rnn.append(comment)
        comment = ''
    else:
        comment += ch

In [None]:
# Mean perplexity of the RNN-generated chunks in NFL_rnn.
NFL_scores_rnn = [perplexity(sent, fivegram_prob) for sent in NFL_rnn]

np.mean(NFL_scores_rnn)