In [393]:
import numpy as np
import sqlite3, time, csv, re
from collections import defaultdict, Counter
from random import choice, randint, shuffle
import unicodedata


In [504]:
print('Querying DB...\n')
sql_conn = sqlite3.connect("database.sqlite")
the_data = sql_conn.execute("SELECT subreddit, body FROM May2015 limit 500000")
print('Done querying DB...\n')

Querying DB...

Done querying DB...



In [505]:
print('Building Corpora...\n')
corpus_dict = defaultdict(list)

for post in the_data:
    # corpus.append(post[1])
    corpus_dict[post[0]].append(post[1])
print('Done building Corpora...\n')

Building Corpora...

Done building Corpora...



In [506]:
#sorted_corpus = sorted(corpus_dict, key=lambda k: len(corpus_dict[k]), reverse=True)

for k in sorted(corpus_dict, key=lambda k: len(corpus_dict[k]), reverse=True):
    print (k, len(corpus_dict[k]))

AskReddit 37018
nfl 35062
nba 15721
hockey 9031
funny 7032
WTF 6025
leagueoflegends 5710
videos 5057
news 4917
movies 4320
worldnews 3991
pics 3947
todayilearned 3415
AdviceAnimals 3057
DotA2 2913
pcmasterrace 2784
SquaredCircle 2548
GlobalOffensive 2476
amiibo 2443
fatpeoplehate 2356
Showerthoughts 2346
anime 2281
politics 2226
trees 2208
KCRoyals 2198
DestinyTheGame 2149
gaming 2145
Browns 2110
eagles 2108
CasualConversation 2107
Texans 2080
Music 2043
IAmA 1977
minnesotavikings 1929
LAClippers 1803
gifs 1777
chicagobulls 1735
electronic_cigarette 1672
relationships 1654
gonewild 1613
smashbros 1517
rangers 1507
thebutton 1444
CHIBears 1434
Fireteams 1375
aww 1345
cowboys 1341
soccer 1298
Mariners 1289
newsokur 1244
TwoXChromosomes 1242
Patriots 1236
csgobetting 1231
explainlikeimfive 1224
greysanatomy 1217
GreenBayPackers 1201
whowouldwin 1197
teenagers 1186
AskWomen 1180
Pokemongiveaway 1176
technology 1175
Random_Acts_Of_Amazon 1175
personalfinance 1155
TumblrInAction 1125
caps 11

In [507]:
#subreds = ['AskReddit', 'news','csgobetting', 'nfl', 'nba', 'hockey', 'chicagobulls']
subreds = ['nfl']

temp = [corpus_dict[a] for a in subreds]
corpus = [item for sublist in temp for item in sublist]


In [508]:

# removing newline; tabs and encoding stuff
# remove if exactly same comments appear 5+ times
#    '*Please refer to our [detailed rules and posting guidelines.]'
#    'feel free to make a new post that fits within the rules'
# remove links? 
def text_clean(inputlist):
    
    cnt = Counter(inputlist)
    dups = [k for k, v in cnt.items() if v > 1]
    
    #dups = [item for item in set(cleaned) if cleaned.count(item) > 1]
    removed = list(set(inputlist) - set(dups))
    
    cleaned = []
    for comment in removed:
        cleaned.append([a for a in comment.replace('&gt;', '>').replace('&lt;', '>').split(' ') if (a!='')]) #.replace("\n", '').replace("\t", '')
    
    

    
    return cleaned
print ("original length: " , len(corpus))
cleaned = text_clean(corpus)
print ("cleaned length: " , len(cleaned))


original length:  35062
cleaned length:  31111


In [509]:
def make_ngrams(n, inputlist):
    """Make ngrams of every n consecutive
    words to feed the dictionary function, AS LIST."""
    ngrams = []
    
    '''    
    for x in range(0, len(inputlist)-n):
        wordlist = []
        for i in range(n):
            wordlist.append(inputlist[x+i])
        ngrams.append(wordlist)
    ''' 
    for comment in inputlist:
        if len(comment) >= n:
            for x in range(0, len(comment)-n):
                
                wordlist = []
                
                for i in range(n):
                    wordlist.append(comment[x+i])
                
                ngrams.append(wordlist)
    return ngrams

N = 5
ngrams = make_ngrams(N, cleaned)

In [510]:
choice(list(ngrams))

['lil', 'bit', 'more', 'time.\n\n\nThey', 'wait']

In [511]:
def make_dictionary(n, ngram):
    """For every ngram, takes first n-1 words as key, and last as value."""

    temp_dict = defaultdict(list)
    
    for ng in ngram:
        
        wordlist = ng[:n-1]
        final_word = ng[n-1]
        
        temp_dict[tuple(wordlist)].append(final_word)

    return temp_dict

ngram_dict = make_dictionary(N, ngrams)

In [512]:
length_cnt = defaultdict(int)

for k, v in ngram_dict.items():
    length_cnt[len(v)] += 1

length_cnt

defaultdict(int,
            {1: 214730,
             2: 4593,
             3: 879,
             4: 274,
             5: 131,
             6: 64,
             7: 35,
             8: 19,
             9: 23,
             10: 16,
             11: 16,
             12: 7,
             13: 4,
             14: 5,
             15: 3,
             16: 3,
             17: 2,
             18: 4,
             19: 2,
             21: 2,
             22: 2,
             23: 2,
             26: 1,
             29: 2,
             31: 1,
             32: 1,
             33: 4,
             41: 1,
             52: 1,
             72: 1,
             77: 1})

In [513]:
def filter_dict(threshold, d):
    final_d = {}
    for k, v in d.items():
        if len(v) > threshold:
            final_d[k] = v
        
    return final_d

print('Dictionary length before threhold: ', len(ngram_dict))

filtered = filter_dict(1, ngram_dict)

print('Dictionary length after threhold: ', len(filtered))


Dictionary length before threhold:  220829
Dictionary length after threhold:  6099


In [514]:
def generate(n, ngram_dict, length):
    """Make random text of given length (using ngrams of the given n)."""
    
    #seed_no = randint(0,len(inputlist)) # choose random seed
    
    start = list(choice(list(ngram_dict))) # start off with randomly chosen n-1 words
    output = start
    
    word_count = n-1
    done = 0
    
    while(True):
        
        last_char = output[-1][-1]
        
        # last character was end of sentence punct:
        if (last_char in ['.', '!', '?']):
            if (word_count > length):
                break
        
        next_key = tuple(output[-(n-1):])
        
        if next_key not in ngram_dict:
            #print ('restart')
            next_key = choice(list(ngram_dict))
        
        output.append(choice(ngram_dict[next_key]))
                
        
        word_count += 1

    return " ".join(output)


## PERPLEXITY

In [515]:
# BUILDING BIGRAM CORPUS

def to_bigram_words(text):
    
    bigrams = []
    for i in range(0, len(text)-1):
        bigrams.append((text[i], text[i+1]))
    return bigrams

# input: list of comments ['this is comment 1', 'this is comment 2', etc]
# output: list of character level 5-grams ['this ', 'his is', 'is is', 's is ', ' is c', 'is co' , etc]    
def get_corpus_char(all_text):
    all_tokens = []

    for comment in all_text:
        #print(comment)
        all_tokens += to_bigram_words(comment)
    return all_tokens

all_tokens = get_corpus_char(cleaned)

In [518]:
def fivegrams(tokens):
    
    model = defaultdict(lambda: 2.5e-06) # baseline for words that don't appear in corpus -- smoothing
    for f in tokens:
        try:
            model[f] += 1
        except KeyError:
            model [f] = 1
            continue

    total = float(sum(model.values()))
    
    for word in model:
        model[word] = model[word]/total
    
    return model

fivegram_prob = fivegrams(all_tokens)

In [519]:
# checking lowest probability word to decide smoothing value above
i = 0
aa = []
lowest = 1
lowest_word = ''
for k, v in fivegram_prob.items():
    if v < lowest:
        lowest = v
        lowest_word = k
        
lowest_word, lowest

(('some', 'Steven'), 2.9673538938206285e-06)

In [520]:
#computes perplexity of the unigram model on a testset  
def perplexity(testset, model):
    
    #testset = testset.split()
    #testset = char_ngrams(testset)
    
    testset = [b for l in [testset] for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]

    #print(testset)
    perplexity = 1
    N = 0
    for word in testset:
        N += 1
        perplexity = perplexity * (1/model[word])
        
    perplexity = pow(perplexity, 1/float(N))
    
    return perplexity

In [458]:
NBA_toeval = []
while(len(NBA_toeval) < 2000):
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        NBA_toeval.append(sent)
    else:
        print ('aa')

aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa


In [459]:
NBA_scores_ng = []
for sent in NBA_toeval:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    NBA_scores_ng.append(sc)
    
np.mean(NBA_scores_ng)

122951.66989038413

In [460]:
NBA_hmm = []
with open('hmm/nbaResults.txt', 'r') as f:
    
    #for line in f:
    text = f.read()
    NBA_hmm = text.split('\n')
    
NBA_hmm = [ sent for sent in NBA_hmm if len(sent.split()) == 12]

In [461]:
NBA_scores_hmm = []
for sent in NBA_hmm:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    NBA_scores_hmm.append(sc)
    
np.mean(NBA_scores_hmm)

292267.10878983542

In [542]:
#NFL_toeval = []
while(len(NFL_toeval) < 2000):
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        NFL_toeval.append(sent)
        print (len(NFL_toeval))
    else:
        print ('aa')

1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
aa
1667
1668
1669
1670
1671
aa
1672
1673
1674
aa
1675
1676
aa
1677
1678
aa
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
aa
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
aa
aa
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
aa
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
aa
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
aa
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
aa
1767
1768
1769
1770
aa
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
aa
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
aa
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
aa
18

In [543]:
NFL_scores_ng = []
for sent in NFL_toeval:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    print(sc)
    NFL_scores_ng.append(sc)
    
np.mean(NFL_scores_ng)

279053.49569313537
174910.31740123263
173109.08081068908
138056.74475427126
71867.43664949587
151525.23817392063
296343.77854408603
56654.93407753886
200440.63127972884
48340.09245800481
196557.10077586622
78192.92769228836
171972.11050490593
219978.58531074467
13497.521527231147
157149.2305169771
139709.96196832266
98124.77890700685
184381.16417233174
227617.37662932638
93165.99446127658
139545.60606063757
70891.50917055724
168901.37250520123
93334.89391747637
141805.26925994278
174119.751118885
162269.8516011771
171219.97151737884
98713.11002443252
65684.36909709123
236840.46858061774
155793.3288481249
213527.06125360145
144827.69793693363
245080.07461207127
115837.64529725588
174622.93128369199
195729.49658608055
197910.53284397267
195096.68760052553
143039.55466778536
261280.44271077198
228611.96167067232
115624.78479130384
164162.98385672408
86311.0433225181
135988.47896274575
138258.2891125075
229603.68884747187
191188.02990563464
111259.22676281452
201190.7254428163
136889.99484

147496.43684154112

In [525]:
NFL_hmm = []
with open('hmm/nflResults.txt', 'r') as f:
    
    #for line in f:
    text = f.read()
        #print (unicode(line, errors='ignore'))
    NFL_hmm = text.split('\n')
    
NFL_hmm = [ sent for sent in NFL_hmm if len(sent.split()) == 12]

In [526]:
NFL_scores_hmm = []
for sent in NFL_hmm:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    NFL_scores_hmm.append(sc)
    
np.mean(NFL_scores_hmm)

293236.66436370951

In [441]:
AR_toeval = []
while(len(AR_toeval) < 2000):
    sent = generate(N, filtered, 12)
    if perplexity(sent, fivegram_prob) < 999999:
        AR_toeval.append(sent)
    else:
        print ('aa')

aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa
aa


In [442]:
AR_scores_ng = []
for sent in AR_toeval:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    AR_scores_ng.append(sc)
    
np.mean(AR_scores_ng)

144317.55286906497

In [443]:
AR_hmm = []
with open('hmm/AskRedditResults.txt', 'r') as f:
    
    #for line in f:
    text = f.read()
        #print (unicode(line, errors='ignore'))
    AR_hmm = text.split('\n')
    
AR_hmm = [ sent for sent in AR_hmm if len(sent.split()) == 12]

In [444]:
AR_scores_hmm = []
for sent in AR_hmm:
    #print(sent)
    sc = perplexity(sent, fivegram_prob)
    #print(sc)
    AR_scores_hmm.append(sc)
    
np.mean(AR_scores_hmm)

283473.57305733149