In [39]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting happens in the cells visible here -- confirm this is still needed.
%matplotlib inline

In [1]:
import re
import nltk
import argparse
import numpy as np
import pandas as pd

from nltk.collocations import *
from clpsych.helpers import load_tokens

In [44]:
def filter_stopwords(docs, stopwords):
    """ Return a copy of `docs` with every stopword token removed.

    docs      -- iterable of token sequences (one sequence per document)
    stopwords -- container supporting `in` (a set gives constant-time lookups)

    The result is a new list of new lists; the inputs are left untouched and
    token order within each document is preserved.
    """
    filtered_docs = []
    for tokens in docs:
        kept = [token for token in tokens if token not in stopwords]
        filtered_docs.append(kept)
    return filtered_docs

def load_stopwords(filename):
    """ Load stopwords from a file and return the individual words as a set.

    filename -- path to a text file with one stopword per line

    Surrounding whitespace is stripped from every line. Blank lines (including
    the empty entry produced by a trailing newline) are skipped so the empty
    string never ends up in the stopword set.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(filename) as handle:
        stripped = (line.strip() for line in handle)
        return {word for word in stripped if word}

def print_collocations(collocations):
    """ Pretty-print scored n-grams, one per line.

    collocations -- iterable of entries where item 0 is a sequence of string
                    tokens and item 1 is the score

    Each line is the space-joined tokens, a tab, then the score.
    """
    row_template = '{}\t{}'
    for entry in collocations:
        ngram_text = ' '.join(entry[0])
        print(row_template.format(ngram_text, entry[1]))

def collocations(finder, fn, cutoff=None, max_count=None):
    """ Get the top collocations using the NLTK finder.

    finder    -- object exposing `score_ngrams(fn)`
    fn        -- association measure handed straight to `score_ngrams`
    cutoff    -- if given, keep only entries whose score (item 1) is >= cutoff
    max_count -- if given, consider only the first `max_count` scored entries

    The `max_count` slice is taken first and the cutoff is applied to that
    slice, so at most `max_count` entries are ever returned.
    """
    scored = finder.score_ngrams(fn)
    top = scored[:max_count]
    if cutoff is None:
        # no threshold requested: hand back the (possibly truncated) ranking
        return list(top)
    return [entry for entry in top if entry[1] >= cutoff]

def load_docs():
    """ Load the documents, and their desired labels.

    Reads the class labels from 'data/classes/train_classes.txt', loads the
    token files matching 'data/tokens/tokens?.txt' via `load_tokens`, and
    inner-joins the two on 'post_id' so only labelled posts remain.

    Returns a DataFrame with the columns 'post_id', 'class' and
    'text_features', where 'text_features' is the post title and body joined
    by a single space. Missing values are replaced with '' before joining.
    """
    # DataFrame.from_csv was deprecated in pandas 0.21 and removed in 1.0;
    # read_csv with these arguments reproduces its defaults (first column as
    # the index, dates parsed).
    sample_classes = pd.read_csv(
        'data/classes/train_classes.txt', index_col=0, parse_dates=True
    )
    # load all the tokens into a dataframe
    df = load_tokens(mask='data/tokens/tokens?.txt')
    # just get the documents we're looking at -- the samples in this case
    sampled_docs = df.merge(sample_classes, on='post_id').replace(np.nan, '', regex=True)
    # join the document and title values into one
    sampled_docs['text_features'] = sampled_docs[['title', 'doc']].apply(' '.join, axis=1)
    # return the relevant dataframe
    return sampled_docs[['post_id', 'class', 'text_features']]

In [45]:
# Labelled sample posts: one row per post with post_id, class and text_features.
docs = load_docs()

In [None]:
# Whitespace-tokenize every post. Posts whose class value is truthy are also
# collected separately as the positive set (as an independent list object).
all_tokenized = []
positive_tokenized = []
for i, post_id, cls, text_features in docs.itertuples():
    tokens = text_features.strip().split()
    all_tokenized.append(tokens)
    if cls:
        positive_tokenized.append(list(tokens))

In [47]:
# Stopword list as a set, used for membership tests when filtering tokens.
stopwords = load_stopwords('data/stopwords.txt')

In [None]:
# Drop stopwords from both corpora. The names are rebound to the filtered
# copies, so later cells only ever see the stopword-free token lists.
positive_tokenized = filter_stopwords(positive_tokenized, stopwords)
all_tokenized = filter_stopwords(all_tokenized, stopwords)

In [None]:
# Bigrams over each corpus flattened into a single token stream; because the
# documents are concatenated first, pairs spanning a document boundary are included.
# NOTE(review): neither pos_bgs nor cnt_bgs is used in the cells visible here --
# confirm whether this cell is still needed.
pos_bgs = nltk.bigrams([t for doc in positive_tokenized for t in doc])
cnt_bgs = nltk.bigrams([t for doc in all_tokenized for t in doc])

In [52]:
# Top bigram collocations within the positive-class documents only.
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_documents(positive_tokenized)
# Discard bigrams that occur fewer than 200 times before scoring.
bigram_finder.apply_freq_filter(200)
bigram_colls = collocations(
    bigram_finder,
    # The measure name was missing here (a syntax error); likelihood_ratio is
    # the measure the all-documents cell uses, keeping the two rankings comparable.
    bigram_measures.likelihood_ratio,
    max_count=50
)
print_collocations(bigram_colls)

via ifttt](http://ift.tt/1gysbzm	672030.753782
trend score	360626.184175
% trend	314893.953829
subscriber today	307070.27255
feel like	233904.507545
mildly trending	201009.119998
right now	153639.171609
year old	120942.480799
anyone else	118854.999615
trend nsfw	113755.062533
oink oink	95629.0204079
year ago	80161.1606728
high school	75492.4759511
even though	69280.0406925
last night	64875.1589085
x post	59417.5698779
first time	58770.9406145
just want	57200.927236
pretty much	52737.8900205
hodor hodor	52147.4206987
month ago	50853.0955312
united states	49798.6259698
san diego	49111.6157655
look like	47483.5965157
need help	45983.1391541
seem like	45331.9291669
@ newegg	43079.7074711
new york	42274.8145171
every day	39014.9136915
come back	37507.6360269
every time	36384.8964915
100 %	36227.3012295
hey guy	35903.8878248
anyone know	35752.5501148
get rid	35073.7745281
los angeles	34360.6776501
greatly appreciate	34313.1027664
go back	33431.1042318
make sure	33155.1451677
long time	32588.

In [None]:
# Top bigram collocations across every document (positive or not), scored
# with the likelihood-ratio measure.
bigram_measures = nltk.collocations.BigramAssocMeasures()
bigram_finder = BigramCollocationFinder.from_documents(all_tokenized)
# Discard bigrams that occur fewer than 200 times before scoring.
bigram_finder.apply_freq_filter(200)
bigram_colls = collocations(
    bigram_finder,
    bigram_measures.likelihood_ratio,
    # only the first 50 scored bigrams are kept
    max_count=50
)
print_collocations(bigram_colls)