Find file
Fetching contributors…
Cannot retrieve contributors at this time
74 lines (57 sloc) 2.43 KB
import argparse
import json
import nltk
from nltk.collocations import BigramCollocationFinder, TrigramCollocationFinder
from nltk.corpus import stopwords
#import extract_clean_tweet_content
# English stopword list from the NLTK corpus, held as a set for fast
# membership tests when filtering terms.
en_stopwords = set(stopwords.words('english'))
def tweet_as_terms(tweets):
    """Split a block of tweet text into lowercase terms without stopwords.

    The text is lowercased, every '-' and '"' is replaced by a space, the
    result is split on whitespace, and any term found in `en_stopwords` is
    dropped. Returns the remaining terms as a list, in their original order.
    """
    normalised = tweets.lower()
    for separator in ('-', '"'):
        normalised = normalised.replace(separator, ' ')

    terms = []
    for token in normalised.split():
        if token in en_stopwords:
            continue
        terms.append(token)
    return terms
def extract_top_collocations(json_cleaned, return_top_n=10, use_trigrams=False, min_freq=3):
    """Return the highest-PMI collocations found in a set of tweets.

    Args:
        json_cleaned: iterable of items; the 'tweet' value of each item is
            read as the tweet text.
        return_top_n: number of n-grams to return.
        use_trigrams: score trigrams when true, bigrams otherwise.
        min_freq: n-grams occurring fewer than this many times are removed
            before scoring (PMI otherwise favours n-grams seen only once).

    Returns:
        The result of the finder's nbest() call: the `return_top_n` n-grams
        ranked by pointwise mutual information.
    """
    # Choose the association measures and the finder class together so they
    # always agree on the n-gram size. Previously the bigram assignments ran
    # unconditionally and overwrote the trigram ones, so use_trigrams=True
    # silently produced bigrams.
    if use_trigrams:
        measures = nltk.collocations.TrigramAssocMeasures()
        finder_cls = TrigramCollocationFinder
    else:
        measures = nltk.collocations.BigramAssocMeasures()
        finder_cls = BigramCollocationFinder

    # NOTE: all tweets are joined into one text before tokenising, so an
    # n-gram can span the boundary between two consecutive tweets.
    tweets = "\n".join(item['tweet'] for item in json_cleaned)
    tweets_split = tweet_as_terms(tweets)

    finder = finder_cls.from_words(tweets_split)
    # only n-grams that appear min_freq+ times (3+ by default)
    finder.apply_freq_filter(min_freq)

    # return the return_top_n n-grams with the highest PMI
    return finder.nbest(measures.pmi, return_top_n)
if __name__ == "__main__":
    # Command-line entry point: load tweets from a JSON-lines file and/or a
    # plain-text file, then print the top bigram and trigram collocations.
    parser = argparse.ArgumentParser(description='Extract information from streaming tweet set')
    parser.add_argument('--json-cleaned', help='Cleaned input json')
    parser.add_argument('--plain-text', help='File of plain text')
    args = parser.parse_args()

    items = []
    if args.json_cleaned:
        # Assumes one JSON object per line, each carrying a 'tweet' key
        # (that key is what extract_top_collocations reads) - TODO confirm
        # against whatever produces this file.
        with open(args.json_cleaned) as json_file:
            for line in json_file:
                line = line.strip()
                if line:  # tolerate blank lines rather than fail in json.loads
                    items.append(json.loads(line))
    if args.plain_text:
        # Every line of the plain-text file is treated as one tweet.
        with open(args.plain_text) as text_file:
            for line in text_file:
                items.append({'tweet': line.strip()})

    # the following is a hacky way of cleaning a tweet but the results do not
    # markedly improve, so I'm commenting this out (maybe to revisit)
    # force a cleaning of the tweet - ignore URL tweets, lowercase, clean some
    # unicode
    #for item in items:
        #cleaned_set = extract_clean_tweet_content.clean_tweets([json.dumps(item)])
        #cleaned_tweet = ""
        #if len(cleaned_set):
            #cleaned_tweet = cleaned_set.pop()
        #item['tweet'] = cleaned_tweet

    # Parenthesised single-argument print gives the same output on Python 2
    # and is also valid on Python 3.
    top_collocations = extract_top_collocations(items)
    print(top_collocations)
    top_collocations = extract_top_collocations(items, use_trigrams=True)
    print(top_collocations)