#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Build co-occurrence graphs of hashtags, @users and phrases from streamed tweets"""
import argparse
import json
import sys
import logging
from ttp import ttp
import maksim_utils
import networkx as nx
import matplotlib.pyplot as plt
import make_ngrams
import colloc_analysis
plt.__str__ # silly way to stop pylint error (use plt in IPython)
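# ttp (twitter-text-python) is used below to pull hashtags and @usernames out of
# tweet text; maksim_utils, make_ngrams and colloc_analysis are local helper
# modules providing the graph, n-gram and collocation utilities this script calls.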
# Usage:
# load datasets into memory, also output a text file for later parsing
# $ %run --json-raw /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pycon0.json /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pycon.json -o clean_pycon_withtweets.json
# $ %run --json-cleaned clean_pycon_withtweets.json --remove-nodes #pycon #python #pycon2013 @pycon --write-graphml pyconout.graphml --remove-hashtags-below 3 --remove-usernames-below 15 --remove-phrases-below 3
# pydata
# $ %run --json-raw /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pydata0.json /media/2ndDrive/data/streaming-twitter-data/pycon/tweets_pydata.json -o clean_pydata.json
# $ %run --json-cleaned clean_pydata.json --remove-nodes #pydata --remove-usernames-below 0 --remove-phrases-below 0 --write-graphml pydataout.graphml --draw-networkx
# brighton
# $ %run --json-raw /media/2ndDrive/data/streaming-twitter-data/tweets_snapshot4/tweets_brighton.json -o clean_brighton.json
# $ %run --json-cleaned clean_brighton.json --remove-nodes #brighton --draw-networkx --write-graphml brightonout.graphml
# london
# $ %run --json-raw /media/2ndDrive/data/streaming-twitter-data/tweets_snapshot4/tweets_london.json -o clean_london.json
# $ %run --json-cleaned clean_london.json --remove-nodes #london --remove-hashtags-below 80 --remove-usernames-below 100 --remove-phrases-below 80 --draw-networkx --write-graphml londonout.graphml
# london fashion week
# $ %run --json-raw /media/2ndDrive/data/streaming-twitter-data/tweets_all_lfw_london_fashion_week.json -o clean_londonfashionweek.json
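# The commands above are IPython %run invocations; the same flags also work with a
# plain interpreter, e.g. (script filename assumed here for illustration):
# $ python tweet_network_extractor.py --json-cleaned clean_pycon_withtweets.json --write-graphml pyconout.graphml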
LOG_FILE = "extractor.log"
logging.basicConfig(filename=LOG_FILE, level=logging.DEBUG)
def get_tweets(tweets):
    """Generator yielding entries from valid JSON lines"""
    for tweet in tweets:
        try:
            # load with json to validate
            tw = json.loads(tweet)
            yield tw
        except ValueError as err:
            logging.debug("Odd! We have a ValueError when json.loads(tweet): %r" % repr(err))
#def filter_http(tweets):
    #"""Ignore tweets with http links (can be useful to ignore spam)"""
    #for tweet in tweets:
        #try:
            #if 'http' not in tweet['text']:
                #yield tweet
        #except KeyError as err:
            #logging.debug("Odd! We have a KeyError: %r" % repr(err))
def get_tweet_body(tweets):
    """Get tweets, ignoring ReTweets"""
    for tweet in tweets:
        try:
            if 'text' in tweet:
                if not tweet['text'].startswith('RT'):
                    yield tweet
        except KeyError as err:
            logging.debug("Odd! We have a KeyError: %r" % repr(err))
def get_useful_information(tweet_parser, tweets):
    """Extract the set of useful information about each tweet that we want to graph"""
    for tweet in tweets:
        text = tweet['text']
        # replace newlines with nothing
        text = text.replace('\r', '')
        text = text.replace('\n', '')
        screen_name = tweet['user']['screen_name'].lower()
        result = tweet_parser.parse(text)
        hashtags = [tag.lower() for tag in result.tags]
        users = [user.lower() for user in result.users]
        items = {'hashtags': ['#' + h for h in hashtags],
                 'tweet': text,
                 'screen_name': screen_name,
                 'users': ['@' + usr for usr in users]}
        yield items
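# For illustration, one item yielded above might look like (invented data):
# {'hashtags': ['#pycon'], 'tweet': 'Sprinting on the parser at #pycon with @someuser',
#  'screen_name': 'anotheruser', 'users': ['@someuser']}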
def files(file_list):
    """Yield lines from a list of input json data files"""
    for filename in file_list:
        with open(filename) as f:
            for line in f:
                yield line
def add_node(G, node_name):
    """Add a node to the graph, make a label, increase weight if seen before"""
    typ = 2  # default to type 2 (phrase)
    label = node_name
    if node_name.startswith('#'):
        typ = 0
        label = node_name[1:]
    if node_name.startswith('@'):
        typ = 1
        label = node_name[1:]
    if not G.has_node(node_name):
        G.add_node(node_name, label=label, type=typ, weight=-1)
    G.node[node_name]['weight'] += 1
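# Note: a new node starts at weight=-1 and is immediately incremented, so the
# stored weight is (number of occurrences - 1); the --remove-*-below thresholds
# in build_and_trim_network compare against this value.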
def build_and_trim_network(json_cleaned_lines, remove_nodes, remove_usernames_below, remove_hashtags_below, remove_phrases_below):
    """Build a co-occurrence graph of hashtags, usernames and phrases, then trim it"""
    items = json_cleaned_lines
    hashtag_net = nx.Graph()
    top_collocations = colloc_analysis.extract_top_collocations(items)
    for item in items:
        # combine hashtags and users into one list of things to pair up
        all_items = item['hashtags'] + item['users']
        # extract capitalised multi-word phrases
        word_sequences = make_ngrams.get_cleaned_capitalised_word_sequences(item['tweet'])
        for word_sequence in word_sequences:
            if len(word_sequence) > 1:
                capitalised_words = " ".join(word_sequence)
                capitalised_words = capitalised_words.lower()  # normalise e.g. Github GitHub GITHUB -> github
                all_items.append(capitalised_words)  # add capitalised phrase
        # extract frequent collocations
        tweet_cleaned_lowercased = " ".join(colloc_analysis.tweet_as_terms(item['tweet']))
        for top_collocation in top_collocations:
            tc = " ".join(top_collocation)
            if tc in tweet_cleaned_lowercased:
                all_items.append(tc)  # add collocation phrase
        # add nodes with a default weight
        for term in all_items:
            add_node(hashtag_net, term)
        # link every pair of co-occurring terms
        for t1 in all_items:
            for t2 in all_items:
                if t1 is not t2:
                    maksim_utils.add_or_inc_edge(hashtag_net, t1, t2)
    # remove under-mentioned nodes of each type
    for node in hashtag_net.nodes():
        if node.startswith('@'):
            if hashtag_net.node[node]['weight'] < remove_usernames_below:
                hashtag_net.remove_node(node)
        if node.startswith('#'):
            if hashtag_net.node[node]['weight'] < remove_hashtags_below:
                hashtag_net.remove_node(node)
        if not node.startswith('#') and not node.startswith('@'):
            # here if we have a phrase
            if hashtag_net.node[node]['weight'] < remove_phrases_below:
                hashtag_net.remove_node(node)
    # remove nodes that too many people might be connected to
    for removal in remove_nodes:
        try:
            hashtag_net.remove_node(removal)
        except nx.NetworkXError as err:
            logging.warning("Node %r not in the graph (error==%r)" % (removal, err))
    # remove singly connected nodes until none are left
    while True:
        nbr_of_nodes = hashtag_net.number_of_nodes()
        logging.info("Trimming, currently we have %d nodes" % (nbr_of_nodes))
        hashtag_net = maksim_utils.trim_degrees(hashtag_net)
        if hashtag_net.number_of_nodes() == nbr_of_nodes:
            break
    return hashtag_net
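# maksim_utils is assumed to behave roughly as follows: add_or_inc_edge adds an
# edge with weight 1 or increments the weight of an existing edge, and
# trim_degrees returns the graph with degree<=1 nodes removed. A minimal sketch
# of add_or_inc_edge under that assumption:
#   def add_or_inc_edge(G, t1, t2):
#       if G.has_edge(t1, t2):
#           G[t1][t2]['weight'] += 1
#       else:
#           G.add_edge(t1, t2, weight=1)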
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Extract information from streaming tweet set')
parser.add_argument('--json-raw', nargs="*", help='Input to analyse e.g. tweets.json')
parser.add_argument('--output', "-o", help="Output to write (else stdout) e.g. -o pycon.json")
parser.add_argument('--json-cleaned', help='Cleaned input json')
parser.add_argument('--remove-nodes', nargs="*", default=[], help='Remove named nodes e.g. "--remove-nodes #pycon @pycon"')
parser.add_argument('--draw-networkx', action="store_true", help='Draw the graph using networkX')
parser.add_argument('--write-graphml', help='Filename for graphml output')
parser.add_argument('--remove-usernames-below', type=int, default=50, help='Remove usernames who are mentioned less than n times e.g. "--remove-usernames-below 50"')
parser.add_argument('--remove-hashtags-below', type=int, default=2, help='Remove hashtags that are mentioned less than n times e.g. "--remove-hashtagss-below 2"')
parser.add_argument('--remove-phrases-below', type=int, default=10, help='Remove phrases (>1 word) that are mentioned less than n times e.g. "--remove-phrases-below 10"')
args = parser.parse_args()
    if args.json_raw:
        tweet_parser = ttp.Parser()
        # stream through a list of user-provided filenames
        all_json_lines = files(args.json_raw)
        tweets = get_tweets(all_json_lines)
        # get tweets (ignore rubbish from streaming api), extract useful info
        stream = get_tweet_body(tweets)
        stream = get_useful_information(tweet_parser, stream)
        if args.output:
            output = open(args.output, 'w')
        else:
            output = sys.stdout  # use stdout if no file specified
        for item in stream:
            outstr = json.dumps(item)
            output.write("%s\n" % (outstr))
        if args.output:
            output.close()  # don't close sys.stdout by mistake
    if args.json_cleaned:
        items = []
        for line in open(args.json_cleaned):
            items.append(json.loads(line))
        hashtag_net = build_and_trim_network(items, args.remove_nodes, args.remove_usernames_below, args.remove_hashtags_below, args.remove_phrases_below)
        if args.draw_networkx:
            # we can draw a network using networkx, optionally using graphviz
            # for improved layout
            graphviz = True
            try:
                import pygraphviz
                pygraphviz.release.version  # stupid statement to avoid pylint error
            except ImportError as err:
                graphviz = False
                logging.debug("pygraphviz not available: %r" % repr(err))
            if graphviz:
                logging.info("Drawing using GraphViz layout engine")
                nx.draw_graphviz(hashtag_net, edge_color="b")
            else:
                logging.info("Drawing using NetworkX layout engine")
                nx.draw(hashtag_net, edge_color="b")
        if args.write_graphml:
            nx.write_graphml(hashtag_net, open(args.write_graphml, "w"))