-
Notifications
You must be signed in to change notification settings - Fork 0
/
redTweets.py
70 lines (57 loc) · 1.8 KB
/
redTweets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
#
# redTweets.py: Reduce file used by Hadoop
# Input: stock sentiments as ticker|sentiment pairs, keyed by time (10 minute periods)
# Output: time, ticker, aggregated sum of tweetShift, aggregated count of Tweets, and the text and score of the most
# significant Tweet for the period
import sys
from datetime import datetime
# Aggregates for the time period currently being reduced, keyed by ticker.
data = dict()
# Key of the time period being accumulated; empty until a first line is read.
current_time = ""
class Entry(object):
    """Running aggregate of the tweet scores seen for one ticker.

    Attributes:
        sum: Total of every score passed to ``update``.
        count: Number of times ``update`` has been called.
        hi_tweet: Text of the tweet with the largest absolute score so far
            ('' until the first update).
        hi_score: Score that belongs to ``hi_tweet`` (0.0 until the first
            update).
    """

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.hi_tweet = ''
        self.hi_score = 0.0

    def update(self, tweet, score):
        """Fold one scored tweet into the aggregate.

        Args:
            tweet: Text of the tweet.
            score: Numeric sentiment score of the tweet for this ticker.
        """
        self.sum += score
        self.count += 1
        # The first tweet always becomes the initial candidate. Without this,
        # an entry whose tweets all score 0.0 would report a non-zero count
        # together with an empty "most significant" tweet.
        # The strict comparison keeps the earliest tweet on ties.
        if self.count == 1 or abs(score) > abs(self.hi_score):
            self.hi_tweet = tweet
            self.hi_score = score
def output():
    """Emit one CSV line per ticker aggregated for the current time period.

    Reads the module-level ``data`` mapping (ticker -> aggregate entry) and
    ``current_time``, and writes one line per ticker to standard output in
    the form ``time,ticker,sum,count,hi_tweet,hi_score``. The sum and the
    score are formatted with four decimal places.

    A ticker whose entry cannot be formatted or written is reported on
    standard error and skipped, so the remaining tickers are still emitted.
    """
    for ticker in data:
        try:
            entry = data[ticker]
            record = '%s,%s,%0.4f,%d,%s,%0.4f' % (
                current_time, ticker, entry.sum, entry.count,
                entry.hi_tweet, entry.hi_score)
            # A plain stream write behaves the same under Python 2 and
            # Python 3, unlike the print statement.
            sys.stdout.write(record + '\n')
        except Exception as e:
            # Stay best-effort, but report the failure instead of hiding it,
            # and do not swallow KeyboardInterrupt/SystemExit.
            sys.stderr.write('%s\n' % (e,))
            continue
# Reducer main loop. Each input line is "<time>\t<tweet>,<ticker>|<score>,...".
# Iterating sys.stdin directly avoids rebinding the builtin name `input`.
for line in sys.stdin:
    try:
        fields = line.strip().split('\t')
        time = fields[0]
        vals = fields[1].strip().split(',')

        # Keys are in order so a new time means we can output the old time period
        # (this relies on the input arriving sorted by key -- verify against
        # the job configuration).
        if time != current_time:
            output()
            current_time = time
            data = {}

        # Current tweet
        tweet = vals[0]

        # Aggregate each ticker
        for pair in vals[1:]:
            parts = pair.split('|')
            ticker = parts[0]
            score = float(parts[1])
            if ticker not in data:
                data[ticker] = Entry()
            data[ticker].update(tweet, score)
    except Exception as e:
        # A malformed line is reported and skipped. Any ticker pairs parsed
        # before the failure have already been aggregated.
        # The stream write works under both Python 2 and Python 3.
        sys.stderr.write('%s\n' % (e,))
        continue

# Output the last time period
output()