-
Notifications
You must be signed in to change notification settings - Fork 0
/
reader.py
executable file
·117 lines (106 loc) · 2.48 KB
/
reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from StringIO import StringIO
from zipfile import ZipFile
import urllib2
import gzip
import re
import json
import socket
import time
from config import *
from topic import *
from candidateFilter import *
from sentimentPMI import *
from helper import *
# Maximum number of attempts to open each remote archive before skipping it.
num_of_retries = 50


def _open_with_retries(address, attempts):
    """Open *address* with urllib2, retrying when a URLError is raised.

    Sleeps 5 seconds after every failed attempt.  Returns the file-like
    response object, or None when all *attempts* have failed.
    """
    for _ in range(attempts):
        try:
            response = urllib2.urlopen(address, timeout=50)
        except urllib2.URLError:
            time.sleep(5)
        else:
            print('Success open: ' + address)
            return response
    return None


def _iter_gzip_lines(gzipfile, source_name):
    """Yield the lines of *gzipfile*, stopping early on a read error.

    GzipFile decompresses on demand, so a payload that is not valid gzip
    data is reported as IOError while iterating, not when the object is
    constructed.  Only errors raised by the read itself are caught here;
    exceptions raised by the consumer's loop body are not.
    """
    try:
        for line in gzipfile:
            yield line
    except IOError:
        print('Error read: ' + source_name)


def _coerce_count(raw):
    """Return *raw* as an int when possible, otherwise return it unchanged.

    An empty string, None or a non-numeric value is passed through as-is
    so that one odd record cannot abort the whole run.
    """
    try:
        return int(raw)
    except (TypeError, ValueError):
        return raw


def _extract_tweet(jsonDict):
    """Return ``(location, text)`` for a record to keep, or None to skip it.

    A record is skipped when it has no user location, when the user's
    language is set to something other than 'en', when the preprocessed
    text is empty, or when aboutRomney() rejects the final text.  The
    returned text has the ``(favourites_count, retweet_count)`` tuple and a
    newline appended, which is the line format written to the output files.
    """
    try:
        user = jsonDict['user']
        location = user['location']
        lang = user['lang']
    except (KeyError, TypeError):
        # Not a tweet record (no usable 'user' object): nothing to keep.
        return None
    if not location:
        return None
    if lang != '' and lang != 'en':
        return None

    text = jsonDict.get('text', '')
    try:
        hashtags = jsonDict['entities']['hashtags']
    except (KeyError, TypeError):
        hashtags = []

    num_fav = _coerce_count(user.get('favourites_count', 0))
    num_retweet = _coerce_count(jsonDict.get('retweet_count', 0))

    # hashtagPreprocss and aboutRomney are project helpers that are not
    # defined in this file (presumably star-imported above -- confirm).
    text = hashtagPreprocss(text, hashtags)
    if text == '':
        return None
    text += str((num_fav, num_retweet)) + '\n'
    if not aboutRomney(text):
        return None
    return location, text


# One output file per state; matching tweets are appended as they are found.
tweetFL = open('tweetFL.txt', 'w')
tweetIA = open('tweetIA.txt', 'w')
tweetVA = open('tweetVA.txt', 'w')
tweetOH = open('tweetOH.txt', 'w')
tweetCO = open('tweetCO.txt', 'w')

try:
    # FL, IA, VA, OH and CO are not defined in this file; presumably they
    # are collections of location strings star-imported above -- confirm.
    state_outputs = (
        (FL, tweetFL),
        (IA, tweetIA),
        (VA, tweetVA),
        (OH, tweetOH),
        (CO, tweetCO),
    )

    for fileStr in fileStrs:
        # url is a file-like object, or None if every attempt failed.
        url = _open_with_retries(fileStr, num_of_retries)
        if url is None:
            print('Error open: ' + fileStr)
            continue

        tic = time.time()
        try:
            try:
                payload = url.read()
            finally:
                url.close()
            gzipfile = gzip.GzipFile(fileobj=StringIO(payload))
        except IOError:
            print('Error read: ' + fileStr)
            continue
        toc = time.time()
        # Seconds spent downloading the archive.
        print(toc - tic)

        for jsonStr in _iter_gzip_lines(gzipfile, fileStr):
            jsonStrClean = cleanJson(jsonStr)
            try:
                jsonDict = json.loads(jsonStrClean)
            except (TypeError, ValueError):
                # Unparseable line: skip it and carry on with the next one.
                continue

            tweet = _extract_tweet(jsonDict)
            if tweet is None:
                continue
            location, text = tweet

            # A location may belong to more than one state collection, so
            # every collection is checked instead of stopping at the first.
            for state_locations, out in state_outputs:
                if location in state_locations:
                    out.write(text.encode('utf8'))

        print('finished ' + str(fileStr))
finally:
    for out in (tweetFL, tweetIA, tweetVA, tweetOH, tweetCO):
        out.close()