/
tweetsToJson.py
executable file
·163 lines (142 loc) · 5.72 KB
/
tweetsToJson.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#! /usr/bin/env python2
#
# usage: tweetsToJson input_file output_file
#
import json
import sys
import os, errno
import urllib
from urlparse import urlparse
from tweet import Tweet
def _resolveUrl(url):
    """Open url and return the address reported by the response.

    urllib.urlopen follows HTTP redirects, so the reported address is
    normally the final destination of a shortened link.  Returns None (after
    printing a diagnostic) when the URL cannot be opened.
    """
    try:
        response = urllib.urlopen(url)
    except IOError:
        print("Error urllib.urlopen")
        print("---> URL = {}".format(url))
        return None
    try:
        return response.url
    finally:
        # Release the connection instead of waiting for garbage collection.
        response.close()


def extractTweets(fileName):
    """Extract tweets from a JSON file into a list of Tweets.

    The file must hold a JSON object with a 'results' list; each entry is
    read for its id, author, text, source, recipient, creation date and its
    'entities' ('urls' and 'user_mentions').

    Every URL entity is opened over the network to resolve its final
    address; URLs that cannot be opened are reported on stdout and skipped.

    Args:
        fileName: path of the JSON file to read.

    Returns:
        A list of Tweet objects, one per entry of 'results'.

    Raises:
        SystemExit: if the file content is not valid JSON.
        IOError: if the file cannot be opened.
        KeyError: if an entry lacks one of the mandatory fields.
    """
    try:
        # The context manager closes the file even when decoding fails.
        with open(fileName) as jsonFile:
            data = json.load(jsonFile)
    except ValueError:
        # Only the JSON decoding is guarded, so the message stays accurate.
        sys.exit("Error while parsing {0}".format(fileName) + " Not a valid JSON file")

    print("Parsing "+fileName+" ...")
    tweetList = list()
    for tweet in data['results']:
        nTweet = Tweet()
        nTweet.id = tweet['id']
        nTweet.userId = tweet['from_user_id']
        nTweet.text = tweet['text']
        nTweet.user = tweet['from_user']
        nTweet.userName = tweet['from_user_name']
        nTweet.profileImgUrlHttp = tweet['profile_image_url']
        nTweet.source = tweet['source']
        nTweet.toUser = tweet['to_user']
        nTweet.date = tweet['created_at']

        entities = tweet['entities']
        # Both entity lists are optional; a missing one means "none".
        for urlEntity in entities.get('urls', []):
            resolved = _resolveUrl(urlEntity['expanded_url'])
            if resolved is not None:
                nTweet.urls.append(resolved)
        for mention in entities.get('user_mentions', []):
            nTweet.userMentions.append({"id":mention["id"],"name":mention["name"], "screenName":mention["screen_name"]})

        tweetList.append(nTweet)
    return tweetList
def printTweets(tweetList):
    """Write a human-readable dump of tweetList to stdout.

    For each tweet: a separator line, the author name and id, the message
    text, then one line per entry of tweet.userMentions (the whole mention
    record is printed, not just its id).  A final line reports the number of
    tweets.
    """
    separator = "-------------------\n"
    for entry in tweetList:
        print(separator)
        header = "From : {0} ID : {1}".format(entry.userName, entry.userId)
        print(header)
        print("Msg : " + entry.text + "\n")
        for mentioned in entry.userMentions:
            print("Mentioned User ID : {0}".format(mentioned))
    total = len(tweetList)
    print("There are {0} tweets".format(total))
def findNodeFromUid(gexf, uid):
    """Return the first node of gexf's first graph whose first attribute
    value equals uid (both compared as strings), or None if there is none.
    """
    candidates = gexf.graphs[0].nodes.values()
    matching = (
        candidate
        for candidate in candidates
        if str(candidate.attributes[0]['value']) == str(uid)
    )
    return next(matching, None)
def clearDuplicates(tweetList, tmpList):
    """Drop from tmpList every tweet whose id already appears in tweetList.

    tmpList is filtered in place (callers holding a reference see the
    result) and is also returned.  The relative order of the remaining
    tweets is preserved.

    Args:
        tweetList: tweets already collected; only their .id is read.
        tmpList: freshly parsed tweets to filter.

    Returns:
        tmpList, now holding only tweets whose id is absent from tweetList.
    """
    knownIds = set(tweet.id for tweet in tweetList)
    # Build the filtered list first, then swap the content in one step:
    # removing items from a list while iterating over it skips the element
    # that follows each removal.
    tmpList[:] = [tweet for tweet in tmpList if tweet.id not in knownIds]
    return tmpList
def compute_user_score(user, tweetList):
    """Count the mentions of `user` (matched on the mention's "name") found
    in tweets whose userName is not `user`.
    """
    score = 0
    for tweet in tweetList:
        if tweet.userName != user:
            # Self-mentions are excluded by the author check above.
            score += sum(
                1 for mention in tweet.userMentions if mention["name"] == user
            )
    return score
def compute_site_score(url, tweetList):
    """Count how many entries across all tweets' `urls` are equal to `url`."""
    return sum(
        1
        for tweet in tweetList
        for candidate in tweet.urls
        if candidate == url
    )
def get_articles(hostname, tweetList):
    """Collect, in first-seen order and without repeats, every URL found in
    the tweets whose parsed hostname equals `hostname`.
    """
    articles = list()
    for tweet in tweetList:
        for link in tweet.urls:
            if hostname != urlparse(link).hostname:
                continue
            if link in articles:
                continue
            articles.append(link)
    return articles
def writeJson(tweetList, outputFile):
    """Build a node/edge graph from tweetList and write it to outputFile as JSON.

    Nodes (one per distinct name, first occurrence wins):
      - the author of each tweet,
      - the hostname of each URL a tweet links to,
      - each user mentioned in a tweet.
    Edges (one per occurrence):
      - author -> hostname for every URL,
      - author -> mentioned user for every mention.
    Edges are sorted on their "date" value before being written.

    Args:
        tweetList: tweets to export; reads userName, user, profileImgUrlHttp,
            urls, userMentions, date and text.
        outputFile: path of the JSON file to (over)write.
    """
    nodeList = list()
    edgesList = list()
    # Names that already have a node; a set keeps the "seen before?" test
    # constant-time instead of rescanning nodeList for every tweet.
    knownNames = set()

    for tweet in tweetList:
        if tweet.userName not in knownNames:
            knownNames.add(tweet.userName)
            nodeList.append({"userId":tweet.user, "nodeName":tweet.userName, "avatar": tweet.profileImgUrlHttp, "nodeScore":compute_user_score(tweet.userName, tweetList), "webSite":False})

        for url in tweet.urls:
            url_hostname = urlparse(url).hostname
            if url_hostname not in knownNames:
                knownNames.add(url_hostname)
                # NOTE(review): this node uses the key "website" while user
                # nodes use "webSite"; kept as-is because the consumer of the
                # JSON is not visible here - confirm which spelling it reads.
                # NOTE(review): the score counts only exact matches of the
                # first URL seen for this host, not every URL of the host.
                nodeList.append({"nodeName":url_hostname, "nodeScore":compute_site_score(url,tweetList),"website":True, "articles": get_articles(url_hostname,tweetList)})
            edgesList.append({"source":tweet.userName,"destination":url_hostname, "url":url ,"date": tweet.date})

        for mention in tweet.userMentions:
            if mention["name"] not in knownNames:
                knownNames.add(mention["name"])
                nodeList.append({"userId":mention["screenName"] ,"nodeName":mention["name"],"nodeScore":compute_user_score(mention["name"],tweetList), "webSite":False})
            edgesList.append({"source":tweet.userName,"destination":mention["name"], "date":tweet.date, "content":tweet.text})

    # NOTE(review): dates are compared as stored; if they are text such as
    # "Mon, 12 Mar 2012 ..." this is not chronological order - TODO confirm.
    edgesList.sort(key=lambda r: r["date"])
    dic = {"nodes":nodeList,"edges": edgesList}
    # Close (and therefore flush) the output file deterministically.
    with open(outputFile,"w") as out:
        json.dump(dic, out)
# Script entry: convert one JSON file, or every file of a directory, into a
# single node/edge JSON graph.
if (len(sys.argv) < 3):
    print( """
usage: tweetsToJson input output_file
- input: input file or directory
- output: output file
""")
    sys.exit()

inFile = sys.argv[1]
outFile = sys.argv[2]
tlist = list()

# if the input is a directory, parse each file from it
if (os.path.isdir(inFile)):
    jsonFiles = os.listdir(inFile)
    print("Parsing files from "+inFile+"/ ...")
    for cpt, name in enumerate(jsonFiles):
        tmpList = extractTweets(inFile + "/" + name)
        # Keep only the tweets that were not collected from earlier files.
        if tlist:
            tmpList = clearDuplicates(tlist, tmpList)
        tlist.extend(tmpList)
        # Periodic checkpoint so a long run leaves a usable partial output.
        if (cpt % 10 == 0 and cpt > 0):
            writeJson(tlist,outFile)
        # Progress is files handled over files found.  The float literal
        # avoids Python 2 integer division, and dividing by the file count
        # (not the tweet count) cannot hit zero inside this loop.
        x = 100.0 * (cpt + 1) / len(jsonFiles)
        print("%.2f%% done" % x)
else:
    tlist = extractTweets(inFile)

print("{0} tweets parsed".format(len(tlist)))
writeJson(tlist, outFile)