''' Giphy Crawl -- Obtain GIFs from the Categories page
Author:
Justin Morgan
jjmorgan@ucla.edu
Repository:
github.com/jjmorgan/GiphyCrawl
Obtains gifs from the most popular tags in the Giphy database
- Crawling is used to obtain the categories and tags
- Giphy API is used to get the images
- Does not guarantee that images across categories are unique
'''
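# Usage sketch (illustrative; the output layout follows the code below):
#   python giphycrawl.py
# produces:
#   crawl/<category index>/<gif id>.gif   one directory of GIFs per category
#   crawl/taglist.txt                     tab-separated <image name> / <tags> lines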
import urllib2
import os.path
import json
import threading
import time
MAX_DL_PER_TAG = 10
MAX_THREADS = 6
PUBLIC_BETA_KEY = "dc6zaTOxFJmzC"
USER_AGENT = "Mozilla/5.0 (Windows NT 6.0; WOW64; rv:24.0) Gecko/20100101 Firefox/24.0"
''' Thread-safe print to console '''
def thread_print(msg, print_lock):
    # Hold the shared lock so output from concurrent threads does not interleave
    with print_lock:
        print msg
''' Get images from tags in a category '''
def crawl_category(tid, cid, category, tag_list_array, print_lock):
tid_str = "[" + str(tid) + "]"
    out_dir = os.path.join("crawl", str(cid))
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
thread_print(tid_str + " Begin retrieving images for " + category + " (" + str(cid) + ")", print_lock)
tags = get_tags(category)
for t in tags:
thread_print(tid_str + " Searching by tag: " + t, print_lock)
urls = get_urls_by_tag(t)
        for n, u, p in urls:
            download_image(n, u, out_dir)
            tag_list_array.append(get_tags_for_image(n, p))
            #thread_print(tid_str + " Downloaded: " + n, print_lock)
''' Save an image to the crawl directory '''
def download_image(name, url, todir):
path = os.path.join(todir, name)
    if not os.path.exists(path):
        # Download first so a failed request does not leave behind an
        # empty file that blocks future attempts
        response = urllib2.urlopen(url).read()
        with open(path, 'wb') as fout:
            fout.write(response)
''' Retrieve URLs to the original image and containing page '''
def get_urls_by_tag(tag):
urls = []
t = tag.replace(' ', '+')
limit = str(MAX_DL_PER_TAG)
try:
# api.giphy.com checks the User-Agent of the request object to ensure it's a valid browser
request = urllib2.Request("http://api.giphy.com/v1/gifs/search?q=" + t + "&limit=" + limit + "&api_key=" + PUBLIC_BETA_KEY)
request.add_header("User-Agent", USER_AGENT)
response = urllib2.urlopen(request).read()
    except urllib2.HTTPError, e:
        # Save the error page for debugging, then skip this tag
        with open("httperror.html", "w+") as f:
            f.write(e.fp.read())
        print "*** API call failed!"
        return []
result = json.loads(response)
images = result["data"]
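    # Abridged shape of the search response (an illustrative sketch; only
    # the fields read below are shown):
    #   { "data": [ { "id": "<gif id>",
    #                 "url": "<page url>",
    #                 "images": { "original": { "url": "<original gif url>" } } },
    #               ... ] }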
for g in images:
# (name, originalurl, pageurl)
urls.append( (g["id"] + ".gif", g["images"]["original"]["url"], g["url"]) )
return urls
''' Get the most popular tags associated with a category '''
def get_tags(category):
tags = []
response = urllib2.urlopen("http://giphy.com/categories/" + category).read()
# Tags are defined by the element:
# <a href="/search/<TAG>/">#<TAG></a>
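    # e.g. the hypothetical markup <a href="/search/funny-cat/">#funny cat</a>
    # yields the tag "funny cat"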
i = 0
while True:
start = response.find("/\">#", i)
if start == -1:
break
end = response.find("<", start)
tag = response[start+4:end]
tags.append(tag)
i = start + 1
return tags
''' Get category names from the categories page '''
def get_categories():
categories = []
response = urllib2.urlopen("http://giphy.com/categories").read()
# Categories are defined by the element:
# <a href="/categories/<CATEGORY>" class="tag">
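    # e.g. the hypothetical markup <a href="/categories/animals" class="tag">
    # yields the category "animals"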
i = 0
while True:
end = response.find("class=\"tag\"", i)
if end == -1:
break
start = response.rfind("/", i, end)
category = response[start+1:end-2]
categories.append(category)
i = end + 1
return categories
''' Get the user tags / keywords associated with a single image '''
def get_tags_for_image(name, url):
# Tags associated with an image are not given in the JSON response
# Only way to get them is to visit the image's page itself
response = urllib2.urlopen(url).read()
# Tags for an image are defined by the element:
# <meta name="keywords" content="tag, tag, tag, tag, ..., tag, GIF, Animated GIF">
# The last two tags are always "GIF, Animated GIF", so we leave these out
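    # e.g. content="cat, funny, cute, GIF, Animated GIF" yields "cat, funny, cute"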
    end = response.find(", GIF, Animated GIF\">")
    if end == -1:
        # Keywords element not found; return no tags for this image
        return (name, "")
    start = response.rfind("\"", 0, end)
    tags = response[start+1:end]
    return (name, tags)
''' Main routine -- Launches threads to crawl Giphy categories '''
def main():
if not os.path.exists("crawl"):
os.mkdir("crawl")
categories = get_categories()
print "Starting crawl for", len(categories), "categories..."
# assign each thread a single category to crawl
print_lock = threading.Lock()
threads = []
tag_list_array = []
c_index = 0
for i in range(min(MAX_THREADS, len(categories))):
t = threading.Thread(target=crawl_category, args=(i, c_index, categories[c_index], tag_list_array, print_lock))
threads.append((t, i))
t.start()
c_index += 1
# wait for threads to finish, and create new threads for remaining categories
while len(threads) > 0:
finished_threads = []
for t, tid in threads:
if not t.isAlive():
finished_threads.append((t, tid))
for f, tid in finished_threads:
f.join()
threads.remove((f, tid))
            if c_index < len(categories):
                t = threading.Thread(target=crawl_category, args=(tid, c_index, categories[c_index], tag_list_array, print_lock))
                threads.append((t, tid))
                t.start()
                c_index += 1
        # Sleep briefly so this loop does not busy-wait while threads run
        time.sleep(0.1)
    with open(os.path.join("crawl", "taglist.txt"), "w+") as tagf:
for n, t in tag_list_array:
tagf.write(n + '\t' + t + '\n')
print "Done."
''' Main entry point '''
if __name__ == '__main__':
main()