# scrapers_ala_sinan.py (44 lines, 28 loc, 1.47 KB)
# forked from ajschumacher/gadsdc
# (scraped GitHub page chrome and line-number gutter removed so the
# file parses as Python)
###### begin stock scraper
import requests
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 API (bs4 would be `from bs4 import BeautifulSoup`)


def get_stock_price(ticker, base_url="http://www.marketwatch.com/investing/stock/"):
    """Scrape the last-traded price for `ticker` from MarketWatch.

    Returns the price as a float, or None when the expected
    <p class="data bgLast"> element is not on the page (unknown
    ticker, page-layout change, etc.).
    """
    r = requests.get(base_url + ticker)  # fetch the quote page
    soup = BeautifulSoup(r.text)  # note: bs3 call signature
    # The last price lives in <p class="data bgLast"> -- pattern found
    # by inspecting the page HTML.
    node = soup.find('p', attrs={'class': 'data bgLast'})
    if node is None:
        # Original code crashed with AttributeError here; return None
        # so callers can handle a missing quote explicitly.
        return None
    # Strip thousands separators so float() accepts prices like "1,234.56".
    return float(node.text.replace(',', ''))


stock_ticker = 'ge'
price = get_stock_price(stock_ticker)
###### end stock scraper
###### begin twitter scraper


def get_tweets(handle, base_url="https://twitter.com/"):
    """Scrape the tweets visible on a user's public profile page.

    Returns a list of dicts, each with keys 'tweet_contents' (tweet
    text) and 'tweet_date' (timestamp string from the link's `title`
    attribute). Relies on Twitter's ProfileTweet-* CSS classes; a
    markup change will yield an empty or partial list.
    """
    r = requests.get(base_url + handle)
    soup = BeautifulSoup(r.text)
    # Dates and contents are located separately and paired with zip() below.
    tweet_dates = soup.findAll(
        'a', attrs={'class': 'ProfileTweet-timestamp js-permalink js-nav js-tooltip'})
    dates = [t['title'] for t in tweet_dates]
    tweet_contents = soup.findAll('div', attrs={'class': 'ProfileTweet-contents'})
    contents = [t.findChild('p', attrs={'class': 'ProfileTweet-text js-tweet-text u-dir'}).text
                for t in tweet_contents]
    # zip truncates to the shorter list, pairing each tweet's text with
    # its timestamp in page order.
    return [{'tweet_contents': text, 'tweet_date': date}
            for text, date in zip(contents, dates)]


handle = 'planarrowspace'
# NOTE: despite the original name, this is a *list* of dicts; the name
# is kept for backward compatibility with the original script.
dictionary_of_tweets = get_tweets(handle)
# print() with a single argument behaves identically on Python 2 and 3,
# unlike the original Python-2-only `print x` statement.
print(dictionary_of_tweets)
###### end twitter scraper