# Scrape tweets from available caches

Once we've found the available archives, we need to scrape the HTML of those pages for the tweet content. We'll use the `BeautifulSoup` library to do that.

Since the caches were taken at different times, the structure of the HTML may have changed. We'll need to write code that can handle parsing these different formats.

In [2]:
import urllib
from bs4 import BeautifulSoup
import csv
import requests

In [3]:
# Empty list created up front. NOTE(review): none of the cells visible here read
# or append to it (the scraping cell fills `tweet_arr`) — confirm it is still needed.
newTweetArr: list = []

## Testing scraping code

We've found two versions of the Twitter user pages in the caches: one from ~2015, and one used around ~2016-2017. We'll use one Twitter page of each version to test our parsing code, ensuring we get the data we want from each.


In [4]:
# testing for new version
#
# In this layout every tweet is an <li data-item-type="tweet"> element that
# wraps a <div class="tweet"> carrying the data-* metadata attributes.

url = "http://web.archive.org/web/20150603004258/https://twitter.com/AlwaysHungryBae"
page = requests.get(url).text
soup = BeautifulSoup(page, 'html.parser')

tweets = soup.find_all('li', attrs={'data-item-type': 'tweet'})

for t in tweets:
    tweet_container = t.find('div', attrs={'class': 'tweet'})
    tweet_content = tweet_container.find('p', attrs={'class': 'tweet-text'})
    tweet_time = tweet_container.find('span', attrs={'class': '_timestamp'})

    # Build the record in one literal; the key order matches the printed output.
    tweet_obj = {
        'tweet_id': t.get("data-item-id"),
        'screen_name': tweet_container.get('data-screen-name'),
        'permalink': tweet_container.get('data-permalink-path'),
        'tweet_text': tweet_content.text,
        'user_id': tweet_container.get('data-user-id'),
        'timestamp': tweet_time.get('data-time-ms'),
        'hashtags': [
            {'tag': ht.find('b').text, 'archived_url': ht.get('href')}
            for ht in tweet_container.find_all('a', attrs={'class': 'twitter-hashtag'})
        ],
        'links': [
            {
                # Prefer the expanded URL, then the large resolved media URL,
                # and fall back to the anchor's visible text.
                'url': (
                    li.get('data-expanded-url')
                    or li.get('data-resolved-url-large')
                    or li.text
                ),
                'archived_url': li.get('href'),
            }
            for li in tweet_container.find_all('a', attrs={'class': 'twitter-timeline-link'})
        ],
    }

    print(tweet_obj)

{'tweet_id': '561931644785811457', 'screen_name': 'AlwaysHungryBae', 'permalink': '/AlwaysHungryBae/status/561931644785811457', 'tweet_text': 'Happy Super Bowl Sunday \n#superbowlfood pic.twitter.com/s6rwMtdLom', 'user_id': '2882130846', 'timestamp': '1422809918000', 'hashtags': [{'tag': 'superbowlfood', 'archived_url': '/web/20150603004258/https://twitter.com/hashtag/superbowlfood?src=hash'}], 'links': [{'url': 'pic.twitter.com/s6rwMtdLom', 'archived_url': 'http://web.archive.org/web/20150603004258/http://t.co/s6rwMtdLom'}, {'url': 'https://pbs.twimg.com/media/B8xh2fFCQAE-vxU.jpg:large', 'archived_url': '//web.archive.org/web/20150603004258/https://twitter.com/AlwaysHungryBae/status/561931644785811457/photo/1'}]}
{'tweet_id': '561917739108155392', 'screen_name': 'BMoreBirdsNest', 'permalink': '/BMoreBirdsNest/status/561917739108155392', 'tweet_text': 'Making the award-winning Pigs in a Pillow again today! Bacon, goat cheese, jam, glazed donut as bread. #superbowlfood pic.twitter.com/1

In [57]:

# testing for old version
#
# In this layout every tweet is a <div data-item-type="tweet">; the first <div>
# nested inside it is the element whose data-* attributes are read below.

url = "http://web.archive.org/web/20140615165707/https://twitter.com/NikaFast"
page = requests.get(url).text
soup = BeautifulSoup(page, 'html.parser')

tweets = soup.find_all('div', attrs={'data-item-type': 'tweet'})

for t in tweets:
    tweet_container = t.find('div')
    tweet_content = tweet_container.find('p', attrs={'class': 'ProfileTweet-text'}).text
    print("--------------------")
    # Each link: the archived href first, then the text shown in the tweet.
    for link in tweet_container.find_all('a', attrs={'class': 'twitter-timeline-link'}):
        print(link.get('href'))
        print(link.text)
    print(tweet_content)
    print(tweet_container.find('a', attrs={'class': 'js-permalink'}).get('href'))
    print(tweet_container.get('data-screen-name'))
    print(tweet_container.get('data-tweet-id'))
    print(tweet_container.get('data-user-id'))
    # Each hashtag: the archived href first, then the tag text held in its <b> child.
    for hashtag in tweet_container.find_all('a', attrs={'class': 'twitter-hashtag'}):
        print(hashtag.get('href'))
        print(hashtag.find('b').text)
    print("--------------------")

--------------------
http://web.archive.org/web/20140615165707/http://t.co/SB2S8EmxKD
http://bit.ly/1lCXL1d 
Ну, это больше похоже на шутку. Хотя... ЕС - марионетка США, вредящая странам участницам. http://bit.ly/1lCXL1d 
/web/20140615165707/https://twitter.com/NikaFast/status/478219206542901249
NikaFast
478219206542901249
772081711
--------------------
--------------------
http://web.archive.org/web/20140615165707/http://t.co/BVbQTAcjPH
http://bit.ly/1p4F7ka 
Переговоры велись в трёхстороннем режиме:Россия-Украина-ЕС. http://bit.ly/1p4F7ka  #ЕС
/web/20140615165707/https://twitter.com/NikaFast/status/478180966305763329
NikaFast
478180966305763329
772081711
/web/20140615165707/https://twitter.com/hashtag/%D0%95%D0%A1?src=hash
ЕС
--------------------
--------------------
http://web.archive.org/web/20140615165707/http://t.co/JFEtsuKzMB
http://bit.ly/1hWT56S 
Вы читали это? ЕС забыли о своих принципах http://bit.ly/1hWT56S 
/web/20140615165707/https://twitter.com/NikaFast/status/4781540558

## Scraping Tweets

Now that our parsing code can handle either HTML format, let's scrape all the tweets we can find from the available archives:

In [1]:
# Scrape every archived page listed in avail_urls.txt (one URL per line) and
# collect one dict per tweet in `tweet_arr`.
#
# Two page layouts are handled. The older one (<div data-item-type="tweet">) is
# tried first; the newer one (<li data-item-type="tweet">) is used only when the
# older selector matches nothing on the page.


def _parse_hashtags(container):
    """Return a {'tag', 'archived_url'} dict for each hashtag anchor in *container*."""
    return [
        {'tag': anchor.find('b').text, 'archived_url': anchor.get('href')}
        for anchor in container.find_all('a', attrs={'class': 'twitter-hashtag'})
    ]


def _parse_old_layout_tweet(item):
    """Build a tweet dict from one <div data-item-type="tweet"> element.

    Raises:
        AttributeError: if an expected child element is missing, because
            ``find`` returned ``None``.
    """
    container = item.find('div')
    tweet_obj = {}
    tweet_obj['tweet_text'] = container.find('p', attrs={'class': 'ProfileTweet-text'}).text
    tweet_obj['permalink'] = container.find('a', attrs={'class': 'js-permalink'}).get('href')
    tweet_obj['screen_name'] = container.get('data-screen-name')
    tweet_obj['tweet_id'] = container.get('data-tweet-id')
    tweet_obj['user_id'] = container.get('data-user-id')
    tweet_obj['links'] = [
        {'archived_url': anchor.get('href'), 'url': anchor.text}
        for anchor in container.find_all('a', attrs={'class': 'twitter-timeline-link'})
    ]
    tweet_obj['hashtags'] = _parse_hashtags(container)
    return tweet_obj


def _parse_new_layout_tweet(item):
    """Build a tweet dict from one <li data-item-type="tweet"> element.

    Raises:
        AttributeError: if an expected child element is missing, because
            ``find`` returned ``None``.
    """
    container = item.find('div', attrs={'class': 'tweet'})
    tweet_obj = {}
    tweet_obj['tweet_id'] = item.get("data-item-id")
    tweet_obj['screen_name'] = container.get('data-screen-name')
    tweet_obj['permalink'] = container.get('data-permalink-path')
    tweet_obj['tweet_text'] = container.find('p', attrs={'class': 'tweet-text'}).text
    tweet_obj['user_id'] = container.get('data-user-id')
    tweet_obj['timestamp'] = container.find('span', attrs={'class': '_timestamp'}).get('data-time-ms')
    tweet_obj['hashtags'] = _parse_hashtags(container)
    tweet_obj['links'] = [
        {
            # Prefer the expanded URL, then the large resolved media URL, and
            # fall back to the anchor's visible text.
            'url': (
                anchor.get('data-expanded-url')
                or anchor.get('data-resolved-url-large')
                or anchor.text
            ),
            'archived_url': anchor.get('href'),
        }
        for anchor in container.find_all('a', attrs={'class': 'twitter-timeline-link'})
    ]
    return tweet_obj


tweet_arr = []
with open('avail_urls.txt') as f:
    for line in f:
        # Lines read from the file keep their trailing newline; strip it so it
        # is neither sent as part of the URL nor echoed in the messages below.
        archive_url = line.strip()
        if not archive_url:
            continue

        try:
            # The timeout keeps one unresponsive snapshot from stalling the run.
            page = requests.get(archive_url, timeout=30).text
        except requests.RequestException as err:
            # Best-effort scrape: report the failure and move on to the next URL.
            print("REQUEST FAILED FOR " + archive_url + ": " + str(err))
            continue
        soup = BeautifulSoup(page, 'html.parser')

        tweets = soup.find_all('div', attrs={'data-item-type': 'tweet'})
        parse_tweet = _parse_old_layout_tweet
        if not tweets:
            # newer html version
            tweets = soup.find_all('li', attrs={'data-item-type': 'tweet'})
            parse_tweet = _parse_new_layout_tweet
        if not tweets:
            print("NO TWEETS FOR " + archive_url)
            continue

        count_before = len(tweet_arr)
        for t in tweets:
            # Handle errors per tweet so one malformed entry does not discard
            # the remaining tweets on the page.
            try:
                tweet_obj = parse_tweet(t)
            except AttributeError as err:
                print("SKIPPED MALFORMED TWEET IN " + archive_url + ": " + str(err))
                continue
            tweet_arr.append(tweet_obj)
        print("processed " + str(len(tweet_arr) - count_before) + " tweets from " + archive_url)

In [82]:
# How many tweets did we collect in total? The bare expression is the last
# statement in the cell, so the notebook displays its value as the cell output.
len(tweet_arr)

1538

In [64]:
# Inspect a single scraped tweet (the 25th from the end of tweet_arr) to check
# which fields were captured.
tweet_arr[-25]

{'hashtags': [{'archived_url': '/web/20150115191946/https://twitter.com/hashtag/DeadHorse?src=hash',
   'tag': 'DeadHorse'},
  {'archived_url': '/web/20150115191946/https://twitter.com/hashtag/ColumbianChemicals?src=hash',
   'tag': 'ColumbianChemicals'}],
 'links': [],
 'permalink': '/web/20150115191946/https://twitter.com/Rrrebbecaa/status/510204835032211456',
 'screen_name': 'Rrrebbecaa',
 'tweet_id': '510204835032211456',
 'tweet_text': 'Guys keep it cool. A Darwin award for each of them! #DeadHorse #ColumbianChemicals',
 'user_id': '2753274783'}

In [83]:
# Write the collected tweets to disk as pretty-printed JSON with sorted keys.
import json

# ensure_ascii=False emits non-ASCII tweet text (e.g. Cyrillic) as raw
# characters, so the file is opened explicitly as UTF-8; the platform's default
# encoding may not be able to represent those characters.
with open('./data/tweets_full.json', 'w', encoding='utf-8') as f:
    json.dump(tweet_arr, f, ensure_ascii=False, sort_keys=True, indent=4)