# Scrape tweets from available caches

Once we've found the available archives we need to scrape the HTML of those pages for the tweet content.

We'll use the `lxml` library and XPath expressions to do that.

Since the caches were taken at different times, the structure of the HTML may have changed. We'll need to write code that can handle parsing these different formats.

In [697]:
import json
import requests
import lxml.html

In [698]:
def extract_hashtag(selector):
    d = {
        "tag": selector.xpath(".//b")[0].text,
        "archived_url": selector.get("href")
    }
    return d

In [699]:
def extract_hashtags(selector):
    hashtags_selector = selector.xpath(".//a[contains(@class, 'twitter-hashtag')]")
    hashtags = [extract_hashtag(hs) for hs in hashtags_selector]
    return hashtags

In [700]:
def extract_link(selector):
    expanded_url = selector.get("data-expanded-url")
    resolved_url = selector.get("data-resolved-url-large")
    if expanded_url:
        url = expanded_url
    elif resolved_url:
        url = resolved_url
    else:
        url = selector.text
        
    d = {
        "url": url,
        "archived_url": selector.get("href")
    }
    return d

In [701]:
def extract_links(selector):
    links_selector = selector.xpath(".//a[contains(@class, 'twitter-timeline-link')]")
    links = [extract_link(ls) for ls in links_selector]
    return links

## Older Twitter pages

We found two versions of the Twitter user page in the caches. The older one is from ~2015.

In [702]:
def extract_older_tweet(selector):
    try:
        container_selector = selector.xpath(".//div")[0]    
    except IndexError as e:
        return None
    
    tweet_id = container_selector.get("data-tweet-id")    
    screen_name = container_selector.get("data-screen-name")
    try:
        permalink = container_selector.xpath(".//a[contains(@class, 'js-permalink')]")[0].get("href")
    except IndexError as e:
        permalink = None
    user_id = container_selector.get("data-user-id")
    
    try:
        tweet_content = container_selector.xpath(".//p[contains(@class, 'ProfileTweet-text')]")[0]
    except IndexError as e:
        tweet_text = None
    else:
        tweet_text = tweet_content.text
    
    hashtags = extract_hashtags(container_selector)
    links = extract_links(container_selector)
    d = {
        "tweet_id": tweet_id,
        "screen_name": screen_name,
        "permalink": permalink,
        "user_id": user_id,
        "tweet_text": tweet_text,
        "hashtags": hashtags,
        "links": links,
    }
    return d

## Some tweets from the older Twitter user page

In [703]:
url = "http://web.archive.org/web/20150603004258/https://twitter.com/NikaFast"
req = requests.get(url)
req.status_code

200

In [704]:
root = lxml.html.fromstring(req.text)
root

<Element html at 0x7efcb82f25e8>

In [705]:
older_tweets_selector = root.xpath("//div[@data-item-type='tweet']")

In [706]:
older_tweets = [extract_older_tweet(s) for s in tweets_selector]
older_tweets[:3]

[{'tweet_id': '478219206542901249',
  'screen_name': 'NikaFast',
  'permalink': '/web/20140615165707/https://twitter.com/NikaFast/status/478219206542901249',
  'user_id': '772081711',
  'tweet_text': 'Ну, это больше похоже на шутку. Хотя... ЕС - марионетка США, вредящая странам участницам. ',
  'hashtags': [],
  'links': [{'url': 'http://bit.ly/1lCXL1d',
    'archived_url': 'http://web.archive.org/web/20140615165707/http://t.co/SB2S8EmxKD'}]},
 {'tweet_id': '478180966305763329',
  'screen_name': 'NikaFast',
  'permalink': '/web/20140615165707/https://twitter.com/NikaFast/status/478180966305763329',
  'user_id': '772081711',
  'tweet_text': 'Переговоры велись в трёхстороннем режиме:Россия-Украина-ЕС. ',
  'hashtags': [{'tag': 'ЕС',
    'archived_url': '/web/20140615165707/https://twitter.com/hashtag/%D0%95%D0%A1?src=hash'}],
  'links': [{'url': 'http://bit.ly/1p4F7ka',
    'archived_url': 'http://web.archive.org/web/20140615165707/http://t.co/BVbQTAcjPH'}]},
 {'tweet_id': '4781540558400

##  Newer Twitter user page

The other version of Twitter user page we found in the Internet Archive cache was used around ~2016-2017.

In [716]:
def extract_newer_tweet(selector):
    try:
        container_selector = selector.xpath("./div[contains(@class, 'tweet')]")[0]
    except IndexError as e:
        return None
    
    tweet_id = selector.get("data-item-id")
    screen_name = container_selector.get("data-screen-name")
    permalink = container_selector.get("data-permalink-path")
    user_id = container_selector.get("data-user-id")
    try:
        span_timestamp = container_selector.xpath(".//span[contains(@class, '_timestamp')]")[0]
    except IndexError as e:
        timestamp_ms = None   
    else:
        timestamp_ms = span_timestamp.get("data-time-ms")
    
    try:
        tweet_content = container_selector.xpath(".//p[contains(@class, 'tweet-text')]")[0]
    except IndexError as e:
        tweet_text = None
    else:
        tweet_text = tweet_content.text
    
    hashtags = extract_hashtags(container_selector)
    links = extract_links(container_selector)
    
    d = {
        "tweet_id": tweet_id,
        "screen_name": screen_name,
        "permalink": permalink,
        "user_id": user_id,
        "tweet_text": tweet_text,
        "timestamp_ms": timestamp_ms,
        "hashtags": hashtags,
        "links": links
    }
    return d

## Some tweets from the newer Twitter user page

In [708]:
url = "http://web.archive.org/web/20150603004258/https://twitter.com/AlwaysHungryBae"
req = requests.get(url)
req.status_code

200

In [709]:
root = lxml.html.fromstring(req.text)
root

<Element html at 0x7efcb82f2188>

In [710]:
newer_tweets_selector = root.xpath("//li[@data-item-type='tweet']")

In [711]:
newer_tweets = [extract_newer_tweet(s) for s in newer_tweets_selector]
newer_tweets[:3]

[{'tweet_id': '561931644785811457',
  'screen_name': 'AlwaysHungryBae',
  'permalink': '/AlwaysHungryBae/status/561931644785811457',
  'user_id': '2882130846',
  'tweet_text': 'Happy Super Bowl Sunday \n',
  'timestamp_ms': '1422809918000',
  'hashtags': [{'tag': 'superbowlfood',
    'archived_url': '/web/20150603004258/https://twitter.com/hashtag/superbowlfood?src=hash'}],
  'links': [{'url': 'pic.twitter.com/s6rwMtdLom',
    'archived_url': 'http://web.archive.org/web/20150603004258/http://t.co/s6rwMtdLom'},
   {'url': 'https://pbs.twimg.com/media/B8xh2fFCQAE-vxU.jpg:large',
    'archived_url': '//web.archive.org/web/20150603004258/https://twitter.com/AlwaysHungryBae/status/561931644785811457/photo/1'}]},
 {'tweet_id': '561917739108155392',
  'screen_name': 'BMoreBirdsNest',
  'permalink': '/BMoreBirdsNest/status/561917739108155392',
  'user_id': '61483830',
  'tweet_text': 'Making the award-winning Pigs in a Pillow again today! Bacon, goat cheese, jam, glazed donut as bread. ',
  't

## Scraping Tweets

Now that our parsing code can handle either HTML format let's scrape all the tweets we can find from the available archives:

In [717]:
%%time
all_tweets = []
with open("data/avail_urls.txt") as f:
    for url in f:
        req = requests.get(url)
        root = lxml.html.fromstring(req.text)
        tweets = []
        
        older_tweets_selector = root.xpath("//div[@data-item-type='tweet']")
        tweets = [extract_older_tweet(s) for s in older_tweets_selector if s is not None]
        
        newer_tweets_selector = root.xpath("//li[@data-item-type='tweet']")
        tweets = [extract_newer_tweet(s) for s in newer_tweets_selector if s is not None]
        
        if not tweets:
            print("No tweets for {}".format(url))
        else:
            print("Found {} tweet/s for {}".format(len(tweets), url))
            all_tweets.extend(tweets)

Found 21 tweet/s for http://web.archive.org/web/20171003154430/https://twitter.com/4ever1937

Found 20 tweet/s for http://web.archive.org/web/20160208193830/https://twitter.com/4MySquad

Found 19 tweet/s for http://web.archive.org/web/20150814102034/https://twitter.com/_SherylGilbert

Found 20 tweet/s for http://web.archive.org/web/20170218070727/https://twitter.com/acejinev

Found 20 tweet/s for http://web.archive.org/web/20170121124009/https://twitter.com/Aldrich420

Found 19 tweet/s for http://web.archive.org/web/20150603004258/https://twitter.com/AlwaysHungryBae

Found 20 tweet/s for http://web.archive.org/web/20170323143223/https://twitter.com/ameliebaldwin

Found 20 tweet/s for http://web.archive.org/web/20150603005024/https://twitter.com/AnnRussela/

No tweets for http://web.archive.org/web/20100329223044/http://twitter.com:80/anzgri

Found 19 tweet/s for http://web.archive.org/web/20170425084324/https://twitter.com/atlanta_online

Found 21 tweet/s for http://web.archive.org/web

Found 20 tweet/s for http://web.archive.org/web/20170718121946/https://twitter.com/SpecialAffair

Found 19 tweet/s for http://web.archive.org/web/20160609011646/https://twitter.com/StLouisOnline

Found 21 tweet/s for http://web.archive.org/web/20170818065026/https://twitter.com/TEN_GOP

Found 21 tweet/s for http://web.archive.org/web/20170901204311/https://twitter.com/TheFoundingSon

Found 21 tweet/s for http://web.archive.org/web/20151125170757/https://twitter.com/todaycleveland

Found 20 tweet/s for http://web.archive.org/web/20170305061124/https://twitter.com/todayinsyria

Found 19 tweet/s for http://web.archive.org/web/20151230073807/https://twitter.com/TodayMiami

Found 20 tweet/s for http://web.archive.org/web/20151130095931/https://twitter.com/todaypittsburgh

Found 20 tweet/s for http://web.archive.org/web/20170221154320/https://twitter.com/tpartynews

Found 21 tweet/s for http://web.archive.org/web/20170718070114/https://twitter.com/TrayneshaCole

Found 20 tweet/s for http://w

In [718]:
len(all_tweets)

1904

In [730]:
valid_tweets = [t for t in all_tweets if (t is not None and t["tweet_text"] is not None)]
len(valid_tweets)

1463

In [731]:
invalid_tweets = [t for t in all_tweets if (t is None or t["tweet_text"] is None)]
len(invalid_tweets)

441

In [732]:
valid_tweets[0]

{'tweet_id': '915184813522505728',
 'screen_name': '4ever1937',
 'permalink': '/4ever1937/status/915184813522505728',
 'user_id': '748870305280647168',
 'tweet_text': 'Маразм крепчал',
 'timestamp_ms': '1507032037000',
 'hashtags': [],
 'links': [{'url': 'pic.twitter.com/tqhdHePcsc',
   'archived_url': 'http://web.archive.org/web/20171003154430/https://t.co/tqhdHePcsc'}]}

In [734]:
invalid_tweets[:3]

[{'tweet_id': '848170166181666816',
  'screen_name': '4ever1937',
  'permalink': '/4ever1937/status/848170166181666816',
  'user_id': '748870305280647168',
  'tweet_text': None,
  'timestamp_ms': '1491054500000',
  'hashtags': [],
  'links': [{'url': 'pic.twitter.com/ya2H0K0aQZ',
    'archived_url': 'http://web.archive.org/web/20171003154430/https://t.co/ya2H0K0aQZ'}]},
 None,
 {'tweet_id': '695623657410977792',
  'screen_name': '4MySquad',
  'permalink': '/4MySquad/status/695623657410977792',
  'user_id': '4036537452',
  'tweet_text': None,
  'timestamp_ms': '1454684578000',
  'hashtags': [{'tag': 'BlackTwitter',
    'archived_url': '/web/20160208193830/https://twitter.com/hashtag/BlackTwitter?src=hash'}],
  'links': [{'url': 'pic.twitter.com/7hqbJ2nC7G',
    'archived_url': 'http://web.archive.org/web/20160208193830/https://t.co/7hqbJ2nC7G'}]}]

## Write JSON with all tweets we found

In [735]:
%%time
with open("./data/tweets_full.json", "w") as f:
    json.dump(valid_tweets, f, ensure_ascii=False, sort_keys=True, indent=4)

CPU times: user 108 ms, sys: 4 ms, total: 112 ms
Wall time: 118 ms
