# Imports

In [1]:
import time
from datetime import datetime
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import seaborn as sns
%matplotlib inline

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from textblob import TextBlob

# Selenium Twitter Web Scraper (Topic: Crypto)

In [2]:
# Search page to scrape (the query string carries q=crypto and f=news)
url = u"https://twitter.com/search?f=news&vertical=news&q=crypto&src=typd"

# Location of the chromedriver executable on this machine
chrome_path = r"/Users/jinpark/Desktop/chromedriver_mac/chromedriver"

# Start Chrome through Selenium, open the search page, then pause for one
# second before touching the page
driver = webdriver.Chrome(chrome_path)
driver.get(url)
time.sleep(1)

# Scroll the page down: send PAGE_DOWN to the <body> element ten times,
# sleeping 0.2 seconds after each key press
body = driver.find_element_by_tag_name('body')
presses_left = 10
while presses_left > 0:
    body.send_keys(Keys.PAGE_DOWN)
    time.sleep(0.2)
    presses_left -= 1

# Scrape tweets: find every element with class 'tweet-text' and keep its text
tweets = driver.find_elements_by_class_name('tweet-text')
tweet_list = [element.text for element in tweets]

# Scrape number of comments from the elements with class 'js-actionReply'
replies = driver.find_elements_by_class_name('js-actionReply')
reply_list = [button.text for button in replies]
# Skip empty strings and drop the first 6 characters of each remaining one
# (presumably the "Reply" label plus a separator, leaving only the number)
comment_counts = [label[6:] for label in reply_list if label]

# Scrape number of retweets from the elements with class 'js-actionRetweet'
retweets = driver.find_elements_by_class_name('js-actionRetweet')
retweets_list = [button.text for button in retweets]
# Skip empty strings and drop the first 8 characters of each remaining one
# (presumably the "Retweet" label plus a separator, leaving only the number)
retweet_counts = [label[8:] for label in retweets_list if label]

# Scrape number of likes from the elements with class 'js-actionFavorite'
likes = driver.find_elements_by_class_name('js-actionFavorite')
likes_list = [button.text for button in likes]
# Skip empty strings and drop the first 5 characters of each remaining one
# (presumably the "Like" label plus a separator, leaving only the number)
likes_counts = [label[5:] for label in likes_list if label]

# Scrape dates
# The date is read from the 'title' attribute of each timestamp link, located
# by an xpath on the link's class.
dates = [element.get_attribute('title') for element in driver.find_elements_by_xpath('//a[starts-with(@class, "tweet-timestamp js-permalink js-nav js-tooltip")]')]
# For titles containing '-', keep what follows the first '-' minus one leading
# character (presumably "<time> - <date>" -> "<date>"); titles without '-' are
# kept unchanged. Built in a single pass: previously the whole list was rebuilt
# once per matching title, and stayed empty when no title contained '-'.
dates_list = [date.split("-", 1)[1][1:] if '-' in date else date for date in dates]

## Sentiment Analysis

In [None]:
# Get only polarity for sentiment analysis
# Polarity measures how positive or negative some text is
# Score the text already stored in tweet_list instead of re-reading .text from
# the live browser elements: this avoids extra WebDriver round trips and does
# not fail if the elements have gone stale or the browser has been closed.
polarity = [TextBlob(text).sentiment.polarity for text in tweet_list]
print(polarity)

In [None]:
# Get only subjectivity for sentiment analysis
# Subjectivity measures how much of an opinion it is versus how factual
# Score the text already stored in tweet_list instead of re-reading .text from
# the live browser elements: this avoids extra WebDriver round trips and does
# not fail if the elements have gone stale or the browser has been closed.
subjectivity = [TextBlob(text).sentiment.subjectivity for text in tweet_list]
print(subjectivity)

## Combine All Data

In [None]:
# Gather every scraped list and sentiment score under its column name,
# then build the dataframe from that mapping
scraped_columns = {
    'date': dates_list,
    'tweets': tweet_list,
    'comments': comment_counts,
    'retweets': retweet_counts,
    'likes': likes_counts,
    'polarity_level': polarity,
    'subjectivity_level': subjectivity,
}
df = pd.DataFrame(scraped_columns)

In [None]:
# Preview the first five rows to check that the data looks correct
df.head(n=5)

In [None]:
# Change column date to datetime
# Parse each row's own scraped date string (day, abbreviated month, year, e.g.
# '15 Mar 2018') instead of assigning one constant date to the whole column.
df['date'] = pd.to_datetime(df['date'], format='%d %b %Y')

In [None]:
# Preview the first five rows of the dataframe
df.head(n=5)

In [None]:
# Pick the columns to keep, in output order, and name the result `crypto`
# NOTE(review): 'retweets' is not part of this selection - confirm intended
column_order = ['date', 'tweets', 'comments', 'likes', 'polarity_level', 'subjectivity_level']
crypto = df[column_order]

In [None]:
# Preview the first five rows of the final dataframe
crypto.head(n=5)

# Save Twitter Data

In [None]:
# Save the data as "webscrape_twitter_crypto_data.csv" in data folder
import os

# to_csv does not create missing directories, so make sure 'data' exists first
os.makedirs('data', exist_ok=True)
# index=False leaves the dataframe's row index out of the file
crypto.to_csv('data/webscrape_twitter_crypto_data.csv', index=False)