# Setup

In [None]:
"""
1. Install Chrome
2. Check your Chrome version
3. Install Chromedriver version appropriate for your Chrome version
4. Install requirements
    - If I find time, I will add a requirements.txt, but I am running this script in a larger venv for another project, so I can't quickly pip freeze one
"""

# Imports

In [None]:
from seleniumwire import webdriver
from seleniumwire.utils import decode

from webdriver_manager.chrome import ChromeDriverManager

from selenium.webdriver.chrome.service import Service

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys


import time

import pandas as pd

# Advanced search

In [None]:
# The advanced-search URL has to end with "f=live" (IIRC). Without it Twitter
# serves the "top" list first instead of "latest", and the scrape stops too soon.

ACCOUNT = "IAPonomarenko"
START = "2023-01-01"  # YYYY-MM-DD, used for the `since:` operator
END = "2023-02-01"  # YYYY-MM-DD, used for the `until:` operator

# Pre-encoded search query: ":" is written as %3A and " " as %20.
search_query = f"(from%3A{ACCOUNT})%20until%3A{END}%20since%3A{START}"
url_adv = f"https://twitter.com/search?q={search_query}&src=typed_query&f=live"

print(url_adv)
# To scale, you could loop through a list of (account, start, end) values.


# Sample urls for testing
# url_adv = "https://twitter.com/search?q=(from%3AIAPonomarenko)%20until%3A2023-05-31%20since%3A2023-01-01&src=typed_query"
# url_adv = "https://twitter.com/search?q=(from%3AIAPonomarenko)%20until%3A2023-02-31%20since%3A2023-01-01&src=typed_query&f=live"
# url_adv = "https://twitter.com/search?q=(from%3AHDeSotoPeru)%20until%3A2021-04-30%20since%3A2021-01-01&src=typed_query&f=live"

In [None]:
# ATTENTION:
# On first run, enter your Twitter username and password. On next run this will be saved to the "user-data-dir"
# This is because to use Twitter's advanced search options, you need to be logged in (update: well, now you always need to be logged in)

In [None]:
# Change this based on where you put your Chromedriver. I like to put it close to my script (do not need to specify this if Chromedriver is in your PATH iirc)
driver_location = '../INPUT/chromedriver'

# Add options
options = webdriver.ChromeOptions()

options.add_argument("--start-maximized")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--no-sandbox")

# Persistent Chrome profile directory, so the Twitter login is kept between runs.
# Raw string: "\e" and "\s" are not valid escape sequences, so a plain literal
# triggers a warning on recent Python versions. The resulting value is unchanged.
options.add_argument(r"user-data-dir=C:\environments\selenium")
#options.add_argument("user-data-dir=selenium") # I usually use this and put my directory next to my work, not the above (which I added for a friend)

options.add_argument("user-agent=Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36")
options.add_argument("--disable-popup-blocking")
options.add_argument("--disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument('--disable-blink-features=AutomationControlled')


# Start driver
driver = webdriver.Chrome(service=Service(driver_location), options=options)

counter = 0        # position of the request currently inspected in driver.requests
index = None       # position of the most recent successfully decoded search response
index_list = []    # positions of every successfully decoded search response

new_list = []      # decoded (utf-8 text) bodies of the matching GraphQL responses

try:
    # Run after driver initialized
    driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")

    driver.get(url_adv)

    time.sleep(5) # might want to make these random

    # This is how we scroll now! Will scroll until the end, using the document
    # height to see if there is more.
    lastHeight = driver.execute_script("return document.documentElement.scrollHeight")
    print('lastHeight', lastHeight)

    while True:

        driver.execute_script(f"window.scrollTo(0, {lastHeight});")
        time.sleep(1)
        newHeight = driver.execute_script("return document.documentElement.scrollHeight")
        print('newHeight', newHeight)

        # The page did not grow after scrolling: assume the end of the results.
        if newHeight == lastHeight:
            break

        lastHeight = newHeight

        elem = driver.find_element(By.TAG_NAME, "html")
        elem.send_keys(Keys.END)
        print("Scrolling")
        time.sleep(5) # might want to make these random

    # Harvest the captured traffic ONCE, after scrolling has finished.
    # driver.requests accumulates every request made so far, so reading it on
    # every scroll pass re-appended the same bodies over and over. Reading it
    # only inside the loop also skipped everything when the page never grew
    # (the loop breaks before the harvest) and missed late responses.
    for counter, request in enumerate(driver.requests):
        request_str = str(request)

        is_search_call = (
            "api/graphql" in request_str
            and "twitter.com" in request_str
            and "rawQuery" in request_str
        )
        if not is_search_call:
            continue

        print(request_str)

        response = request.response
        if response is None:
            # The request was captured but no response is attached to it.
            print("ERROR DECODING")
            print("no response captured for this request")
            print("----------------------------------------------")
            continue

        try:
            body = decode(response.body, response.headers.get('Content-Encoding', 'identity'))
            data = body.decode("utf8") #GB2312
        except ValueError as exc:
            # seleniumwire's decode() raises ValueError for an undecodable body;
            # UnicodeDecodeError is a ValueError subclass.
            print("ERROR DECODING")
            print(repr(exc))
            print("----------------------------------------------")
            continue

        print("Index of object " + str(counter))

        index = counter
        index_list.append(index)
        print("----------------------------------------------")

        # Getting request we want
        new_list.append(data)
finally:
    # quit() (not close()) ends the whole session: it shuts down the browser,
    # the chromedriver process and selenium-wire's proxy, even if the scrape
    # above raised.
    driver.quit()

# Parse results

This is a lot of exploring the JSON to find the right data. You can skip down to FINAL PARSE if you like.

In [None]:
import json

In [None]:
# Number of response bodies collected into new_list by the scraping cell.
len(new_list)

In [None]:
# Peek at the first collected body (still an unparsed JSON string at this point).
new_list[0]

In [None]:
# Merge all captured responses into one dict: every top-level key of a response
# maps to the list of values seen under that key, in capture order.
output = {}

for raw_body in new_list:
    parsed = json.loads(raw_body)

    for field, payload in parsed.items():
        bucket = output.setdefault(field, [])
        bucket.append(payload)

In [None]:
# Display the merged dict (top-level response key -> list of values).
output

In [None]:
# Number of values collected under the top-level 'data' key.
len(output['data'])


In [None]:
# Here is the filter where you can see the values I would like to retrieve:
# The trailing commented-out subscripts show the path down to a single tweet
# result; move the "#" further right to drill deeper.
# NOTE: [1] selects the SECOND collected response, so this needs at least two
# entries in output['data'] (use [0] if only one was captured).

output['data'][1]['search_by_raw_query']#['search_timeline']['timeline']['instructions']#[0]["entries"]#[0]["content"]["itemContent"]['tweet_results']["result"]

In [None]:
# FINAL PARSE
# This captures lots of duplicates, but then you can just drop them
# Anything that does not have the expected tweet shape is printed, so you can
# inspect it, and then skipped.

adv_entries = []  # one "legacy" dict per tweet found in the captured responses

counter = 1  # kept from the original cell; not read by this cell


for entry in output["data"]:

    try:
        instructions = entry['search_by_raw_query']['search_timeline']['timeline']['instructions']
    except (KeyError, TypeError):
        # Response without a search timeline: show it and move on.
        print(entry)
        continue

    for instruction in instructions:
        try:
            timeline_entries = instruction["entries"]
        except (KeyError, TypeError):
            # Instruction without an "entries" list: show it and move on.
            print(instruction)
            continue

        for timeline_entry in timeline_entries:
            # The try covers a single entry only, so one entry that is not a
            # plain tweet no longer discards the remaining entries of the
            # same instruction.
            try:
                # using legacy here helps narrow down the number of columns you have to deal with
                legacy = timeline_entry["content"]["itemContent"]["tweet_results"]["result"]["legacy"]
            except (KeyError, TypeError):
                print(timeline_entry)
                continue

            adv_entries.append(legacy)
            print("good")


In [None]:
# Number of tweet dicts extracted by the FINAL PARSE cell (duplicates included).
len(adv_entries)

# Putting into DataFrame

In [None]:
# Flatten the list of tweet dicts into a table, one row per collected tweet.
df = pd.json_normalize(adv_entries)

In [None]:
# Quick look at the first three rows.
df.head(3)

In [None]:
# The flattened frame is still very wide: list every column name, one per line.
for column_name in df.columns:
    print(column_name)

In [None]:
# Count distinct tweet ids; comparing this with len(df) shows how many rows are duplicates.
df.id_str.nunique()

In [None]:
# Same check on the tweet text: count distinct full_text values.
df.full_text.nunique()

In [None]:
# Drop duplicates based on which column you think most likely is unique, and still captures all your tweets
# I choose id_str here
# inplace=True modifies df itself; the first row for each id_str is the one kept (pandas default).

df.drop_duplicates(subset="id_str", inplace=True)

In [None]:
# Look at the first rows again after de-duplication.
df.head()

In [None]:
# Print every tweet text, each followed by a separator line, to check whether
# the entries reported as errors during parsing ended up in the frame anyway.

for tweet_text in df["full_text"]:
    print(tweet_text)
    print("---------------------------------------------------------")

In [None]:
# DROP EXTRA COLUMNS HERE! Be sure to keep anything you may want in the future!

# With that many columns, dropping the unwanted columns would take too long. You can instead create a new df of just the columns you want
# Tons of other ways to do this, but this is the simplest

# df_new = df[[list of columns I want]]

# Example
# wanted_columns = ['A','D']
# new_dataset = dataset[wanted_columns]

In [None]:
# If you are collecting daily, you would concat/merge/join your dataframes here.

# Save

In [None]:
# Output location. The save cell joins these with plain string concatenation
# (PATH + FILENAME), so PATH must end with a path separator.
PATH = "../DATA/"
FILENAME = "twitter_advanced.csv" # may wish to use datetime to string and concat to make filename dynamic based on last scrape

In [None]:
import pathlib

# The target is still PATH + FILENAME, so existing settings resolve to the same file.
csv_path = pathlib.Path(PATH + FILENAME)

# Create the output directory first: to_csv() does not create missing
# directories, and failing here would throw away the whole scrape.
csv_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(csv_path, index=False)