# Set Twitter Credentials

In [None]:
import os
import tweepy
from dotenv import load_dotenv

# Load variables from a local .env file (if one exists) into the process
# environment, so real credentials never have to be written into this file.
load_dotenv()

# Each credential is read from the environment variable of the same name.
# The original " " placeholder is kept as the fallback when a variable is unset.
# NOTE(review): the variable names are assumed to match the .env file - confirm.
API_KEY = os.getenv("API_KEY", " ")
API_KEY_SECRET = os.getenv("API_KEY_SECRET", " ")
ACCESS_TOKEN = os.getenv("ACCESS_TOKEN", " ")
ACCESS_TOKEN_SECRET = os.getenv("ACCESS_TOKEN_SECRET", " ")
BEARER_TOKEN = os.getenv("BEARER_TOKEN", " ")

# Aliases using the parameter names expected by the OAuth 1.0a handler.
consumer_key = API_KEY
consumer_secret = API_KEY_SECRET
access_token = ACCESS_TOKEN
access_token_secret = ACCESS_TOKEN_SECRET

auth = tweepy.OAuth1UserHandler(
    consumer_key,
    consumer_secret,
    access_token,
    access_token_secret,
)

api = tweepy.API(auth)

# Calls the API once and prints the result, so bad credentials show up here.
print(api.verify_credentials())

# Extract the tweets from Twitter

In [None]:
import requests
import json
import time
import random
import os
import pandas as pd

# Bearer token sent with every request (blank placeholder; fill in before running).
bearer_token = " "

# Recent-search endpoint of the Twitter API v2.
endpoint_url = "https://api.twitter.com/2/tweets/search/recent"

# One entry per search: "value" is the query string, "tag" labels the output files.
rules = [
    dict(
        value='(("chatGPT" OR "#GPT-4" OR "GPT4" OR "#chatgpt")) -is:retweet lang:en',
        tag="advice_18",
    )
]

# Fields requested for each tweet and for the expanded author objects,
# plus the page size asked for on every request.
query_parameters = {
    "tweet.fields": ",".join(
        ["id", "text", "author_id", "created_at", "public_metrics"]
    ),
    "user.fields": ",".join(
        ["id", "name", "username", "created_at", "description", "location", "verified"]
    ),
    "expansions": "author_id",
    "max_results": 100,
}

def request_headers(bearer_token: str) -> dict:
    """
    Builds the HTTP headers used to authenticate a request.
    Returns a dictionary with a single "Authorization" entry carrying the bearer token.
    """
    authorization_value = "Bearer {}".format(bearer_token)
    return dict(Authorization=authorization_value)

# Authentication headers reused by every request made below.
headers = request_headers(bearer_token=bearer_token)

def connect_to_endpoint(endpoint_url: str, headers: dict, parameters: dict) -> dict:
    """
    Connects to the endpoint and requests data.

    Sends a GET request and returns the decoded JSON body of the first
    response that has a 200 status code.

    Raises an Exception for any 4xx status code; these are treated as
    non-recoverable and the program stops. For every other non-200 status
    code the function prints the problem, sleeps for a random 5 to 60
    seconds and tries again, for as long as it takes.
    """
    # Retry with a loop rather than by calling this function again: a long
    # outage would otherwise add one stack frame per attempt and could end
    # in a RecursionError.
    while True:
        response = requests.request(
            "GET", url=endpoint_url, headers=headers, params=parameters
        )
        status_code = response.status_code
        if status_code == 200:
            return response.json()

        if 400 <= status_code < 500:
            raise Exception(
                "Cannot get data, the program will stop!\nHTTP {}: {}".format(
                    status_code, response.text
                )
            )

        # Random back-off so repeated attempts are spread out in time.
        sleep_seconds = random.randint(5, 60)
        print(
            "Cannot get data, your program will sleep for {} seconds...\nHTTP {}: {}".format(
                sleep_seconds, status_code, response.text
            )
        )
        time.sleep(sleep_seconds)


def process_twitter_data(
    json_response: dict,
    query_tag: str,
    tweets_data: pd.DataFrame,
    users_data: pd.DataFrame,
) -> "tuple[pd.DataFrame, pd.DataFrame]":
    """
    Adds new tweet/user information to the table of
    tweets/users and saves dataframes as pickle files,
    if data is available.

    json_response is one decoded API response. When it has a "data" key,
    its entries are appended to tweets_data and the result is written to
    ./raw_tweets/tweets_<query_tag>.pkl. When it also has users under
    "includes", they are appended to users_data, de-duplicated on "id" and
    written to ./raw_tweets/users_<query_tag>.pkl. The output directory is
    created if it does not exist yet.

    Returns the tweets and users updated dataframes. Both are returned
    unchanged when the response has no "data" key.
    """
    if "data" not in json_response:
        return tweets_data, users_data

    # to_pickle does not create missing directories, so make sure the target
    # folder exists before the first write.
    os.makedirs("./raw_tweets", exist_ok=True)

    new_tweets = pd.DataFrame(json_response["data"])
    tweets_data = pd.concat([tweets_data, new_tweets])
    tweets_data.to_pickle("./raw_tweets/tweets_" + query_tag + ".pkl")

    # A response can carry "data" without "includes"; treat that as "no users"
    # instead of failing with a KeyError.
    includes = json_response.get("includes", {})
    if "users" in includes:
        new_users = pd.DataFrame(includes["users"])
        users_data = pd.concat([users_data, new_users])
        # The same author appears once per page they tweeted on; keep one row each.
        users_data = users_data.drop_duplicates("id")
        users_data.to_pickle("./raw_tweets/users_" + query_tag + ".pkl")

    return tweets_data, users_data


# Show full column contents (e.g. tweet text) when dataframes are displayed.
pd.set_option('display.max_colwidth', None)

tweets_data = pd.DataFrame()
users_data = pd.DataFrame()

for i, rule in enumerate(rules):
    print("Working on rule ", i)
    query_parameters["query"] = rule["value"]
    query_tag = rule["tag"]
    # A next_token left over from the previous rule belongs to a different
    # result set, so it must not be sent with the first request of this rule.
    query_parameters.pop("next_token", None)

    # Fetch the first page, then keep following next_token until the API
    # stops returning one.
    while True:
        json_response = connect_to_endpoint(endpoint_url, headers, query_parameters)
        tweets_data, users_data = process_twitter_data(
            json_response, query_tag, tweets_data, users_data
        )

        # Pause between requests to stay gentle on the API.
        time.sleep(5)

        meta = json_response.get("meta", {})
        if "next_token" not in meta:
            break
        query_parameters["next_token"] = meta["next_token"]

print("finished!")

# Concatenate all the extracted tweet files and remove duplicates based on tweet ID

In [None]:
import glob
import pandas as pd
# Sort the paths so the run is reproducible: glob returns files in arbitrary
# order, and drop_duplicates below keeps the first occurrence of each tweet ID.
files = sorted(glob.glob("./raw_tweets/tweets_*.pkl"))
if not files:
    # Fail with a clear message instead of a KeyError on "id" further down.
    raise FileNotFoundError(
        "No tweet files found matching ./raw_tweets/tweets_*.pkl"
    )

# Collect the per-file frames and concatenate once at the end; concatenating
# inside the loop re-copies all earlier rows on every iteration.
frames = []
for file in files:
    df = pd.read_pickle(file)
    # Expand the public_metrics dictionaries into one column per metric.
    metrics = df['public_metrics'].apply(pd.Series)
    new_df = pd.concat([df.drop(['public_metrics'], axis=1), metrics], axis=1)
    # Record which file each row came from.
    new_df["rule"] = str(file)
    frames.append(new_df)

full_df = pd.concat(frames, axis=0)

# Row counts before and after removing tweets that appear in several files.
print(full_df.shape[0])
full_df = full_df.drop_duplicates(subset=["id"])
print(full_df.shape[0])

full_df.to_csv("./final_tweets_04062023.csv")