## Twitter API Data Collection Script

This script collects tweets using the Twitter API and stores them in a MongoDB database. It includes rate limit handling to ensure continuous data collection without exceeding the API limits.

In [None]:
# Importing Necessary Libraries
# In this section, we import libraries needed for the script:
# - `requests` for making HTTP requests to the Twitter API.
# - `pymongo` for interacting with MongoDB.
# - `os` for accessing environment variables and other operating system-related functions.
# - `time` for handling rate limits by pausing execution.

import requests
import pymongo
import os
import time

### MongoDB Connection Setup

This cell establishes the connection to MongoDB, where the collected tweets are stored.

In [None]:
# Where collected tweets are stored
db_name = "WS_Data_DB"  # Target database
collection_name = "TwitterComments"  # Target collection inside that database

# Client for the local MongoDB server
client = pymongo.MongoClient("mongodb://localhost:27017/")

### Twitter API Configuration

Here, we define the bearer token for authenticating with the Twitter API and set up the Twitter API query. 

In [None]:
# Bearer token for Twitter API
# The TWITTER_BEARER_TOKEN environment variable takes precedence, so the
# credential does not have to live in the source file. The literal is kept
# only as a backward-compatible fallback for existing setups.
# NOTE(review): a token stored in a source file should be treated as exposed;
# rotate it and drop this fallback once the environment variable is in use.
bearer_token = os.environ.get(
    "TWITTER_BEARER_TOKEN",
    "AAAAAAAAAAAAAAAAAAAAAABfsAEAAAAAQmungvEoLc4vDPs9WqhsD8nAV0o%3DX64BVBakw2zGrPXnhYacyLBpH3PKBOYfsvonkpwfhd30K2cP7r",
)

# Twitter API setup
search_url = "https://api.twitter.com/2/tweets/search/recent"
query_params = {
    'query': 'LogRhythm -from:LogRhythm',  # Exclude tweets from @LogRhythm
    'tweet.fields': 'id,text,author_id,created_at',
    'max_results': 100,  # Number of results requested per call
}

### Rate Limit Handling and API Connection Functions

These cells define the rate-limit settings and two functions: one that attaches the OAuth bearer token to each request, and one that calls the Search API endpoint and waits and retries when the rate limit is exceeded.

In [None]:
# Request pacing
# Allowed number of API calls per minute; tune this to your access tier.
requests_per_minute = 60
# Seconds to pause between calls so the per-minute budget is not exceeded.
sleep_time = 60 / requests_per_minute

In [None]:
# OAuth Setup
def bearer_oauth(r):
    """Add bearer-token authentication headers to an outgoing request.

    Sets the ``Authorization`` header from the module-level ``bearer_token``
    and a fixed ``User-Agent``, then returns the same request object so it
    can be used as the ``auth`` callable of ``requests``.
    """
    auth_headers = (
        ("Authorization", f"Bearer {bearer_token}"),
        ("User-Agent", "v2RecentSearchPython"),
    )
    for header_name, header_value in auth_headers:
        r.headers[header_name] = header_value
    return r

# Endpoint Connection
def _rate_limit_wait_seconds(headers, default=15 * 60):
    """Return how many seconds to wait before retrying a rate-limited call.

    Uses the ``x-rate-limit-reset`` header (epoch seconds) when it is present
    and parseable; otherwise falls back to ``default``. The result is clamped
    to the range ``[1, default]`` so clock skew can neither produce a
    zero/negative sleep nor an unbounded one.
    """
    try:
        wait = int(headers.get("x-rate-limit-reset")) - time.time() + 1
    except (TypeError, ValueError):
        # Header missing or not a number: wait for the default window.
        return default
    return min(max(wait, 1), default)


def connect_to_endpoint(url, params):
    """Send a GET request to the Twitter API and return the decoded JSON body.

    Args:
        url: Endpoint URL to query.
        params: Query-string parameters for the request.

    Returns:
        The response body parsed from JSON.

    Raises:
        Exception: with ``(status_code, response_text)`` as arguments when the
            API answers with any status other than 200 (after 429 responses
            have been waited out and retried).
    """
    # Retry in a loop rather than by recursion so a long run of 429 responses
    # cannot exhaust the call stack.
    while True:
        # A timeout keeps a stalled connection from blocking the collector
        # indefinitely.
        response = requests.get(url, auth=bearer_oauth, params=params, timeout=30)

        # Anything other than "rate limit exceeded" is handled below.
        if response.status_code != 429:
            break

        print("Rate limit exceeded. Waiting for reset...")
        time.sleep(_rate_limit_wait_seconds(response.headers))

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    return response.json()

### Tweet Existence Check
This function checks if a tweet already exists in the MongoDB collection.

In [None]:
# Duplicate check used before inserting a tweet
def tweet_exists(tweet_id, collection):
    """Return True when ``collection`` already holds a document for ``tweet_id``.

    The lookup matches on the ``tweet_id`` field of the stored documents.
    """
    stored_document = collection.find_one({'tweet_id': tweet_id})
    if stored_document is None:
        return False
    return True

### Main Function
This is the main function where the script execution begins.
It continuously fetches and processes tweets.

In [None]:
# Main function for collecting and storing tweets
def main():
    """Fetch tweets matching ``query_params`` and store unseen ones in MongoDB.

    Pages through the search results with ``next_token``. When the API stops
    returning a token, the search is restarted below the oldest tweet ID seen
    so far (``until_id``). The loop ends when a request returns no tweets.
    Each tweet is inserted only if ``tweet_exists`` reports it is not stored
    yet; progress is printed for every tweet.
    """
    # Create or access the MongoDB collection
    db = client[db_name]
    collection = db[collection_name]

    # Work on a copy so pagination state never leaks into the shared
    # module-level query definition.
    params = dict(query_params)

    oldest_tweet_id = None  # Lowest tweet ID seen so far, as returned by the API

    pause_seconds = 15  # Sleep time (in seconds) to respect rate limits

    while True:
        json_response = connect_to_endpoint(search_url, params)
        tweets = json_response.get("data", [])

        if not tweets:
            print("No more tweets to fetch.")
            break

        # Process and store tweets
        for tweet in tweets:
            tweet_id = tweet["id"]

            # Compare numerically: if the IDs are strings, a plain `<` would
            # order them lexicographically and misorder IDs of different
            # lengths.
            if oldest_tweet_id is None or int(tweet_id) < int(oldest_tweet_id):
                oldest_tweet_id = tweet_id

            # Check if the tweet is already in the database
            if not tweet_exists(tweet_id, collection):
                tweet_data = {
                    'tweet_id': tweet_id,
                    'text': tweet["text"],
                    'created_at': tweet["created_at"],
                    'author_id': tweet["author_id"],
                    # Add more fields as needed
                }

                # Insert the tweet into the database
                collection.insert_one(tweet_data)
                print(f"Inserted tweet with ID: {tweet_id}")
            else:
                print(f"Tweet with ID {tweet_id} already exists in the database")

        # Get the next_token for pagination, if available
        next_token = json_response.get("meta", {}).get("next_token")
        if next_token:
            # Continue the current result set; the other parameters stay
            # untouched while a pagination token is in use.
            params["next_token"] = next_token
        else:
            # Result set exhausted: drop the stale token (otherwise the last
            # page would be requested again) and restart below the oldest
            # tweet already seen.
            params.pop("next_token", None)
            params["until_id"] = oldest_tweet_id

        # Sleep to respect rate limits
        time.sleep(pause_seconds)

# Start collecting only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()