In [1]:
import os
import time
import tweepy
import pandas as pd
from datetime import datetime
from dateutil.parser import parse

class TwitterScraper:
    """Poll the Twitter search API and append newly seen tweets to a CSV file.

    The ids of all tweets returned so far are kept in ``self.tweet_ids`` so
    that repeated searches only yield tweets this instance has not returned
    before.
    """

    def __init__(self, api_key, api_key_secret, access_token, access_token_secret):
        """Create the API client from the credentials and start with no seen ids."""
        self.api = self.authenticate(api_key, api_key_secret, access_token, access_token_secret)
        self.tweet_ids = set()  # Set to hold tweet ids

    @staticmethod
    def authenticate(api_key, api_key_secret, access_token, access_token_secret):
        """Return a ``tweepy.API`` client built from OAuth credentials.

        The client is created with ``wait_on_rate_limit=True``.
        """
        authenticator = tweepy.OAuthHandler(api_key, api_key_secret)
        authenticator.set_access_token(access_token, access_token_secret)
        return tweepy.API(authenticator, wait_on_rate_limit=True)

    def get_tweets(self, search_terms, count):
        """Search for tweets and return the ones not seen before.

        Args:
            search_terms: Query string passed to the search endpoint as ``q``.
            count: Maximum number of items to pull from the cursor.

        Returns:
            A list of dicts, one per new tweet. Each dict is the tweet's JSON
            payload with the nested ``user`` object removed and its fields
            re-added at the top level under ``user_<field>`` keys.
        """
        tweets = tweepy.Cursor(
            self.api.search_tweets,
            q=search_terms,
            lang='en',
            tweet_mode='extended',
        ).items(count)

        data = []
        for tweet in tweets:
            if tweet.id in self.tweet_ids:
                continue
            self.tweet_ids.add(tweet.id)
            # Shallow-copy the payload so flattening the user fields does not
            # mutate the Status object's own ``_json`` dict.
            tweet_dict = dict(tweet._json)
            user = tweet_dict.pop('user', {})
            for key, value in user.items():
                tweet_dict[f'user_{key}'] = value
            data.append(tweet_dict)
        print(f"Fetched {len(data)} new tweets")  # Print the number of new tweets fetched
        return data

    @staticmethod
    def save_to_csv(data, filepath):
        """Append tweet dicts to a CSV file, keeping a single consistent header.

        ``created_at`` is rewritten as ``YYYY-MM-DD HH:MM:SS`` before writing.
        The header row is written only when the file is missing or empty.
        When the file already has a header, the batch is reordered to match
        it: columns the batch lacks are written as empty cells, and columns
        the file does not have are dropped (and reported) so that appended
        rows never shift under the wrong column.

        Args:
            data: List of flat tweet dicts; each must contain ``created_at``.
            filepath: Destination CSV path; its directory must already exist.
        """
        df = pd.DataFrame(data)
        if df.empty:  # Nothing to write for this batch
            print("No new tweets fetched")
            return

        df['created_at'] = df['created_at'].apply(lambda x: parse(x).strftime('%Y-%m-%d %H:%M:%S'))

        # A zero-byte file is treated like a missing one: it still needs a header.
        file_has_header = os.path.exists(filepath) and os.path.getsize(filepath) > 0
        if file_has_header:
            # Read only the header row to learn the column layout already on disk.
            existing_columns = pd.read_csv(filepath, nrows=0).columns
            known = set(existing_columns)
            dropped = [column for column in df.columns if column not in known]
            if dropped:
                print(f"Dropping columns not present in existing file: {dropped}")
            df = df.reindex(columns=existing_columns)

        # Append; emit the header only for a brand-new (or empty) file.
        df.to_csv(filepath, mode='a', index=False, header=not file_has_header)

    def fetch_and_save(self, search_terms, count, filepath, interval=60, max_tweets=50000):
        """Repeatedly fetch new tweets and append them to ``filepath``.

        Loops until at least ``max_tweets`` new tweets have been collected,
        sleeping ``interval`` seconds between rounds. Each round asks for up
        to ``count`` items, so the final total may exceed ``max_tweets``.
        """
        total_tweets = 0
        while total_tweets < max_tweets:
            tweets = self.get_tweets(search_terms, count)
            self.save_to_csv(tweets, filepath)
            total_tweets += len(tweets)
            if total_tweets < max_tweets:
                time.sleep(interval)

# Twitter API credentials (empty placeholders in this listing)
api_key, api_key_secret = '', ''
access_token_secret, access_token = '', ''

# Build the scraper; the constructor authenticates with the credentials above
scraper = TwitterScraper(
    api_key=api_key,
    api_key_secret=api_key_secret,
    access_token=access_token,
    access_token_secret=access_token_secret,
)

# Search query (an OR over names, phrases, hashtags and mentions) and the
# number of items requested per fetch round
search_terms = '("Will Smith" OR "Chris Rock" OR "Jada Pinkett Smith" OR "Oscars" OR "Academy Awards" OR "slap" OR "#AcademyAwards" OR "#WillSmith" OR "#ChrisRock" OR "@jadapsmith" OR "@TheAcademy" OR "Best Actor award")'
count = 100

# Output goes to ./data/tweets_<timestamp>.csv, stamped with the start time
now = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
directory = "./data/"  # data directory in the current directory
filename = f"tweets_{now}.csv"
filepath = os.path.join(directory, filename)

# Ensure the output directory is present before anything is written
os.makedirs(directory, exist_ok=True)

# Run the fetch/save loop with the default interval and max_tweets
scraper.fetch_and_save(search_terms=search_terms, count=count, filepath=filepath)


Fetched 100 new tweets
Fetched 18 new tweets
Fetched 20 new tweets
Fetched 23 new tweets
Fetched 19 new tweets
Fetched 28 new tweets


KeyboardInterrupt: 