# In [4]:
import tweepy
import pandas as pd
from datetime import datetime, timedelta, timezone
import logging
import os
import re
import time


# Define the clean_text function
def clean_text(text):
    """
    Strip noise from a tweet so that only plain words remain.

    Args:
        text (str): Raw tweet text.

    Returns:
        str: The text with URLs, @mentions, #hashtags and remaining
        non-word characters removed, and outer whitespace trimmed.
        Whitespace inside the text is left untouched.
    """
    # Order matters: URLs, mentions and hashtags are removed as whole tokens
    # first; the final pattern would otherwise only strip their marker
    # characters and leave the rest of the token behind.
    noise_patterns = (
        r"http\S+",  # URLs
        r"@\w+",     # mentions
        r"#\w+",     # hashtags
        r"[^\w\s]",  # any remaining special characters
    )
    stripped = text
    for pattern in noise_patterns:
        stripped = re.sub(pattern, "", stripped)
    return stripped.strip()

def update_masterlist(new_data, master_file='masterlist.csv'):
    """
    Merge newly fetched rows into the master CSV and save the result.

    The existing master file (if any) is loaded, ``new_data`` is appended,
    and rows sharing a ``tweet_id`` are collapsed so that the most recently
    appended row wins. The merged table is then written back to
    ``master_file``.

    Args:
        new_data (pandas.DataFrame): Rows to add. May be empty.
        master_file (str): Path of the CSV holding the master list.

    Returns:
        pandas.DataFrame: The merged, de-duplicated master list.
    """
    if os.path.exists(master_file):
        try:
            master_df = pd.read_csv(master_file)
        except pd.errors.EmptyDataError:
            # The file exists but holds no columns (e.g. it was written from
            # an empty frame on a previous run); treat it as an empty list.
            master_df = pd.DataFrame()
        print(f"Loaded existing master list with {len(master_df)} entries.")
    else:
        master_df = pd.DataFrame()
        print("No master list found. Creating a new one.")

    # Only concatenate frames that actually contain rows: this keeps the
    # result well-defined when one or both inputs are empty.
    frames = [frame for frame in (master_df, new_data) if not frame.empty]
    if frames:
        updated_master = pd.concat(frames, ignore_index=True)
    else:
        updated_master = pd.DataFrame()

    # With no rows at all there is no 'tweet_id' column to de-duplicate on;
    # calling drop_duplicates(subset=...) would raise KeyError.
    if 'tweet_id' in updated_master.columns:
        updated_master = updated_master.drop_duplicates(subset='tweet_id', keep='last')

    updated_master.to_csv(master_file, index=False)
    print(f"Master list updated and saved to {master_file}. Total entries: {len(updated_master)}")
    return updated_master

def get_recent_tweets(handle, bearer_token, days_back=6, max_retries=3, master_file='masterlist.csv'):
    """
    Fetch recent English tweets mentioning a handle and merge them into the master list.

    Searches for ``@{handle}`` (retweets excluded, ``lang:en``) over the last
    ``days_back`` days, records each tweet's raw text, cleaned text and
    public metrics, and passes the resulting table to ``update_masterlist``.

    Args:
        handle (str): Account handle to search for, without the leading ``@``.
        bearer_token (str): Twitter API bearer token.
        days_back (int): Size of the search window in days.
        max_retries (int): Maximum number of fetch attempts.
        master_file (str): Path of the master CSV to update.

    Returns:
        pandas.DataFrame: The updated master list on success, or an empty
        DataFrame when every attempt failed.
    """
    client = tweepy.Client(bearer_token=bearer_token)

    # End the window 30 seconds in the past rather than exactly "now".
    # NOTE(review): presumably because the search endpoint rejects an
    # end_time too close to the request time -- confirm against the API docs.
    end_time = datetime.now(timezone.utc).replace(microsecond=0) - timedelta(seconds=30)
    start_time = end_time - timedelta(days=days_back)

    start_time_str = start_time.strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time_str = end_time.strftime('%Y-%m-%dT%H:%M:%SZ')

    query = f"@{handle} -is:retweet lang:en"

    for attempt in range(max_retries):
        retries_left = attempt < max_retries - 1
        try:
            paginator = tweepy.Paginator(
                client.search_recent_tweets,
                query=query,
                start_time=start_time_str,
                end_time=end_time_str,
                max_results=100,
                tweet_fields=["created_at", "author_id", "text", "public_metrics"]
            )

            tweets_data = []
            for response in paginator:
                # response.data is falsy when a page carries no tweets.
                for tweet in response.data or []:
                    # Guard against a tweet arriving without metrics.
                    public_metrics = tweet.public_metrics or {}
                    tweets_data.append({
                        'tweet_id': tweet.id,
                        'author_id': tweet.author_id,
                        'text': tweet.text,
                        'cleaned_text': clean_text(tweet.text),
                        'created_at': tweet.created_at,
                        'likes': public_metrics.get('like_count', 0),
                        'retweets': public_metrics.get('retweet_count', 0),
                        'replies': public_metrics.get('reply_count', 0),
                        'views': public_metrics.get('impression_count', 0)
                    })

                # Pause between pages to stay under the rate limit.
                time.sleep(1)

            df = pd.DataFrame(tweets_data)
            return update_masterlist(df, master_file)

        except tweepy.TooManyRequests as e:
            logging.error("Rate limit exceeded: %s", e)
            if retries_left:
                print("Rate limit exceeded. Waiting before retrying...")
                time.sleep(60)  # Wait for 1 minute before retrying

        except Exception as e:
            logging.error("Error retrieving tweets on attempt %d: %s", attempt + 1, e)
            print(f"Error retrieving tweets: {e}")

    # Every attempt failed (or max_retries was 0): always hand back a
    # DataFrame so callers never receive None.
    return pd.DataFrame()

# Read the Twitter API bearer token from the TWITTER_BEARER_TOKEN environment
# variable so the secret does not have to be stored in this file; the
# placeholder below is only used when that variable is unset.
BEARER_TOKEN = os.environ.get('TWITTER_BEARER_TOKEN', 'YOUR KEY')
handle = "BANK'S HANDLE"  # Replace with the desired Twitter handle (no leading '@')

# Fetch recent mentions of the handle and merge them into the master list.
masterlist_df = get_recent_tweets(handle, BEARER_TOKEN)


# Output:
# No master list found. Creating a new one.
# Master list updated and saved to masterlist.csv. Total entries: 22
