In [46]:
import re
import pandas as pd

def clean_text(text):
    """Normalise a single tweet for sentiment scoring.

    Steps, applied in order: strip URLs, strip @mentions and the '#' symbol
    (the hashtag word itself is kept), strip every character that is neither
    a word character nor whitespace, and lowercase the remainder.

    Args:
        text: The raw tweet text, or a missing value (None / NaN).

    Returns:
        The cleaned, lowercased string, or None when the input is missing.
        Whitespace left behind by removed tokens is not collapsed.
    """
    if pd.isnull(text):
        return None

    without_links = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    without_mentions = re.sub(r'\@\w+|\#', '', without_links)
    without_punctuation = re.sub(r'[^\w\s]', '', without_mentions)
    return without_punctuation.lower()


# Load the raw tweet export and normalise the text column with clean_text.
# NOTE(review): the path is absolute and machine-specific; consider a configurable data directory.
raw_tweets_path = '/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Raw/DonaldTrumpTweets.csv'
tweets = pd.read_csv(raw_tweets_path)
tweets['text'] = tweets['text'].apply(clean_text)

# Discard tweets with no usable text after cleaning (missing, empty or whitespace-only).
tweets = tweets.dropna(subset=['text'])
# .copy() makes the filtered rows an independent DataFrame, so the column
# assignment below cannot trigger pandas' SettingWithCopyWarning.
tweets = tweets[tweets['text'].str.strip() != ''].copy()

# Reduce the timestamp to a calendar date; values that cannot be parsed are
# coerced to a missing value and the corresponding rows are dropped.
tweets['Date'] = pd.to_datetime(tweets['date'], errors='coerce').dt.date
# Copy again so that later cells can add columns to `tweets` without the same warning.
tweets = tweets.dropna(subset=['Date']).copy()

In [47]:
# Write the current `tweets` frame to the processed-data folder; the DataFrame index is not written.
output_path = '/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Processed/DonaldTrumpTweets_Cleaned.csv'
tweets.to_csv(path_or_buf=output_path, index=False)

In [49]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Score every tweet with VADER, keeping only the 'compound' entry of polarity_scores.
analyzer = SentimentIntensityAnalyzer()
tweets['VaderSentiment'] = tweets['text'].map(
    lambda tweet_text: analyzer.polarity_scores(tweet_text)['compound']
)

# One row per calendar date: how many tweets there were and their mean compound score.
daily_sentiment = (
    tweets
    .groupby('Date')
    .agg(
        number_of_tweets=pd.NamedAgg(column='text', aggfunc='count'),
        avg_vader_sentiment_score=pd.NamedAgg(column='VaderSentiment', aggfunc='mean'),
    )
    .reset_index()
)

# Persist the daily VADER aggregate without the index column.
output_path = '/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Processed/daily_vader_sentiment.csv'
daily_sentiment.to_csv(path_or_buf=output_path, index=False)

In [50]:
import torch
# Report whether PyTorch's MPS backend is usable on this machine; expected to print True.
mps_ready = torch.backends.mps.is_available()
print(mps_ready)

True


In [57]:
from transformers import pipeline
import torch
from tqdm import tqdm

# Run on the MPS backend when PyTorch reports it as available, otherwise on the CPU.
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Hand the device to the pipeline by name. An integer device is an accelerator
# ordinal (0 is normally resolved as a CUDA index), so the previous
# `0 if device == "mps" else -1` mapping did not explicitly select MPS and
# depended on how the installed transformers version interprets ordinal 0.
# The strings "mps" and "cpu" are unambiguous.
finbert = pipeline("sentiment-analysis", model="yiyanghkust/finbert-tone", device=device)


def get_sentiment_score(output):
    """Collapse one FinBERT prediction into a single signed number.

    Args:
        output: Mapping with a 'label' entry and a 'score' entry, as produced
            by the sentiment pipeline for one text.

    Returns:
        The score unchanged for the 'Positive' label, the negated score for
        the 'Negative' label, and 0 for any other label (e.g. 'Neutral').
    """
    tone, confidence = output['label'], output['score']

    # Anything that is not explicitly polar contributes nothing.
    if tone not in ('Positive', 'Negative'):
        return 0

    return confidence if tone == 'Positive' else -confidence

def process_in_batches(texts, batch_size):
    """Score texts with the FinBERT pipeline chunk by chunk, showing a progress bar.

    Args:
        texts: List of strings to classify.
        batch_size: Number of texts per chunk; must be a positive integer.

    Returns:
        A list with one sentiment score per input text, in input order, as
        computed by get_sentiment_score. An empty input yields an empty list.

    Raises:
        ValueError: If batch_size is smaller than 1. Previously 0 failed inside
            range() and a negative value silently returned an empty list.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    results = []
    for start in tqdm(range(0, len(texts), batch_size), desc="Processing FinBERT Sentiments"):
        batch = texts[start:start + batch_size]
        # Forward batch_size to the pipeline: without it the pipeline uses its
        # own default batch size for the list, so slicing alone only drives the
        # progress bar and does not batch the model's forward passes.
        batch_results = finbert(batch, batch_size=batch_size)
        results.extend(get_sentiment_score(result) for result in batch_results)
    return results

# Chunk size handed to process_in_batches.
batch_size = 128

# Add one FinBERT score per tweet, in row order.
tweet_texts = tweets['text'].tolist()
tweets['FinBertSentiment'] = process_in_batches(tweet_texts, batch_size)

# One row per calendar date: tweet count plus the mean VADER and FinBERT scores.
daily_sentiment = (
    tweets
    .groupby('Date')
    .agg(
        number_of_tweets=pd.NamedAgg(column='text', aggfunc='count'),
        avg_vader_sentiment=pd.NamedAgg(column='VaderSentiment', aggfunc='mean'),
        avg_finbert_sentiment=pd.NamedAgg(column='FinBertSentiment', aggfunc='mean'),
    )
    .reset_index()
)

# Persist the combined daily aggregate without the index column.
output_path = '/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Processed/daily_sentiment_with_batches.csv'
daily_sentiment.to_csv(path_or_buf=output_path, index=False)

Processing FinBERT Sentiments: 100%|██████████| 432/432 [18:57<00:00,  2.63s/it]


In [58]:
# Reload the two daily sentiment aggregates and the S&P 500 prices from disk and
# convert each 'Date' column to plain calendar dates so the joins below match.
# NOTE(review): all paths are absolute and machine-specific.
daily_vader_sentiment = pd.read_csv('/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Processed/daily_vader_sentiment.csv')
daily_vader_sentiment['Date'] = pd.to_datetime(daily_vader_sentiment['Date']).dt.date

daily_finbert_sentiment = pd.read_csv(
    '/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Processed/daily_sentiment_with_batches.csv'
)
daily_finbert_sentiment['Date'] = pd.to_datetime(daily_finbert_sentiment['Date']).dt.date

sp500 = pd.read_csv('/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Processed/cleaned_sp500_index_prices.csv')
sp500['Date'] = pd.to_datetime(sp500['Date']).dt.date

# Left joins keep every S&P 500 row; dates without a sentiment row get missing values.
# NOTE(review): non-key columns present in both sentiment files (e.g. a shared
# number_of_tweets column) are disambiguated by pandas with _x/_y suffixes.
aligned_dataset = pd.merge(sp500, daily_vader_sentiment, on='Date', how='left')
aligned_dataset = pd.merge(aligned_dataset, daily_finbert_sentiment, on='Date', how='left')

# Zero-fill only the columns contributed by the sentiment files. A blanket
# fillna(0) would also turn any missing S&P 500 value into a fabricated 0,
# so gaps in the price columns are deliberately left as missing.
sentiment_columns = [column for column in aligned_dataset.columns if column not in sp500.columns]
aligned_dataset = aligned_dataset.fillna({column: 0 for column in sentiment_columns})

aligned_output_path = '/Users/keremsmacbook/Desktop/42/gitHub/Presidential_Tweets_Impacts_SP500/PresidentialTweetsImpectOnS&P500/Data/Processed/aligned_dataset.csv'
aligned_dataset.to_csv(aligned_output_path, index=False)