In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from scipy.stats import spearmanr
import nltk
nltk.download('vader_lexicon')

# ------------------------
# 1. Load and Preprocess Tweets
# ------------------------
# Load the raw tweets and parse the 'date' column into real datetimes.
tweets_data = pd.read_csv('Tesla.csv')
tweets_data['timestamp'] = pd.to_datetime(tweets_data['date'])
# Bucket every tweet into the clock hour it falls in. The lowercase 'h'
# alias is used because the uppercase 'H' spelling is deprecated in recent
# pandas releases.
tweets_data['hour'] = tweets_data['timestamp'].dt.floor('h')

# Working frame for the analysis. The 'cleaned_tweet' and 'compound_score'
# columns are appended further down once they have been computed, so no
# placeholder values are written here.
cleaned_data = pd.DataFrame({
    'original_tweet': tweets_data['tweet'],
    'timestamp': tweets_data['timestamp'],
    'hour': tweets_data['hour'],
})

# Preprocess tweets
def preprocess_tweet(tweet):
    """Strip URLs, mentions, punctuation and digits from a tweet.

    Args:
        tweet: Raw tweet text. Non-string values are converted with ``str``;
            missing values (``None`` or a float NaN, which is how pandas
            represents an empty CSV cell) are treated as an empty tweet.

    Returns:
        The cleaned text with leading/trailing whitespace removed. Interior
        whitespace left behind by removed tokens is not collapsed.
    """
    # Without this guard str() would turn a missing value into the literal
    # text "None" / "nan", which would then be scored as if it were a tweet.
    # (NaN is the only float that is not equal to itself.)
    if tweet is None or (isinstance(tweet, float) and tweet != tweet):
        return ""

    tweet = str(tweet)
    # Remove URLs ('http\S+' already covers 'https://...' links)
    tweet = re.sub(r'http\S+|www\S+', '', tweet)
    tweet = re.sub(r'@\w+', '', tweet)  # Remove mentions
    tweet = re.sub(r'#', '', tweet)  # Drop the '#' marker but keep the tag word
    tweet = re.sub(r'[^\w\s]', '', tweet)  # Remove special characters
    tweet = re.sub(r'\d+', '', tweet)  # Remove numbers
    return tweet.strip()  # Remove leading/trailing whitespace

# Run every raw tweet through the text cleaner.
cleaned_data['cleaned_tweet'] = tweets_data['tweet'].apply(preprocess_tweet)

# Sentiment analysis
sentiment_analyzer = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for text."""
    return sentiment_analyzer.polarity_scores(text)['compound']


cleaned_data['compound_score'] = cleaned_data['cleaned_tweet'].apply(_compound_score)

# Aggregate hourly sentiment scores: one row per hour holding the mean
# compound score of the tweets in that hour.
hourly_sentiment = (
    cleaned_data.groupby('hour')['compound_score']
    .mean()
    .rename('average_compound')
    .reset_index()
)

# ------------------------
# 2. Enhance Stock Data with Full Timestamps
# ------------------------
stock_data = pd.read_csv('TSLA_stock_data.csv')
stock_data = stock_data.rename(columns={'Time': 'time', 'Price': 'Close'})

# Extract the common date from the tweets dataset
# NOTE(review): this takes the date of the first tweet only and applies it to
# every stock row — presumably both files cover a single day; confirm.
common_date = pd.to_datetime(tweets_data['date'].iloc[0]).date()

# Combine common date with stock times to create full timestamps
stock_data['timestamp'] = pd.to_datetime(str(common_date) + ' ' + stock_data['time'])
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in
# recent pandas releases.
stock_data['hour'] = stock_data['timestamp'].dt.floor('h')

# Calculate hourly returns
# Unparseable prices become NaN instead of raising.
stock_data['Close'] = pd.to_numeric(stock_data['Close'], errors='coerce')
# NOTE(review): pct_change() is the change from the previous *row*, so this
# is an hourly return only if the file has one row per hour in chronological
# order — confirm. The first row's value is always NaN.
stock_data['Hourly_Return'] = stock_data['Close'].pct_change()

# ------------------------
# 3. Align Data
# ------------------------
# Keep only the hours that appear in both the sentiment and the stock data.
merged_data = pd.merge(hourly_sentiment, stock_data, on='hour', how='inner')

# ------------------------
# 4. Compute Spearman Correlation
# ------------------------
# Drop incomplete pairs first: pct_change() leaves the first return as NaN and
# coerced prices can add more, and spearmanr propagates NaN by default, which
# would turn the whole result into nan.
paired = merged_data[['average_compound', 'Hourly_Return']].dropna()

# A rank correlation needs variation in both series; requiring more than one
# distinct value in each also guarantees at least two usable rows.
if paired['average_compound'].nunique() > 1 and paired['Hourly_Return'].nunique() > 1:
    correlation, p_value = spearmanr(paired['average_compound'], paired['Hourly_Return'])
    print(f"Spearman Correlation: {correlation}, p-value: {p_value}")
else:
    print("Insufficient or non-overlapping data for meaningful correlation.")




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\folan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Spearman Correlation: 0.39999999999999997, p-value: 0.6
