In [4]:
# import pandas as pd
# import numpy as np
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# import re
# from scipy.stats import spearmanr
# import nltk
# nltk.download('vader_lexicon')

# # ------------------------
# # 1. Preprocessing Tweets
# # ------------------------
# data = pd.read_csv('Tesla.csv')

# cleaned_data = pd.DataFrame()
# cleaned_data['original_tweet'] = data['tweet']
# cleaned_data['cleaned_tweet'] = ""
# cleaned_data['compound_score'] = 0.0
# cleaned_data['positive_score'] = 0.0
# cleaned_data['negative_score'] = 0.0
# cleaned_data['neutral_score'] = 0.0

# # Preprocess tweets
# def preprocess_tweet(tweet):
#     tweet = str(tweet)
#     tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)  # Remove URLs
#     tweet = re.sub(r'@\w+', '', tweet)  # Remove mentions
#     tweet = re.sub(r'#', '', tweet)  # Remove hashtags
#     tweet = re.sub(r'[^\w\s]', '', tweet)  # Remove special characters
#     tweet = re.sub(r'\d+', '', tweet)  # Remove numbers
#     tweet = tweet.strip()  # Remove leading/trailing whitespace
#     return tweet

# cleaned_data['cleaned_tweet'] = data['tweet'].apply(preprocess_tweet)

# # Sentiment analysis
# sentiment_analyzer = SentimentIntensityAnalyzer()
# for i in range(len(cleaned_data)):
#     curr_tweet = cleaned_data["cleaned_tweet"][i]
#     curr_score = sentiment_analyzer.polarity_scores(curr_tweet)
#     cleaned_data.loc[i, "compound_score"] = curr_score["compound"]
#     cleaned_data.loc[i, "positive_score"] = curr_score["pos"]
#     cleaned_data.loc[i, "negative_score"] = curr_score["neg"]
#     cleaned_data.loc[i, "neutral_score"] = curr_score["neu"]

# # ------------------------
# # 2. Aggregate Sentiment Scores
# # ------------------------
# daily_sentiment = pd.DataFrame({
#     'date': [data['date'].iloc[0]],  # Assuming all tweets are from the same day
#     'average_compound': [cleaned_data['compound_score'].mean()],
#     'average_positive': [cleaned_data['positive_score'].mean()],
#     'average_negative': [cleaned_data['negative_score'].mean()],
#     'average_neutral': [cleaned_data['neutral_score'].mean()],
#     'total_tweets': [len(cleaned_data)]
# })

# # ------------------------
# # 3. Load and Prepare Stock Data
# # ------------------------
# stock_data = pd.read_csv('TSLA_stock_data.csv')
# stock_data['Date'] = pd.to_datetime(stock_data['Date'])
# stock_data['Return'] = stock_data['Close'].pct_change()

# # ------------------------
# # 4. Align Data
# # ------------------------
# daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
# merged_data = pd.merge(daily_sentiment, stock_data, left_on='date', right_on='Date', how='inner')

# # ------------------------
# # 5. Experiment with Correlations
# # ------------------------
# # Same-day correlations
# correlations = {}
# for sentiment_metric in ['average_compound', 'average_positive', 'average_negative']:
#     correlation, p_value = spearmanr(merged_data[sentiment_metric], merged_data['Return'])
#     correlations[sentiment_metric] = {'correlation': correlation, 'p_value': p_value}

# # Lagged correlations
# merged_data['Lagged_Return'] = merged_data['Return'].shift(-1)
# for sentiment_metric in ['average_compound', 'average_positive', 'average_negative']:
#     correlation, p_value = spearmanr(merged_data[sentiment_metric], merged_data['Lagged_Return'])
#     correlations[f'{sentiment_metric}_lagged'] = {'correlation': correlation, 'p_value': p_value}

# # Display results
# for key, value in correlations.items():
#     print(f"{key}: Correlation = {value['correlation']}, p-value = {value['p_value']}")

#  ////////////////////////////////////////////////////////////////////////
#  ////////////////////////////////////////////////////////////////////////

import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from scipy.stats import spearmanr
import nltk
nltk.download('vader_lexicon')

# ------------------------
# 1. Preprocessing Tweets
# ------------------------
data = pd.read_csv('Tesla.csv')

cleaned_data = pd.DataFrame()
cleaned_data['original_tweet'] = data['tweet']
cleaned_data['cleaned_tweet'] = ""
cleaned_data['compound_score'] = 0.0
cleaned_data['positive_score'] = 0.0
cleaned_data['negative_score'] = 0.0
cleaned_data['neutral_score'] = 0.0

# Preprocess tweets
def preprocess_tweet(tweet):
    tweet = str(tweet)
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'@\w+', '', tweet)  # Remove mentions
    tweet = re.sub(r'#', '', tweet)  # Remove hashtags
    tweet = re.sub(r'[^\w\s]', '', tweet)  # Remove special characters
    tweet = re.sub(r'\d+', '', tweet)  # Remove numbers
    tweet = tweet.strip()  # Remove leading/trailing whitespace
    return tweet

cleaned_data['cleaned_tweet'] = data['tweet'].apply(preprocess_tweet)

# Sentiment analysis
sentiment_analyzer = SentimentIntensityAnalyzer()
for i in range(len(cleaned_data)):
    curr_tweet = cleaned_data["cleaned_tweet"][i]
    curr_score = sentiment_analyzer.polarity_scores(curr_tweet)
    cleaned_data.loc[i, "compound_score"] = curr_score["compound"]
    cleaned_data.loc[i, "positive_score"] = curr_score["pos"]
    cleaned_data.loc[i, "negative_score"] = curr_score["neg"]
    cleaned_data.loc[i, "neutral_score"] = curr_score["neu"]

# Ensure sentiment scores are numeric
for column in ['compound_score', 'positive_score', 'negative_score', 'neutral_score']:
    cleaned_data[column] = pd.to_numeric(cleaned_data[column], errors='coerce')
cleaned_data.fillna(0, inplace=True)  # Replace any NaN with 0

# ------------------------
# 2. Aggregate Sentiment Scores
# ------------------------
# Convert date to datetime and extract only the date
data['date'] = pd.to_datetime(data['date']).dt.date

daily_sentiment = pd.DataFrame({
    'date': [data['date'].iloc[0]],  # Assuming all tweets are from the same day
    'average_compound': [cleaned_data['compound_score'].mean()],
    'average_positive': [cleaned_data['positive_score'].mean()],
    'average_negative': [cleaned_data['negative_score'].mean()],
    'average_neutral': [cleaned_data['neutral_score'].mean()],
    'total_tweets': [len(cleaned_data)]
})

# ------------------------
# 3. Load and Prepare Stock Data
# ------------------------
stock_data = pd.read_csv('TSLA_stock_data.csv')
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# Ensure 'Close' column is numeric and compute returns
stock_data['Close'] = pd.to_numeric(stock_data['Close'], errors='coerce')
stock_data.fillna(method='ffill', inplace=True)  # Forward fill missing values
stock_data['Return'] = stock_data['Close'].pct_change()

# ------------------------
# 4. Align Data
# ------------------------
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])
merged_data = pd.merge(daily_sentiment, stock_data, left_on='date', right_on='Date', how='inner')

# ------------------------
# 5. Experiment with Correlations
# ------------------------
correlations = {}

# Same-day correlations
for sentiment_metric in ['average_compound', 'average_positive', 'average_negative']:
    if len(merged_data) > 1:  # Ensure enough data points for correlation
        correlation, p_value = spearmanr(merged_data[sentiment_metric], merged_data['Return'])
        correlations[sentiment_metric] = {'correlation': correlation, 'p_value': p_value}
    else:
        correlations[sentiment_metric] = {'correlation': None, 'p_value': None}

# Lagged correlations
merged_data['Lagged_Return'] = merged_data['Return'].shift(-1)
for sentiment_metric in ['average_compound', 'average_positive', 'average_negative']:
    if len(merged_data) > 1:  # Ensure enough data points for correlation
        correlation, p_value = spearmanr(merged_data[sentiment_metric], merged_data['Lagged_Return'])
        correlations[f'{sentiment_metric}_lagged'] = {'correlation': correlation, 'p_value': p_value}
    else:
        correlations[f'{sentiment_metric}_lagged'] = {'correlation': None, 'p_value': None}

# Display results
for key, value in correlations.items():
    print(f"{key}: Correlation = {value['correlation']}, p-value = {value['p_value']}")



[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\folan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


average_compound: Correlation = None, p-value = None
average_positive: Correlation = None, p-value = None
average_negative: Correlation = None, p-value = None
average_compound_lagged: Correlation = None, p-value = None
average_positive_lagged: Correlation = None, p-value = None
average_negative_lagged: Correlation = None, p-value = None


  stock_data.fillna(method='ffill', inplace=True)  # Forward fill missing values
