In [4]:
import pandas as pd
import numpy as np
import re
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
nltk.download('vader_lexicon')

# Function to get BERT sentiment
def get_bert_sentiment(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    return predictions[0].detach().numpy()

print("Loading data...")
tweets_data = pd.read_csv('Tesla.csv')
stock_data = pd.read_csv('TSLA_stock_data.csv')

# Basic tweet preprocessing
print("\nPreprocessing tweets...")
tweets_data['cleaned_tweet'] = tweets_data['tweet'].fillna('')
tweets_data['cleaned_tweet'] = tweets_data['tweet'].apply(lambda x: str(x))
tweets_data['cleaned_tweet'] = tweets_data['cleaned_tweet'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))
tweets_data['cleaned_tweet'] = tweets_data['cleaned_tweet'].apply(lambda x: re.sub(r'@\w+', '', x))
tweets_data['cleaned_tweet'] = tweets_data['cleaned_tweet'].apply(lambda x: re.sub(r'#', '', x))
tweets_data['created_at'] = pd.to_datetime(tweets_data['created_at'], unit='ms')
tweets_data['hour'] = tweets_data['created_at'].dt.hour

# VADER Analysis
print("\nPerforming VADER analysis...")
analyzer = SentimentIntensityAnalyzer()
vader_scores = []

for tweet in tweets_data['cleaned_tweet']:
    scores = analyzer.polarity_scores(tweet)
    vader_scores.append(scores)

tweets_data['vader_compound'] = [score['compound'] for score in vader_scores]
tweets_data['vader_positive'] = [score['pos'] for score in vader_scores]
tweets_data['vader_negative'] = [score['neg'] for score in vader_scores]
tweets_data['vader_neutral'] = [score['neu'] for score in vader_scores]

# Display VADER results
print("\nVADER Analysis Results:")
print("-" * 50)
print(f"Average Compound Score: {tweets_data['vader_compound'].mean():.3f}")
print(f"Average Positive Score: {tweets_data['vader_positive'].mean():.3f}")
print(f"Average Negative Score: {tweets_data['vader_negative'].mean():.3f}")
print(f"Average Neutral Score: {tweets_data['vader_neutral'].mean():.3f}")

# VADER visualization
plt.figure(figsize=(10, 6))
sns.histplot(data=tweets_data, x='vader_compound', bins=30)
plt.title('Distribution of VADER Compound Scores')
plt.xlabel('Compound Score')
plt.ylabel('Count')
plt.savefig('vader_distribution.png')
plt.close()

# BERT Analysis
print("\nLoading BERT model...")
try:
    tokenizer = AutoTokenizer.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
    model = AutoModelForSequenceClassification.from_pretrained("nlptown/bert-base-multilingual-uncased-sentiment")
    
    print("\nPerforming BERT analysis...")
    bert_scores = []
    batch_size = 8  # Smaller batch size for memory efficiency
    
    for i in range(0, len(tweets_data), batch_size):
        batch_tweets = tweets_data['cleaned_tweet'][i:i+batch_size].tolist()
        batch_scores = []
        for tweet in batch_tweets:
            sentiment_scores = get_bert_sentiment(tweet, tokenizer, model)
            # Convert 1-5 scale to -1 to 1 scale for comparison with VADER
            bert_score = (np.argmax(sentiment_scores) + 1 - 3) / 2
            batch_scores.append(bert_score)
        bert_scores.extend(batch_scores)
        if i % 100 == 0:
            print(f"Processed {i}/{len(tweets_data)} tweets")
    
    tweets_data['bert_score'] = bert_scores
    
    # Display BERT results
    print("\nBERT Analysis Results:")
    print("-" * 50)
    print(f"Average BERT Score: {tweets_data['bert_score'].mean():.3f}")
    
    # BERT visualization
    plt.figure(figsize=(10, 6))
    sns.histplot(data=tweets_data, x='bert_score', bins=30)
    plt.title('Distribution of BERT Sentiment Scores')
    plt.xlabel('Sentiment Score')
    plt.ylabel('Count')
    plt.savefig('bert_distribution.png')
    plt.close()
    
    # Compare VADER and BERT
    correlation = tweets_data['vader_compound'].corr(tweets_data['bert_score'])
    print(f"\nCorrelation between VADER and BERT scores: {correlation:.3f}")
    
except Exception as e:
    print(f"\nError during BERT analysis: {str(e)}")
    print("Continuing with VADER analysis only...")

# Stock Price Analysis
print("\nAnalyzing relationship with stock prices...")
stock_data['hour'] = stock_data['time'].str.split(':').str[0].astype(int)
stock_data['Return'] = stock_data['price'].pct_change()

# Aggregate sentiment by hour
hourly_sentiment = tweets_data.groupby('hour').agg({
    'vader_compound': 'mean',
    'cleaned_tweet': 'count'
}).reset_index()

merged_data = pd.merge(hourly_sentiment, stock_data, on='hour', how='inner')

# Calculate correlation with stock returns
vader_corr, vader_p = spearmanr(merged_data['vader_compound'], merged_data['Return'])

print("\nStock Return Correlations:")
print("-" * 50)
print(f"VADER Correlation: {vader_corr:.3f} (p-value: {vader_p:.3f})")

# Display most extreme examples
print("\nMost Extreme Examples (VADER):")
print("-" * 50)
print("\nMost Positive Tweet:")
print(tweets_data.loc[tweets_data['vader_compound'].idxmax(), 'cleaned_tweet'])
print(f"VADER score: {tweets_data['vader_compound'].max():.3f}")

print("\nMost Negative Tweet:")
print(tweets_data.loc[tweets_data['vader_compound'].idxmin(), 'cleaned_tweet'])
print(f"VADER score: {tweets_data['vader_compound'].min():.3f}")

# Save results
print("\nSaving results...")
tweets_data.to_csv('sentiment_analysis_results.csv', index=False)

print("\nAnalysis complete! Results have been saved to CSV and visualizations to PNG files.")

# //////////////////////////////////////
# Spearman Correlation

# Stock Price Analysis
print("\nAnalyzing relationship with stock prices...")

# Prepare stock data
stock_data['hour'] = stock_data['time'].str.split(':').str[0].astype(int)
stock_hours = sorted(stock_data['hour'].unique())
print("\nStock data hours available:", stock_hours)

# Calculate returns
stock_data['Return'] = stock_data['price'].pct_change()

# Aggregate sentiment by hour
hourly_sentiment = tweets_data.groupby('hour').agg({
    'vader_compound': 'mean',
    'vader_positive': 'mean',
    'vader_negative': 'mean',
    'cleaned_tweet': 'count'
}).reset_index()

sentiment_hours = sorted(hourly_sentiment['hour'].unique())
print("Sentiment data hours available:", sentiment_hours)

# Find overlapping hours
overlapping_hours = sorted(set(stock_hours) & set(sentiment_hours))
print("\nOverlapping hours (13-17):", overlapping_hours)

# Filter data for overlapping hours only
stock_data_filtered = stock_data[stock_data['hour'].isin(overlapping_hours)]
sentiment_data_filtered = hourly_sentiment[hourly_sentiment['hour'].isin(overlapping_hours)]

# Merge hourly sentiment with stock data
merged_data = pd.merge(stock_data_filtered, sentiment_data_filtered, on='hour', how='inner')

print("\nHourly Analysis (13:00 - 17:00):")
print("-" * 65)
print("Hour | Price | Return  | Sentiment | Tweet Count")
print("-" * 65)
for _, row in merged_data.iterrows():
    print(f"{row['hour']:02d}:00 | {row['price']:6.2f} | {row['Return']:7.3%} | {row['vader_compound']:9.3f} | {int(row['cleaned_tweet']):11d}")

# Calculate correlations
if len(merged_data) > 1:
    vader_corr, vader_p = spearmanr(merged_data['vader_compound'], 
                                   merged_data['Return'], 
                                   nan_policy='omit')
    
    print("\nStock Return Correlations (13:00 - 17:00):")
    print("-" * 50)
    print(f"VADER Compound Score vs Returns:")
    print(f"  Correlation: {vader_corr:.3f}")
    print(f"  p-value: {vader_p:.3f}")
    print(f"  Number of hours analyzed: {len(merged_data)}")
    
    # Additional correlations
    pos_corr, pos_p = spearmanr(merged_data['vader_positive'], merged_data['Return'])
    neg_corr, neg_p = spearmanr(merged_data['vader_negative'], merged_data['Return'])
    
    print("\nAdditional correlations:")
    print(f"Positive sentiment vs Returns: {pos_corr:.3f} (p-value: {pos_p:.3f})")
    print(f"Negative sentiment vs Returns: {neg_corr:.3f} (p-value: {neg_p:.3f})")
else:
    print("\nNot enough overlapping data points for correlation analysis")

# Visualize the relationship
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)

# Plot 1: Price and Sentiment
ax1 = plt.gca()
ax2 = ax1.twinx()

# Plot stock price
line1 = ax1.plot(merged_data['hour'], merged_data['price'], 
                 color='blue', label='Stock Price')
ax1.set_xlabel('Hour')
ax1.set_ylabel('Stock Price', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Plot sentiment
line2 = ax2.plot(merged_data['hour'], merged_data['vader_compound'], 
                 color='red', linestyle='--', label='Sentiment')
ax2.set_ylabel('Sentiment Score', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Add legend
lines = line1 + line2
labels = [l.get_label() for l in lines]
ax1.legend(lines, labels, loc='upper left')

plt.title('Stock Price and Sentiment During Trading Hours')

# Plot 2: Scatter plot of Returns vs Sentiment
plt.subplot(1, 2, 2)
plt.scatter(merged_data['vader_compound'], merged_data['Return'])
plt.xlabel('Sentiment Score')
plt.ylabel('Stock Return')
plt.title('Returns vs Sentiment Correlation')

# Add trend line
z = np.polyfit(merged_data['vader_compound'], merged_data['Return'], 1)
p = np.poly1d(z)
plt.plot(merged_data['vader_compound'], 
         p(merged_data['vader_compound']), 
         "r--", alpha=0.8)

plt.tight_layout()
plt.savefig('trading_hours_analysis.png')
plt.close()

# Print summary statistics
print("\nSummary Statistics (13:00 - 17:00):")
print("-" * 50)
print(f"Average sentiment: {merged_data['vader_compound'].mean():.3f}")
print(f"Average stock return: {merged_data['Return'].mean():.3%}")
print(f"Total tweets analyzed: {merged_data['cleaned_tweet'].sum()}")
print(f"Average tweets per hour: {merged_data['cleaned_tweet'].mean():.1f}")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\folan\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Loading data...

Preprocessing tweets...

Performing VADER analysis...

VADER Analysis Results:
--------------------------------------------------
Average Compound Score: 0.061
Average Positive Score: 0.088
Average Negative Score: 0.054
Average Neutral Score: 0.844

Loading BERT model...


  with pd.option_context('mode.use_inf_as_na', True):



Error during BERT analysis: 
AutoModelForSequenceClassification requires the PyTorch library but it was not found in your environment.
However, we were able to find a TensorFlow installation. TensorFlow classes begin
with "TF", but are otherwise identically named to our PyTorch classes. This
means that the TF equivalent of the class you tried to import would be "TFAutoModelForSequenceClassification".
If you want to use TensorFlow, please use TF classes instead!

If you really do want to use PyTorch please go to
https://pytorch.org/get-started/locally/ and follow the instructions that
match your environment.

Continuing with VADER analysis only...

Analyzing relationship with stock prices...


KeyError: 'time'