# In [None]:
import pandas as pd
import numpy as np
import re
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Download the 'vader_lexicon' NLTK resource before any analysis runs.
nltk.download('vader_lexicon')

def get_bert_sentiment(text, tokenizer, model):
    """Return the class-probability vector the model assigns to ``text``.

    Args:
        text: The string to classify.
        tokenizer: Callable that turns ``text`` into the keyword inputs of
            ``model``. It is called with ``return_tensors="pt"``, truncation
            and padding enabled, and ``max_length=512``.
        model: Callable whose result exposes a ``logits`` tensor with the
            batch on the first axis.

    Returns:
        A NumPy array holding the softmax of the first row of logits.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Inference only: skip autograd bookkeeping so no graph is built per call.
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # .cpu() is a no-op for CPU tensors and keeps .numpy() valid otherwise.
    return predictions[0].detach().cpu().numpy()

# Load the two CSV inputs used by the rest of the script.
print("Loading data...")
tweets_data = pd.read_csv('Tesla.csv')
stock_data = pd.read_csv('TSLA_stock_data.csv')

# Basic tweet preprocessing
print("\nPreprocessing tweets...")
# Missing tweets become empty strings. The replacement has to happen before
# the string conversion, otherwise NaN is stringified to the literal 'nan'
# and later scored as if it were tweet text.
cleaned = tweets_data['tweet'].fillna('').astype(str)
# The substitutions run one after another (URLs, then @mentions, then '#'
# signs); a single combined pattern would match differently on some inputs.
for pattern in (r'http\S+|www\S+|https\S+', r'@\w+', r'#'):
    cleaned = cleaned.str.replace(pattern, '', regex=True)
tweets_data['cleaned_tweet'] = cleaned
# NOTE(review): unit='ms' assumes 'created_at' holds epoch milliseconds - confirm against the CSV.
tweets_data['created_at'] = pd.to_datetime(tweets_data['created_at'], unit='ms')
tweets_data['hour'] = tweets_data['created_at'].dt.hour

# VADER Analysis
print("\nPerforming VADER analysis...")
analyzer = SentimentIntensityAnalyzer()
# One polarity dict per cleaned tweet, kept in row order.
vader_scores = [analyzer.polarity_scores(text) for text in tweets_data['cleaned_tweet']]

# Spread each polarity component into its own column.
for column, key in (
    ('vader_compound', 'compound'),
    ('vader_positive', 'pos'),
    ('vader_negative', 'neg'),
    ('vader_neutral', 'neu'),
):
    tweets_data[column] = [entry[key] for entry in vader_scores]

# Display VADER results
# Column means are computed once up front, then printed in a fixed order.
vader_means = {
    name: tweets_data[name].mean()
    for name in ('vader_compound', 'vader_positive', 'vader_negative', 'vader_neutral')
}
print("\nVADER Analysis Results:")
print("-" * 50)
print(f"Average Compound Score: {vader_means['vader_compound']:.3f}")
print(f"Average Positive Score: {vader_means['vader_positive']:.3f}")
print(f"Average Negative Score: {vader_means['vader_negative']:.3f}")
print(f"Average Neutral Score: {vader_means['vader_neutral']:.3f}")

# VADER visualization
# Histogram of the compound score, written to disk and closed without showing.
vader_fig, vader_ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=tweets_data, x='vader_compound', bins=30, ax=vader_ax)
vader_ax.set_title('Distribution of VADER Compound Scores')
vader_ax.set_xlabel('Compound Score')
vader_ax.set_ylabel('Count')
vader_fig.savefig('vader_distribution.png')
plt.close(vader_fig)

# BERT Analysis
print("\nLoading BERT model...")
# Best-effort stage: any failure here (model download, inference, plotting)
# is reported and the script carries on with the VADER results alone.
try:
    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    print("\nPerforming BERT analysis...")
    bert_scores = []
    total_tweets = len(tweets_data)
    progress_every = 100
    # Tweets are scored one at a time through get_bert_sentiment.
    # Autograd is disabled because this is inference only.
    with torch.no_grad():
        for position, tweet in enumerate(tweets_data['cleaned_tweet']):
            # Report before scoring so the count equals tweets already done.
            if position % progress_every == 0:
                print(f"Processed {position}/{total_tweets} tweets")
            sentiment_scores = get_bert_sentiment(tweet, tokenizer, model)
            # Convert 1-5 scale to -1 to 1 scale for comparison with VADER
            star_rating = np.argmax(sentiment_scores) + 1
            bert_scores.append((star_rating - 3) / 2)

    tweets_data['bert_score'] = bert_scores

    # Display BERT results
    print("\nBERT Analysis Results:")
    print("-" * 50)
    print(f"Average BERT Score: {tweets_data['bert_score'].mean():.3f}")

    # BERT visualization
    plt.figure(figsize=(10, 6))
    sns.histplot(data=tweets_data, x='bert_score', bins=30)
    plt.title('Distribution of BERT Sentiment Scores')
    plt.xlabel('Sentiment Score')
    plt.ylabel('Count')
    plt.savefig('bert_distribution.png')
    plt.close()

    # Compare VADER and BERT
    correlation = tweets_data['vader_compound'].corr(tweets_data['bert_score'])
    print(f"\nCorrelation between VADER and BERT scores: {correlation:.3f}")

except Exception as e:
    print(f"\nError during BERT analysis: {str(e)}")
    print("Continuing with VADER analysis only...")

# Stock Price Analysis
print("\nAnalyzing relationship with stock prices...")
# The hour is the integer before the first ':' of the 'time' column.
stock_data['hour'] = stock_data['time'].str.split(':').str[0].astype(int)
stock_data['Return'] = stock_data['price'].pct_change()

# Aggregate sentiment by hour
hourly_sentiment = tweets_data.groupby('hour').agg({
    'vader_compound': 'mean',
    'cleaned_tweet': 'count'
}).reset_index()

# NOTE(review): the join key is hour-of-day only, so tweets and prices from
# different dates that share an hour are paired - confirm this is intended.
merged_data = pd.merge(hourly_sentiment, stock_data, on='hour', how='inner')

# Calculate correlation with stock returns
# pct_change() leaves NaN in the first row (there is no previous price) and
# spearmanr propagates NaN by default, which would make the whole result NaN.
# Only complete pairs are correlated.
valid_pairs = merged_data.dropna(subset=['vader_compound', 'Return'])
if len(valid_pairs) >= 2:
    vader_corr, vader_p = spearmanr(valid_pairs['vader_compound'], valid_pairs['Return'])
else:
    # A rank correlation is undefined for fewer than two observations.
    vader_corr, vader_p = float('nan'), float('nan')

print("\nStock Return Correlations:")
print("-" * 50)
print(f"VADER Correlation: {vader_corr:.3f} (p-value: {vader_p:.3f})")

# Display most extreme examples
# Show the tweets with the highest and lowest VADER compound score.
compound_scores = tweets_data['vader_compound']
print("\nMost Extreme Examples (VADER):")
print("-" * 50)

print("\nMost Positive Tweet:")
most_positive_row = compound_scores.idxmax()
print(tweets_data.loc[most_positive_row, 'cleaned_tweet'])
highest_score = compound_scores.max()
print(f"VADER score: {highest_score:.3f}")

print("\nMost Negative Tweet:")
most_negative_row = compound_scores.idxmin()
print(tweets_data.loc[most_negative_row, 'cleaned_tweet'])
lowest_score = compound_scores.min()
print(f"VADER score: {lowest_score:.3f}")

# Save results
# Persist the tweet table with every column added above, without the index.
print("\nSaving results...")
tweets_data.to_csv('sentiment_analysis_results.csv', index=False)

print("\nAnalysis complete! Results have been saved to CSV and visualizations to PNG files.")