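Question 2: What is the correlation between the number of times a stock is discussed on the WSJ and its stock price? (Could also do sentiment analysis.)

This notebook approaches the question using Elon Musk's 2022 tweets that mention Dogecoin: each tweet is scored for sentiment, and the daily scores are compared with Dogecoin's historical price data.
- Tweet data from: https://www.kaggle.com/datasets/marta99/elon-musks-tweets-dataset-2022?resource=download
- Dogecoin history from: https://www.kaggle.com/datasets/dhruvildave/dogecoin-historical-data/data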

In [2]:
import pandas as pd
import plotly.express as px 
from scipy.stats import pearsonr, chi2_contingency
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime, date


In [3]:
# Load the cleaned tweet export and parse its 'Date' column into pandas datetimes
twitter_data = pd.read_csv('cleandata.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)


def filter_2022(df):
    """Return the rows of ``df`` whose ``Date`` lies in calendar year 2022.

    ``df`` needs a datetime-typed ``Date`` column, because the ``.dt``
    accessor is used to read the year. The input frame is left untouched;
    the filtered rows are returned as a new frame that keeps the original
    index labels.
    """
    in_2022 = df['Date'].dt.year.eq(2022)
    return df.loc[in_2022]


# Keep only the 2022 tweets. `.pipe` returns a new frame, so the result has to be
# bound back to `twitter_data`; otherwise the filter is only displayed and the
# later cells keep working on the unfiltered data. `.copy()` makes the result an
# independent frame, so the score columns assigned later are not written to a slice.
twitter_data = twitter_data.pipe(filter_2022).copy()
twitter_data


Unnamed: 0,Tweets,Retweets,Likes,Date,Cleaned_Tweets
0,@PeterSchiff 🤣 thanks,209,7021,2022-10-27 16:17:39,thanks
1,@ZubyMusic Absolutely,755,26737,2022-10-27 13:19:25,Absolutely
2,Dear Twitter Advertisers https://t.co/GMwHmInPAS,55927,356623,2022-10-27 13:08:00,Dear Twitter Advertisers
3,Meeting a lot of cool people at Twitter today!,9366,195546,2022-10-26 21:39:32,Meeting a lot of cool people at Twitter today!
4,Entering Twitter HQ – let that sink in! https:...,145520,1043592,2022-10-26 18:45:58,Entering Twitter HQ – let that sink in!
...,...,...,...,...,...
2663,@LimitingThe @baglino Just that manganese is a...,171,3173,2022-01-27 22:01:06,Just that manganese is an alternative to iron ...
2664,@incentives101 @ICRicardoLara Exactly,145,4234,2022-01-27 21:23:20,Exactly
2665,@ICRicardoLara Your policies are directly resp...,421,6144,2022-01-27 21:13:57,Your policies are directly responsible for the...
2666,@ICRicardoLara You should be voted out of office,484,7029,2022-01-27 21:12:27,You should be voted out of office


In [4]:
# Shared VADER analyzer, created on first use. Constructing a new
# SentimentIntensityAnalyzer inside every call repeated the analyzer's setup
# work for each tweet and for each of the three scores.
_SENTIMENT_ANALYZER = None


def _polarity(sentence: str) -> dict:
    """Return the analyzer's ``polarity_scores`` result for ``sentence``.

    The analyzer is built lazily the first time this is called and then
    reused for every later call.
    """
    global _SENTIMENT_ANALYZER
    if _SENTIMENT_ANALYZER is None:
        _SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
    return _SENTIMENT_ANALYZER.polarity_scores(sentence)


# returns the positive sentiment intensity for a sentence
def pos_score(sentence: str):
    """Return the ``'pos'`` entry of the polarity scores for ``sentence``."""
    return _polarity(sentence)['pos']


# returns the negative sentiment intensity for a sentence
def neg_score(sentence: str):
    """Return the ``'neg'`` entry of the polarity scores for ``sentence``."""
    return _polarity(sentence)['neg']


# returns the neutral sentiment intensity for a sentence
def neu_score(sentence: str):
    """Return the ``'neu'`` entry of the polarity scores for ``sentence``."""
    return _polarity(sentence)['neu']

In [5]:
# Run each scorer (positive, negative, neutral) over every cleaned tweet and
# store its result in a column of the same name
score_functions = {
    'pos_score': pos_score,
    'neg_score': neg_score,
    'neu_score': neu_score,
}
for column_name, score_function in score_functions.items():
    twitter_data[column_name] = twitter_data['Cleaned_Tweets'].apply(score_function)

In [6]:
# Reduce the timestamps to calendar dates (datetime.date objects) so tweets can
# be grouped per day and joined to the daily price data
twitter_data['Date'] = pd.to_datetime(twitter_data['Date']).dt.date

# Filter for tweets mentioning dogecoin (case-insensitive substring match).
# na=False makes a missing tweet text count as "no match"; without it the mask
# would contain NaN and the boolean indexing below would raise.
# regex=False because 'doge' is a plain substring, not a pattern.
mentions_doge = twitter_data['Tweets'].str.lower().str.contains('doge', regex=False, na=False)
twitter_data = twitter_data[mentions_doge]
print(twitter_data.shape)

(52, 8)


In [7]:
# Total each sentiment component per calendar day; reset_index turns the 'Date'
# group key back into an ordinary column
score_columns = ['pos_score', 'neg_score', 'neu_score']
grouped_data = twitter_data.groupby('Date')[score_columns].sum().reset_index()

print(grouped_data)

          Date  pos_score  neg_score  neu_score
0   2022-02-21      0.577      0.000      0.423
1   2022-03-02      0.000      0.000      1.000
2   2022-03-07      0.516      0.000      0.484
3   2022-03-14      0.153      0.000      0.847
4   2022-04-10      0.000      0.167      0.833
5   2022-04-26      0.224      0.000      0.776
6   2022-04-28      0.326      0.000      0.674
7   2022-05-03      0.917      0.000      2.083
8   2022-05-17      0.390      0.000      0.610
9   2022-05-25      0.000      0.000      1.000
10  2022-05-26      1.000      0.000      1.000
11  2022-05-27      0.000      0.000      1.000
12  2022-05-31      1.283      0.000      1.717
13  2022-06-16      0.000      0.000      1.000
14  2022-06-19      0.420      0.000      0.580
15  2022-07-06      0.492      0.000      0.508
16  2022-07-07      0.287      0.000      0.713
17  2022-07-12      0.585      0.000      0.415
18  2022-07-13      0.000      0.000      1.000
19  2022-07-15      0.179      0.000    

In [8]:
# Bar chart of the daily sentiment totals, one series per score column
sentiment_columns = ['pos_score', 'neg_score', 'neu_score']
fig = px.bar(data_frame=grouped_data, x='Date', y=sentiment_columns)
fig.show()

In [9]:
# Load the Dogecoin price history and reduce its 'Date' column to datetime.date
# values, the same type the tweet data uses for its 'Date' column
dogecoin_data = pd.read_csv('DOGE-USD.csv').assign(
    Date=lambda prices: pd.to_datetime(prices['Date']).dt.date
)


# Join the prices onto the daily tweet sentiment. No `how` is given, so pandas'
# default inner join applies and only dates present in both frames remain.
merged_data = pd.merge(grouped_data, dogecoin_data, on='Date')
print(merged_data)

          Date  pos_score  neg_score  neu_score      Open      High       Low  \
0   2022-02-21      0.577      0.000      0.423  0.136838  0.141345  0.128246   
1   2022-03-02      0.000      0.000      1.000  0.133964  0.136214  0.131232   
2   2022-03-07      0.516      0.000      0.484  0.120769  0.122563  0.115015   
3   2022-03-14      0.153      0.000      0.847  0.111607  0.118967  0.110817   
4   2022-04-10      0.000      0.167      0.833  0.144304  0.156972  0.143863   
5   2022-04-26      0.224      0.000      0.776  0.157777  0.165278  0.137424   
6   2022-04-28      0.326      0.000      0.674  0.139691  0.142618  0.136058   
7   2022-05-03      0.917      0.000      2.083  0.130935  0.131833  0.127399   
8   2022-05-17      0.390      0.000      0.610  0.087836  0.091112  0.087484   
9   2022-05-25      0.000      0.000      1.000  0.083628  0.084338  0.082319   
10  2022-05-26      1.000      0.000      1.000  0.082994  0.083600  0.076065   
11  2022-05-27      0.000   

In [1]:
# Create column for Price Change as close minus open, so a positive value means
# the price rose over the day (open minus close would invert the sign)
merged_data['Price Change'] = merged_data['Close'] - merged_data['Open']

# Plot the daily price change against the positive sentiment of Musk's tweets.
# No `color` argument is passed: merged_data is built from the per-day sentiment
# totals plus the price columns and has no 'Cleaned_Tweets' column, so colouring
# by it makes px.scatter fail. The title names the quantity actually plotted.
fig2 = px.scatter(
    merged_data,
    x='pos_score',
    y='Price Change',
    title='Positivity of Elon Musk Tweets vs. Dogecoin Daily Price Change',
)
fig2.show()

# Pearson test
# NOTE(review): this correlates pos_score with trading 'Volume', not with the
# 'Price Change' plotted above — confirm which relationship is intended.
correlation_coefficient, p_value = pearsonr(merged_data['pos_score'], merged_data['Volume'])
print("Pearson correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)


NameError: name 'merged_data' is not defined