Question 2: What is the correlation between the number of times a stock is discussed on the WSJ and its stock price? (could also do sentiment analysis)
- Data from: https://www.kaggle.com/datasets/marta99/elon-musks-tweets-dataset-2022?resource=download
- Dogecoin history from: https://www.kaggle.com/datasets/dhruvildave/dogecoin-historical-data/data

In [1]:
import pandas as pd
import plotly.express as px 
from scipy.stats import pearsonr, chi2_contingency
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime, date


In [2]:
# read twitter data for each day 
twitter_data = pd.read_csv('cleandata.csv')
twitter_data['Date'] = pd.to_datetime(twitter_data['Date'])


def filter_2022(df):
    df = df[df['Date'].dt.year == 2022]
    return df


twitter_data.pipe(filter_2022)


Unnamed: 0,Tweets,Retweets,Likes,Date,Cleaned_Tweets
0,@PeterSchiff 🤣 thanks,209,7021,2022-10-27 16:17:39,thanks
1,@ZubyMusic Absolutely,755,26737,2022-10-27 13:19:25,Absolutely
2,Dear Twitter Advertisers https://t.co/GMwHmInPAS,55927,356623,2022-10-27 13:08:00,Dear Twitter Advertisers
3,Meeting a lot of cool people at Twitter today!,9366,195546,2022-10-26 21:39:32,Meeting a lot of cool people at Twitter today!
4,Entering Twitter HQ – let that sink in! https:...,145520,1043592,2022-10-26 18:45:58,Entering Twitter HQ – let that sink in!
...,...,...,...,...,...
2663,@LimitingThe @baglino Just that manganese is a...,171,3173,2022-01-27 22:01:06,Just that manganese is an alternative to iron ...
2664,@incentives101 @ICRicardoLara Exactly,145,4234,2022-01-27 21:23:20,Exactly
2665,@ICRicardoLara Your policies are directly resp...,421,6144,2022-01-27 21:13:57,Your policies are directly responsible for the...
2666,@ICRicardoLara You should be voted out of office,484,7029,2022-01-27 21:12:27,You should be voted out of office


In [3]:
# returns the positive sentiment intensity for a sentence
def pos_score(sentence: str):
    sentence_analyzer = SentimentIntensityAnalyzer()
    temp_sentence = sentence
    sentiment_result = sentence_analyzer.polarity_scores(temp_sentence)
    return (sentiment_result['pos'])

# returns the negative sentiment intensity for a sentence
def neg_score(sentence: str):
    sentence_analyzer = SentimentIntensityAnalyzer()
    temp_sentence = sentence
    sentiment_result = sentence_analyzer.polarity_scores(temp_sentence)
    return (sentiment_result['neg'])

# returns the neutral sentiment intensity for a sentence
def neu_score(sentence: str):
    sentence_analyzer = SentimentIntensityAnalyzer()
    temp_sentence = sentence
    sentiment_result = sentence_analyzer.polarity_scores(temp_sentence)
    return (sentiment_result['neu'])

In [4]:
# apply pos_score to each tweet
twitter_data['pos_score'] = twitter_data['Cleaned_Tweets'].apply(pos_score)
twitter_data['neg_score'] = twitter_data['Cleaned_Tweets'].apply(neg_score)
twitter_data['neu_score'] = twitter_data['Cleaned_Tweets'].apply(neu_score)

In [5]:
print(twitter_data)

                                                 Tweets  Retweets    Likes  \
0                                 @PeterSchiff 🤣 thanks       209     7021   
1                                 @ZubyMusic Absolutely       755    26737   
2      Dear Twitter Advertisers https://t.co/GMwHmInPAS     55927   356623   
3        Meeting a lot of cool people at Twitter today!      9366   195546   
4     Entering Twitter HQ – let that sink in! https:...    145520  1043592   
...                                                 ...       ...      ...   
2663  @LimitingThe @baglino Just that manganese is a...       171     3173   
2664              @incentives101 @ICRicardoLara Exactly       145     4234   
2665  @ICRicardoLara Your policies are directly resp...       421     6144   
2666   @ICRicardoLara You should be voted out of office       484     7029   
2667         CB radios are free from govt/media control     11302   113429   

                    Date                                     Cl

In [7]:
twitter_data['Date'] = pd.to_datetime(twitter_data['Date']).dt.date
grouped_data = twitter_data.groupby('Date').agg({'pos_score': 'sum', 'neg_score': 'sum', 'neu_score': 'sum'}).reset_index()

print(grouped_data)

           Date  pos_score  neg_score  neu_score
0    2022-01-27      1.109      1.007      9.882
1    2022-01-28      1.499      0.171      8.330
2    2022-01-29      1.929      0.206      4.865
3    2022-01-30      1.682      0.589     10.728
4    2022-01-31      0.162      0.134      4.704
..          ...        ...        ...        ...
246  2022-10-23      1.995      0.000      9.005
247  2022-10-24      5.013      1.113     10.873
248  2022-10-25      1.779      0.160      8.061
249  2022-10-26      2.140      0.103      8.757
250  2022-10-27      1.565      0.000      1.435

[251 rows x 4 columns]


In [8]:
# Graph Twitter Data
fig = px.bar(grouped_data, x='Date', y=['pos_score','neg_score','neu_score'])
fig.show()

In [9]:
# Get Dogecoin historical price data 
dogecoin_data = pd.read_csv('DOGE-USD.csv')
dogecoin_data['Date'] = pd.to_datetime(dogecoin_data['Date']).dt.date


# merge dogecoin data onto musk tweet data
merged_data = grouped_data.merge(dogecoin_data, on='Date')
print(merged_data)

           Date  pos_score  neg_score  neu_score      Open      High  \
0    2022-01-27      1.109      1.007      9.882  0.143756  0.145010   
1    2022-01-28      1.499      0.171      8.330  0.141244  0.142413   
2    2022-01-29      1.929      0.206      4.865  0.141649  0.143984   
3    2022-01-30      1.682      0.589     10.728  0.143057  0.143515   
4    2022-01-31      0.162      0.134      4.704  0.139469  0.142264   
..          ...        ...        ...        ...       ...       ...   
246  2022-10-23      1.995      0.000      9.005  0.059683  0.060379   
247  2022-10-24      5.013      1.113     10.873  0.060341  0.060814   
248  2022-10-25      1.779      0.160      8.061  0.059502  0.064049   
249  2022-10-26      2.140      0.103      8.757  0.062856  0.073274   
250  2022-10-27      1.565      0.000      1.435  0.072799  0.084825   

          Low     Close  Adj Close        Volume  
0    0.136947  0.141247   0.141247  6.087076e+08  
1    0.138380  0.141656   0.14165

In [10]:
# Plot dogecoin data and the positive sentiment of Musk's tweets 
fig2 = px.scatter(merged_data, x='Close', y='pos_score')
fig2.show()
