In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import sent_tokenize

nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/ishalyminov/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [3]:
TWEETS_FILE = 'elonmusk_tweets.csv'
tweet_data = pd.read_csv(TWEETS_FILE)

STOCK_FILE = 'MacroTrends_Data_Download_TSLA.csv'
stock_data = pd.read_csv(STOCK_FILE)

In [4]:
stock_data[:-10]

Unnamed: 0,date,open,high,low,close,volume
0,2010-06-29,19.0000,25.0000,17.5400,23.89,18766300
1,2010-06-30,25.7900,30.4192,23.3000,23.83,17187100
2,2010-07-01,25.0000,25.9200,20.2700,21.96,8218800
3,2010-07-02,23.0000,23.1000,18.7100,19.20,5139800
4,2010-07-06,20.0000,20.0000,15.8300,16.11,6866900
5,2010-07-07,16.4000,16.6300,14.9800,15.80,6921700
6,2010-07-08,16.1400,17.5200,15.5700,17.46,7711400
7,2010-07-09,17.5800,17.9000,16.5500,17.40,4050600
8,2010-07-12,17.9500,18.0700,17.0000,17.05,2202500
9,2010-07-13,17.3938,18.6400,16.9000,18.14,2680100


In [5]:
def annotate_sentiment(in_df):
    sent = SentimentIntensityAnalyzer()
    tweet_sentiment = [sent.polarity_scores(sentence) for sentence in in_df['text']]
    data_modified = in_df.copy()
    data_modified['neg'] = [item['neg'] for item in tweet_sentiment]
    data_modified['neu'] = [item['neu'] for item in tweet_sentiment]
    data_modified['pos'] = [item['pos'] for item in tweet_sentiment]
    data_modified['sentiment_polarity'] = [item['pos'] - item['neg'] for item in tweet_sentiment]
    data_modified['compound'] = [item['compound'] for item in tweet_sentiment]
    return data_modified

In [6]:
def annotate_stock(in_tweet_df, in_stock_df):
    data_modified = in_tweet_df.copy()
    stock_price = []
    missing_count = 0
    for date in in_tweet_df['created_on']:
        if date in in_stock_df['date'].values:
            row = in_stock_df[in_stock_df['date'] == date].iloc[0]
            stock_price.append(np.mean([row['high'], row['low']]))
        else:
            missing_count += 1
            stock_price.append(None)
    print(missing_count)
    data_modified['stock_avg'] = stock_price
    return data_modified

In [7]:
def annotate_textual_features(in_tweet_df):
    data_modified = in_tweet_df.copy()
    word_count = []
    sent_count = []
    unique_words = []
    mentions = []
    is_rt = []
    tesla_mentioned = []

    tweet_tokenizer = TweetTokenizer()
    for tweet in in_tweet_df['text']:
        words = tweet_tokenizer.tokenize(tweet)
        word_count.append(len(words))
        sent_count.append(len(sent_tokenize(tweet)))
        unique_words.append(len(set(words)))
        mentions.append(sum([chr == '@' for chr in tweet]))
        is_rt.append(tweet.strip()[:2] == ['RT'])
        tesla_mentioned.append('tesla' in tweet.lower())
    data_modified['word_count'] = word_count
    data_modified['sent_count'] = sent_count
    data_modified['unique_words'] = unique_words
    data_modified['mentions'] = mentions
    data_modified['is_rt'] = is_rt
    data_modified['tesla_mentioned'] = tesla_mentioned

    return data_modified

In [8]:
tweets_with_sentiment = annotate_sentiment(tweet_data)

In [9]:
tweets_with_sentiment['created_on'] = [date_time.partition(' ')[0]
                                       for date_time in tweets_with_sentiment['created_at']]

In [10]:
tweets_with_sentiment_and_stock = annotate_stock(tweets_with_sentiment, stock_data)

907


In [11]:
tweets_all = annotate_textual_features(tweets_with_sentiment_and_stock)

In [29]:
tweets_all = tweets_all[np.isnan(tweets_all['stock_avg'])]

SyntaxError: invalid syntax (<ipython-input-29-a7fdf7a7cb99>, line 1)

In [27]:
tweets_all['stock_avg']

4      NaN
5      NaN
6      NaN
7      NaN
8      NaN
9      NaN
10     NaN
11     NaN
12     NaN
39     NaN
103    NaN
113    NaN
114    NaN
115    NaN
116    NaN
117    NaN
118    NaN
119    NaN
120    NaN
121    NaN
122    NaN
123    NaN
137    NaN
138    NaN
139    NaN
146    NaN
147    NaN
148    NaN
149    NaN
150    NaN
        ..
1875   NaN
1876   NaN
1886   NaN
1887   NaN
1891   NaN
1902   NaN
1905   NaN
1906   NaN
1907   NaN
1908   NaN
1909   NaN
1910   NaN
1911   NaN
1912   NaN
1913   NaN
1914   NaN
1915   NaN
1916   NaN
1929   NaN
1930   NaN
1931   NaN
1932   NaN
1959   NaN
1963   NaN
1964   NaN
1966   NaN
1968   NaN
1969   NaN
1970   NaN
1971   NaN
Name: stock_avg, Length: 665, dtype: float64

In [12]:
tweets_all = tweets_all[tweets_all['created_on'] >= '2014-01-01']

In [13]:
tweets_all[:10]

Unnamed: 0,id,created_at,text,neg,neu,pos,sentiment_polarity,compound,created_on,stock_avg,word_count,sent_count,unique_words,mentions,is_rt,tesla_mentioned
0,849636868052275200,2017-04-05 14:56:29,b'And so the robots spared humanity ... https:...,0.0,1.0,0.0,0.0,0.0,2017-04-05,299.54,9,1,9,0,False,False
1,848988730585096192,2017-04-03 20:01:01,"b""@ForIn2020 @waltmossberg @mims @defcon_5 Exa...",0.0,1.0,0.0,0.0,0.0,2017-04-03,291.79,29,2,26,4,False,True
2,848943072423497728,2017-04-03 16:59:35,"b'@waltmossberg @mims @defcon_5 Et tu, Walt?'",0.0,1.0,0.0,0.0,0.0,2017-04-03,291.79,11,1,10,3,False,False
3,848935705057280001,2017-04-03 16:30:19,b'Stormy weather in Shortville ...',0.0,1.0,0.0,0.0,0.0,2017-04-03,291.79,6,1,6,0,False,False
4,848416049573658624,2017-04-02 06:05:23,"b""@DaveLeeBBC @verge Coal is dying due to nat ...",0.0,1.0,0.0,0.0,0.0,2017-04-02,,18,2,16,2,False,False
5,848415731502923777,2017-04-02 06:04:07,"b""@Lexxxzis It's just a helicopter in helicopt...",0.0,1.0,0.0,0.0,0.0,2017-04-02,,11,1,10,1,False,False
6,848415356263702528,2017-04-02 06:02:38,"b""@verge It won't matter""",0.264,0.736,0.0,-0.264,-0.0191,2017-04-02,,7,1,6,1,False,False
7,848398971139629057,2017-04-02 04:57:31,b'@SuperCoolCube Pretty good',0.0,0.141,0.859,0.859,0.7269,2017-04-02,,6,1,5,1,False,False
8,848244577521647616,2017-04-01 18:44:01,"b""Why did we waste so much time developing sil...",0.237,0.726,0.036,-0.201,-0.7014,2017-04-01,,35,4,30,0,False,False
9,848243350993895424,2017-04-01 18:39:09,b'Technology breakthrough: turns out chemtrail...,0.0,1.0,0.0,0.0,0.0,2017-04-01,,19,1,19,0,False,False


In [14]:
tweets_all.to_csv('tesla.csv')