In [121]:
# Packages
import pandas as pd
import numpy as np
from collections import Counter
import nltk, spacy, re, json
from nltk.corpus import stopwords
from nltk.util import ngrams
nltk.download("punkt")
nltk.download('stopwords')

# Set up
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)

[nltk_data] Downloading package punkt to /jet/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /jet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
filename="../../data/twitter/elon_clean.csv"

In [21]:
elon = pd.read_csv(filename)

In [22]:
elon.head()

Unnamed: 0,username,date,retweets,favorites,text,mentions,hashtags,permalink
0,elonmusk,2019-04-02 20:38,993,9263,Dogecoin value may vary https://www. theonion.com/bitcoin-plunge -reveals-possible-vulnerabilities-in-craz-1821134169 …,,,https://twitter.com/elonmusk/status/1113178951403180032
1,elonmusk,2019-04-02 20:16,7238,55271,Dogecoin rulz pic.twitter.com/flWWUgAgLU,,,https://twitter.com/elonmusk/status/1113173498384441344
2,elonmusk,2019-04-02 19:40,368,9159,Uh oh,,,https://twitter.com/elonmusk/status/1113164389929160706
3,elonmusk,2019-04-02 09:24,1317,6176,Dogecoin might be my fav cryptocurrency. It’s pretty cool.,,,https://twitter.com/elonmusk/status/1113009339743100929
4,elonmusk,2019-04-02 09:21,1557,26925,Yup https:// twitter.com/nasa/status/11 12860196043452417 …,,,https://twitter.com/elonmusk/status/1113008497006804992


In [96]:
tweets=elon['text']

In [97]:
tweets

0       Dogecoin value may vary https://www. theonion.com/bitcoin-plunge -reveals-possible-vulnerabilities-in-craz-1821134169 …                             
1       Dogecoin rulz pic.twitter.com/flWWUgAgLU                                                                                                            
2       Uh oh                                                                                                                                               
3       Dogecoin might be my fav cryptocurrency. It’s pretty cool.                                                                                          
4       Yup https:// twitter.com/nasa/status/11 12860196043452417 …                                                                                         
5       Some challenges with ice formation in the cryogenic propellant prevalves. Hopefully overcome soon.                                                  
6       What could possibly go wrong?                     

In [98]:
nltk_twitter_tokens=[]
for tweet in tweets:
    nltk_twitter_tokens.append(nltk.casual_tokenize(tweet))

In [99]:
### Remember to follow the order here: match from first to last
regexes=(
    # Keep usernames together (any token starting with @, followed by A-Z, a-z, 0-9)
    r"(?:@[\w_]+)",
    # Keep hashtags together (any token starting with #, followed by A-Z, a-z, 0-9, _, or -)
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)",
    # Keep words with apostrophes, hyphens and underscores together
    r"(?:[a-z][a-z’'\-_]+[a-z])",
    # Keep all other sequences of A-Z, a-z, 0-9, _ together
    r"(?:[\w_]+)",
    # Everything else that's not whitespace
    r"(?:\S)")
big_regex="|".join(regexes)
extensible_tokenizer = re.compile(big_regex, re.VERBOSE | re.I | re.UNICODE)

def extensible_tokenize(text):
    return extensible_tokenizer.findall(text)

In [100]:
# Tokenize tweets data 
tweets_tokens=[]
for tweet in tweets:
    tokens=extensible_tokenize(tweet.lower())
    tweets_tokens.append(tokens)

In [101]:
tweets_tokens

[['dogecoin',
  'value',
  'may',
  'vary',
  'https',
  ':',
  '/',
  '/',
  'www',
  '.',
  'theonion',
  '.',
  'com',
  '/',
  'bitcoin-plunge',
  '-',
  'reveals-possible-vulnerabilities-in-craz',
  '-',
  '1821134169',
  '…'],
 ['dogecoin', 'rulz', 'pic', '.', 'twitter', '.', 'com', '/', 'flwwugaglu'],
 ['uh', 'oh'],
 ['dogecoin',
  'might',
  'be',
  'my',
  'fav',
  'cryptocurrency',
  '.',
  'it’s',
  'pretty',
  'cool',
  '.'],
 ['yup',
  'https',
  ':',
  '/',
  '/',
  'twitter',
  '.',
  'com',
  '/',
  'nasa',
  '/',
  'status',
  '/',
  '11',
  '12860196043452417',
  '…'],
 ['some',
  'challenges',
  'with',
  'ice',
  'formation',
  'in',
  'the',
  'cryogenic',
  'propellant',
  'prevalves',
  '.',
  'hopefully',
  'overcome',
  'soon',
  '.'],
 ['what', 'could', 'possibly', 'go', 'wrong', '?'],
 ['no'],
 ['you’re', 'so', 'right', '.', 'working', 'on', 'it', '!'],
 ['that',
  'car',
  'seemed',
  'so',
  'advanced',
  'when',
  'i',
  'watched',
  'that',
  'show',
  'a

In [102]:
# from nltk.stem import PorterStemmer
# ps = PorterStemmer()
# token=ps.stem(token)

## Extract Features

In [103]:
tweets_tokens_flat=[]
for tokens in tweets_tokens:
    for token in tokens:
        tweets_tokens_flat.append(token)

In [104]:
tweets_tokens_flat

['dogecoin',
 'value',
 'may',
 'vary',
 'https',
 ':',
 '/',
 '/',
 'www',
 '.',
 'theonion',
 '.',
 'com',
 '/',
 'bitcoin-plunge',
 '-',
 'reveals-possible-vulnerabilities-in-craz',
 '-',
 '1821134169',
 '…',
 'dogecoin',
 'rulz',
 'pic',
 '.',
 'twitter',
 '.',
 'com',
 '/',
 'flwwugaglu',
 'uh',
 'oh',
 'dogecoin',
 'might',
 'be',
 'my',
 'fav',
 'cryptocurrency',
 '.',
 'it’s',
 'pretty',
 'cool',
 '.',
 'yup',
 'https',
 ':',
 '/',
 '/',
 'twitter',
 '.',
 'com',
 '/',
 'nasa',
 '/',
 'status',
 '/',
 '11',
 '12860196043452417',
 '…',
 'some',
 'challenges',
 'with',
 'ice',
 'formation',
 'in',
 'the',
 'cryogenic',
 'propellant',
 'prevalves',
 '.',
 'hopefully',
 'overcome',
 'soon',
 '.',
 'what',
 'could',
 'possibly',
 'go',
 'wrong',
 '?',
 'no',
 'you’re',
 'so',
 'right',
 '.',
 'working',
 'on',
 'it',
 '!',
 'that',
 'car',
 'seemed',
 'so',
 'advanced',
 'when',
 'i',
 'watched',
 'that',
 'show',
 'as',
 'a',
 'kid',
 '!',
 'knight',
 'industries',
 'two',
 'thousa

In [105]:
# # find most frequent words
# Counter(tweets_tokens_flat).most_common(100)

Sentiment Analysis

In [106]:
tweets_tokens_flat

['dogecoin',
 'value',
 'may',
 'vary',
 'https',
 ':',
 '/',
 '/',
 'www',
 '.',
 'theonion',
 '.',
 'com',
 '/',
 'bitcoin-plunge',
 '-',
 'reveals-possible-vulnerabilities-in-craz',
 '-',
 '1821134169',
 '…',
 'dogecoin',
 'rulz',
 'pic',
 '.',
 'twitter',
 '.',
 'com',
 '/',
 'flwwugaglu',
 'uh',
 'oh',
 'dogecoin',
 'might',
 'be',
 'my',
 'fav',
 'cryptocurrency',
 '.',
 'it’s',
 'pretty',
 'cool',
 '.',
 'yup',
 'https',
 ':',
 '/',
 '/',
 'twitter',
 '.',
 'com',
 '/',
 'nasa',
 '/',
 'status',
 '/',
 '11',
 '12860196043452417',
 '…',
 'some',
 'challenges',
 'with',
 'ice',
 'formation',
 'in',
 'the',
 'cryogenic',
 'propellant',
 'prevalves',
 '.',
 'hopefully',
 'overcome',
 'soon',
 '.',
 'what',
 'could',
 'possibly',
 'go',
 'wrong',
 '?',
 'no',
 'you’re',
 'so',
 'right',
 '.',
 'working',
 'on',
 'it',
 '!',
 'that',
 'car',
 'seemed',
 'so',
 'advanced',
 'when',
 'i',
 'watched',
 'that',
 'show',
 'as',
 'a',
 'kid',
 '!',
 'knight',
 'industries',
 'two',
 'thousa

In [107]:
# remove punctuation
tweets_tokens_flat=[re.sub('[^a-zA-Z\-]' ,'',w) for w in tweets_tokens_flat] 
# remove non-word hyphen
tweets_tokens_flat=[re.sub('^\-' ,'',w) for w in tweets_tokens_flat] 
# remove empty tokens
tweets_tokens_flat=[w for w in tweets_tokens_flat if w]#

In [108]:
tweets_tokens_flat

['dogecoin',
 'value',
 'may',
 'vary',
 'https',
 'www',
 'theonion',
 'com',
 'bitcoin-plunge',
 'reveals-possible-vulnerabilities-in-craz',
 'dogecoin',
 'rulz',
 'pic',
 'twitter',
 'com',
 'flwwugaglu',
 'uh',
 'oh',
 'dogecoin',
 'might',
 'be',
 'my',
 'fav',
 'cryptocurrency',
 'its',
 'pretty',
 'cool',
 'yup',
 'https',
 'twitter',
 'com',
 'nasa',
 'status',
 'some',
 'challenges',
 'with',
 'ice',
 'formation',
 'in',
 'the',
 'cryogenic',
 'propellant',
 'prevalves',
 'hopefully',
 'overcome',
 'soon',
 'what',
 'could',
 'possibly',
 'go',
 'wrong',
 'no',
 'youre',
 'so',
 'right',
 'working',
 'on',
 'it',
 'that',
 'car',
 'seemed',
 'so',
 'advanced',
 'when',
 'i',
 'watched',
 'that',
 'show',
 'as',
 'a',
 'kid',
 'knight',
 'industries',
 'two',
 'thousand',
 'and',
 'even',
 'tinier',
 'ones',
 'for',
 'tardigrades',
 'yes',
 'going',
 'forward',
 'all',
 'tesla',
 'stores',
 'will',
 'be',
 'tiny',
 'have',
 'tiny',
 'cars',
 'meant',
 'to',
 'say',
 'young',
 '

In [109]:
# remove stopwords
tweets_tokens_flat = [w for w in tweets_tokens_flat if w not in stopwords.words("english")]

In [110]:
tweets_tokens_flat

['dogecoin',
 'value',
 'may',
 'vary',
 'https',
 'www',
 'theonion',
 'com',
 'bitcoin-plunge',
 'reveals-possible-vulnerabilities-in-craz',
 'dogecoin',
 'rulz',
 'pic',
 'twitter',
 'com',
 'flwwugaglu',
 'uh',
 'oh',
 'dogecoin',
 'might',
 'fav',
 'cryptocurrency',
 'pretty',
 'cool',
 'yup',
 'https',
 'twitter',
 'com',
 'nasa',
 'status',
 'challenges',
 'ice',
 'formation',
 'cryogenic',
 'propellant',
 'prevalves',
 'hopefully',
 'overcome',
 'soon',
 'could',
 'possibly',
 'go',
 'wrong',
 'youre',
 'right',
 'working',
 'car',
 'seemed',
 'advanced',
 'watched',
 'show',
 'kid',
 'knight',
 'industries',
 'two',
 'thousand',
 'even',
 'tinier',
 'ones',
 'tardigrades',
 'yes',
 'going',
 'forward',
 'tesla',
 'stores',
 'tiny',
 'tiny',
 'cars',
 'meant',
 'say',
 'young',
 'spelled',
 'wrong',
 'ironic',
 'stan',
 'cant',
 'solid',
 'work',
 'keen',
 'usual',
 'norway',
 'https',
 'twitter',
 'com',
 'elbilforeninge',
 'n',
 'status',
 'yes',
 'yes',
 'seems',
 'fair',
 '

In [111]:
# find most frequent words
Counter(tweets_tokens_flat).most_common(100)

[('com', 1210),
 ('tesla', 717),
 ('twitter', 643),
 ('https', 587),
 ('http', 430),
 ('pic', 376),
 ('www', 345),
 ('model', 333),
 ('yes', 290),
 ('good', 289),
 ('like', 250),
 ('spacex', 232),
 ('car', 229),
 ('v', 193),
 ('would', 192),
 ('rocket', 191),
 ('next', 181),
 ('great', 165),
 ('time', 162),
 ('people', 160),
 ('one', 159),
 ('cars', 156),
 ('falcon', 151),
 ('dont', 148),
 ('us', 144),
 ('launch', 144),
 ('first', 144),
 ('right', 135),
 ('status', 134),
 ('also', 134),
 ('thanks', 133),
 ('need', 130),
 ('new', 129),
 ('soon', 127),
 ('much', 127),
 ('make', 125),
 ('watch', 125),
 ('get', 123),
 ('year', 123),
 ('better', 122),
 ('x', 121),
 ('even', 120),
 ('high', 119),
 ('yeah', 119),
 ('youtube', 117),
 ('love', 117),
 ('really', 115),
 ('way', 114),
 ('thats', 113),
 ('true', 112),
 ('p', 112),
 ('production', 111),
 ('work', 107),
 ('maybe', 106),
 ('actually', 106),
 ('coming', 104),
 ('team', 101),
 ('exactly', 100),
 ('im', 97),
 ('instagram', 97),
 ('dragon

In [112]:
self_stopwords=['com','twitter','https','http','www','v','x','p','would']

In [114]:
tweets_tokens_flat=[w for w in tweets_tokens_flat if w not in self_stopwords]

In [115]:
Counter(tweets_tokens_flat).most_common(100)

[('tesla', 717),
 ('pic', 376),
 ('model', 333),
 ('yes', 290),
 ('good', 289),
 ('like', 250),
 ('spacex', 232),
 ('car', 229),
 ('rocket', 191),
 ('next', 181),
 ('great', 165),
 ('time', 162),
 ('people', 160),
 ('one', 159),
 ('cars', 156),
 ('falcon', 151),
 ('dont', 148),
 ('us', 144),
 ('launch', 144),
 ('first', 144),
 ('right', 135),
 ('status', 134),
 ('also', 134),
 ('thanks', 133),
 ('need', 130),
 ('new', 129),
 ('soon', 127),
 ('much', 127),
 ('make', 125),
 ('watch', 125),
 ('get', 123),
 ('year', 123),
 ('better', 122),
 ('even', 120),
 ('high', 119),
 ('yeah', 119),
 ('youtube', 117),
 ('love', 117),
 ('really', 115),
 ('way', 114),
 ('thats', 113),
 ('true', 112),
 ('production', 111),
 ('work', 107),
 ('maybe', 106),
 ('actually', 106),
 ('coming', 104),
 ('team', 101),
 ('exactly', 100),
 ('im', 97),
 ('instagram', 97),
 ('dragon', 97),
 ('think', 94),
 ('many', 91),
 ('week', 91),
 ('space', 91),
 ('going', 90),
 ('go', 89),
 ('test', 88),
 ('sure', 87),
 ('back', 

In [None]:
# sentiment analysis
# bi-gram
# dictionary based on eda

In [118]:
tweets_tokens_text=' '.join(tweets_tokens_flat)

In [123]:
tweets_tokens_text

