In [1]:
# imports

import pandas as pd
import numpy as np
import re
import nltk
from tqdm.notebook import tqdm
from IPython.core.interactiveshell import InteractiveShell
from nltk.corpus import stopwords

In [2]:
# notebook-wide configuration

# show the result of every top-level expression in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

# enable the pandas `progress_apply` methods used further down
tqdm.pandas()

# stop words: NLTK's English list extended with a few extra terms
nltk.download('stopwords')
add_stopwords = [
    'bitcoin', 'btc', 'crypto', 'cryptocurrency', 'cryptocurrencies',
    'ethereum', 'eth', 'price', 'prices', 'money', 'dollars',
]
english_stop_words = stopwords.words('english') + add_stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kevctae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# load the interim tweets pickle

# NOTE: keep=False discards every row whose `id` occurs more than once,
# i.e. no representative of a duplicated id is kept
df = (
    pd.read_pickle("../data/interim/tweets_verified_2020-2021.pkl")
    .drop_duplicates(subset='id', keep=False)
)

display(df.head())

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,replies_count,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to
0,2020-01-01 00:08:28-05:00,1212239143687741440,1212239143687741440,20646945,dumbfoundead,dumbfoundead.eth,Bitcoin the worst decision i made this decade,en,[],[],...,9,15,409,[],[],https://twitter.com/dumbfoundead/status/121223...,,False,,[]
1,2020-01-01 00:54:35-05:00,1212250748815208448,1212250748815208448,636023721,neuroecology,Adam J Calhoun,If you ask someone what they'd do if they went...,en,[],[],...,6,0,4,[],[],https://twitter.com/neuroecology/status/121225...,,False,,[]
2,2020-01-01 02:00:25-05:00,1212267316789952512,1212267316789952512,631810714,cnbctv18news,CNBC-TV18,#bitcoin rally begun in 2013 and it reached a ...,en,[],['https://www.cnbctv18.com/market/currency/bit...,...,0,0,4,['bitcoin'],[],https://twitter.com/CNBCTV18News/status/121226...,,False,,[]
3,2020-01-01 02:18:48-05:00,1212271940502638593,1212271940502638593,14654494,excellion,Samson Mow,Wishing all #Bitcoin Ultra Enthusiasts a very ...,en,"[{'screen_name': 'blockstream', 'name': 'block...",[],...,10,17,158,"['bitcoin', 'liquidnetwork']",[],https://twitter.com/Excellion/status/121227194...,,True,https://pbs.twimg.com/media/ENLa5toU0AAvhC1.jpg,[]
4,2020-01-01 02:39:29-05:00,1212277146401402880,1212277146401402880,1066972567943053312,hindustantimes,Hindustan Times,"Breaking down Bitcoin’s 9,000,000% rise in las...",en,[],['http://www.hindustantimes.com/tech/breaking-...,...,0,0,0,[],[],https://twitter.com/HindustanTimes/status/1212...,,False,,[]


In [4]:
# define cleaning functions

def repl(matchObj):
    """Replacement callback for ``re.sub``: return capture group 1 twice.

    Used with a pattern such as ``(\\w)\\1+`` this squeezes a run of one
    repeated character down to exactly two occurrences.
    """
    repeated = matchObj.group(1)
    return f"{repeated}{repeated}"

def remove_stop_words(corpus, stop_words=None):
    """Drop stop words from every text in ``corpus``.

    Each text is split on whitespace, tokens found in the stop-word
    collection are discarded, and the survivors are re-joined with single
    spaces. Matching is exact and case-sensitive.

    Parameters
    ----------
    corpus : iterable of str
        Texts to filter.
    stop_words : iterable of str, optional
        Words to remove. Defaults to the notebook-level
        ``english_stop_words`` list.

    Returns
    -------
    list of str
        One filtered text per input text, in the same order.
    """
    if stop_words is None:
        stop_words = english_stop_words
    # a set gives O(1) membership tests; the original scanned a list per token
    stop_set = frozenset(stop_words)
    return [
        ' '.join(word for word in review.split() if word not in stop_set)
        for review in corpus
    ]

def clean_tweet_1(tweet):
    """First cleaning pass: strip links, mentions, punctuation and digits.

    Parameters
    ----------
    tweet : str
        Tweet text.

    Returns
    -------
    str
        The text with t.co links and @mentions replaced by a space, every
        run of non-word characters (punctuation, ``#``, emoji, whitespace)
        collapsed to one space, and all ASCII digits removed. Leading and
        trailing spaces are NOT trimmed here.
    """
    # remove t.co short links; the dot is escaped so it only matches a
    # literal '.', and plain http links are covered as well
    without_url = re.sub(r"https?://t\.co/[A-Za-z0-9]+", " ", tweet)

    # remove @mentions
    without_mention = re.sub(r"@[A-Za-z0-9!#%&*;_\$\.]+", " ", without_url)

    # replace every run of non-word characters with one space; this also
    # drops the '#' so hashtags become ordinary words
    words_only = re.sub(r"\W+", " ", without_mention)

    # remove digits
    return re.sub(r"[0-9]", "", words_only)

def clean_tweet_2(tweet):
    """Second cleaning pass: squeeze repeats, drop tiny words, tidy spaces.

    Parameters
    ----------
    tweet : str
        Text to normalise.

    Returns
    -------
    str
        The text with every run of a repeated word character reduced to
        two occurrences ("sooooo" -> "soo", "good" stays "good"), words of
        one or two lowercase ASCII letters removed, whitespace runs
        collapsed to one space, and no leading or trailing space.
    """
    # squeeze runs of the same character down to exactly two
    squeezed = re.sub(r"(\w)\1+", r"\1\1", tweet)

    # remove words made of one or two lowercase letters
    without_short = re.sub(r"\b[a-z]{1,2}\b", " ", squeezed)

    # collapse every whitespace run into a single space
    single_spaced = re.sub(r"\s+", " ", without_short)

    # only plain spaces can remain at the ends after the collapse above
    return single_spaced.strip(" ")

def clean_tweet_money(tweet):
    """Extract the numbers mentioned in a tweet as digit strings.

    Thousands separators are removed ("9,000,000" -> "9000000") and a
    trailing "k" is expanded to three zeros ("20k" / "20 k" -> "20000")
    before the digit runs are collected.

    Parameters
    ----------
    tweet : str
        Tweet text; the "k" suffix is matched in lowercase only.

    Returns
    -------
    list of str
        Every run of ASCII digits, in order of appearance. All numbers are
        returned, not only amounts of money (years are included), and a
        decimal such as "76.4" yields two entries, "76" and "4".
    """
    # drop a comma only when it is a thousands separator, i.e. it sits
    # between a digit and a group of exactly three digits
    without_separators = re.sub(r"(?<=[0-9]),(?=[0-9]{3}(?![0-9]))", "", tweet)

    # expand "<n>k" to "<n>000"; \g<1> is required because r"\1000" is read
    # as the octal escape \100 ('@') followed by '0', and \b keeps the
    # pattern from matching the first letter of a following word
    expanded = re.sub(r"([0-9]+) ?k\b", r"\g<1>000", without_separators)

    return re.findall(r"[0-9]+", expanded)

In [5]:
# language EN only

# keep only the rows whose `language` column equals 'en'
df = df[df['language'] == 'en']
# start `tweet_clean` from the lower-cased tweet text
# NOTE(review): the pd.DataFrame(...) wrapper looks redundant, but in the saved
# dtypes output `tweet` is `string` while `tweet_clean` is `object` -- confirm
# whether the wrapper is what changes the dtype before removing it
df['tweet_clean'] = pd.DataFrame(df['tweet'].str.lower())

In [6]:
# extract the numbers mentioned in each tweet

# computed from the lower-cased text in `tweet_clean`
money_per_tweet = df['tweet_clean'].apply(clean_tweet_money)

# store the result on a copy so `df` itself does not get the extra column
df_money = df.copy()
df_money['money'] = money_per_tweet
display(df_money['money'].head())

0                        []
1                        []
2    [2013, 20000, 2017, 3]
3     [2020, 1, 2, 2, 8, 0]
4           [9000000, 9478]
Name: money, dtype: object

In [7]:
# save the frame that carries the `money` column as a pickle

money_pickle_path = "../data/interim/tweets_verified_money_2020-2021.pkl"
df_money.to_pickle(money_pickle_path)

In [8]:
# sanity check: reload the pickle just written and inspect it
df_test = pd.read_pickle("../data/interim/tweets_verified_money_2020-2021.pkl")

# column dtypes first, then the frame itself
display(df_test.dtypes, df_test)

created_at         datetime64[ns, tzlocal()]
id                                     int64
conversation_id                        int64
user_id                                int64
username                              string
name                                  string
tweet                                 string
language                              string
mentions                              object
urls                                  object
photos                                object
replies_count                          int64
retweets_count                         int64
likes_count                            int64
hashtags                              object
cashtags                              object
link                                  string
quote_url                             string
video                                   bool
thumbnail                             string
reply_to                              object
tweet_clean                           object
money     

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to,tweet_clean,money
0,2020-01-01 00:08:28-05:00,1212239143687741440,1212239143687741440,20646945,dumbfoundead,dumbfoundead.eth,Bitcoin the worst decision i made this decade,en,[],[],...,409,[],[],https://twitter.com/dumbfoundead/status/121223...,,False,,[],bitcoin the worst decision i made this decade,[]
1,2020-01-01 00:54:35-05:00,1212250748815208448,1212250748815208448,636023721,neuroecology,Adam J Calhoun,If you ask someone what they'd do if they went...,en,[],[],...,4,[],[],https://twitter.com/neuroecology/status/121225...,,False,,[],if you ask someone what they'd do if they went...,[]
2,2020-01-01 02:00:25-05:00,1212267316789952512,1212267316789952512,631810714,cnbctv18news,CNBC-TV18,#bitcoin rally begun in 2013 and it reached a ...,en,[],['https://www.cnbctv18.com/market/currency/bit...,...,4,['bitcoin'],[],https://twitter.com/CNBCTV18News/status/121226...,,False,,[],#bitcoin rally begun in 2013 and it reached a ...,"[2013, 20000, 2017, 3]"
3,2020-01-01 02:18:48-05:00,1212271940502638593,1212271940502638593,14654494,excellion,Samson Mow,Wishing all #Bitcoin Ultra Enthusiasts a very ...,en,"[{'screen_name': 'blockstream', 'name': 'block...",[],...,158,"['bitcoin', 'liquidnetwork']",[],https://twitter.com/Excellion/status/121227194...,,True,https://pbs.twimg.com/media/ENLa5toU0AAvhC1.jpg,[],wishing all #bitcoin ultra enthusiasts a very ...,"[2020, 1, 2, 2, 8, 0]"
4,2020-01-01 02:39:29-05:00,1212277146401402880,1212277146401402880,1066972567943053312,hindustantimes,Hindustan Times,"Breaking down Bitcoin’s 9,000,000% rise in las...",en,[],['http://www.hindustantimes.com/tech/breaking-...,...,0,[],[],https://twitter.com/HindustanTimes/status/1212...,,False,,[],"breaking down bitcoin’s 9,000,000% rise in las...","[9000000, 9478]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312516,2021-12-31 23:20:12-05:00,1477132523826147330,1477131257301520385,151805556,adeldmeyer,Adel,@Bitcoin__art Fair enough 😅😅😅,en,[],[],...,1,[],[],https://twitter.com/AdeldMeyer/status/14771325...,,False,,"[{'screen_name': 'Bitcoin__art', 'name': 'bitc...",@bitcoin__art fair enough 😅😅😅,[]
312517,2021-12-31 23:28:45-05:00,1477134677148323841,1477134677148323841,1414971,jaygould,Jay Gould,Happy New Year #Bitcoin https://t.co/HdUQNDKUUA,en,[],[],...,12,['bitcoin'],[],https://twitter.com/jaygould/status/1477134677...,,True,https://pbs.twimg.com/media/FH_WKPYXsAQRYg_.jpg,[],happy new year #bitcoin https://t.co/hduqndkuua,[]
312518,2021-12-31 23:38:06-05:00,1477137028894670855,1477132718341337095,10446482,nvk,DETERMINISTIC OPTIMISM,"@rubiconcapital_ Buy bitcoin &amp; chill, defu...",en,[],[],...,8,[],[],https://twitter.com/nvk/status/147713702889467...,,False,,"[{'screen_name': 'rubiconcapital_', 'name': 'K...","@rubiconcapital_ buy bitcoin &amp; chill, defu...",[]
312519,2021-12-31 23:43:13-05:00,1477138318982725633,1477138318982725633,22594051,staronline,The Star,Bitcoin faces uncertain 2022 after record year...,en,[],['https://www.thestar.com.my/tech/tech-news/20...,...,7,[],[],https://twitter.com/staronline/status/14771383...,,False,,[],bitcoin faces uncertain 2022 after record year...,"[2022, 76, 4, 1, 76, 4, 1]"


In [9]:
# apply further cleaning

# 1) strip t.co links, @mentions, punctuation and digits
stage_one = df['tweet_clean'].progress_apply(clean_tweet_1)
# 2) drop stop words (returns a plain list, one string per row)
df['tweet_clean'] = remove_stop_words(stage_one)
# 3) squeeze repeated characters, drop 1-2 letter words, tidy whitespace
df['tweet_clean'] = df['tweet_clean'].progress_apply(clean_tweet_2)
display(df[['tweet', 'tweet_clean']].head())

  0%|          | 0/312204 [00:00<?, ?it/s]

  0%|          | 0/312204 [00:00<?, ?it/s]

Unnamed: 0,tweet,tweet_clean
0,Bitcoin the worst decision i made this decade,worst decision made decade
1,If you ask someone what they'd do if they went...,ask someone went back ten years use informatio...
2,#bitcoin rally begun in 2013 and it reached a ...,rally begun reached peak
3,Wishing all #Bitcoin Ultra Enthusiasts a very ...,wishing ultra enthusiasts happy new year much ...
4,"Breaking down Bitcoin’s 9,000,000% rise in las...",breaking rise last decade left skeptics aghast


In [10]:
# export the cleaned frame as a pickle

# store `tweet` with the pandas string dtype and renumber the rows from 0
df = df.astype({'tweet': 'string'})
df = df.reset_index(drop=True)
df.to_pickle("../data/cleaned/tweets_verified_2020-2021_cleaned.pkl")

In [11]:
# sanity check: reload the cleaned pickle and inspect it
df_test = pd.read_pickle("../data/cleaned/tweets_verified_2020-2021_cleaned.pkl")

# column dtypes first, then the frame itself
display(df_test.dtypes, df_test)

created_at         datetime64[ns, tzlocal()]
id                                     int64
conversation_id                        int64
user_id                                int64
username                              string
name                                  string
tweet                                 string
language                              string
mentions                              object
urls                                  object
photos                                object
replies_count                          int64
retweets_count                         int64
likes_count                            int64
hashtags                              object
cashtags                              object
link                                  string
quote_url                             string
video                                   bool
thumbnail                             string
reply_to                              object
tweet_clean                           object
dtype: obj

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to,tweet_clean
0,2020-01-01 00:08:28-05:00,1212239143687741440,1212239143687741440,20646945,dumbfoundead,dumbfoundead.eth,Bitcoin the worst decision i made this decade,en,[],[],...,15,409,[],[],https://twitter.com/dumbfoundead/status/121223...,,False,,[],worst decision made decade
1,2020-01-01 00:54:35-05:00,1212250748815208448,1212250748815208448,636023721,neuroecology,Adam J Calhoun,If you ask someone what they'd do if they went...,en,[],[],...,0,4,[],[],https://twitter.com/neuroecology/status/121225...,,False,,[],ask someone went back ten years use informatio...
2,2020-01-01 02:00:25-05:00,1212267316789952512,1212267316789952512,631810714,cnbctv18news,CNBC-TV18,#bitcoin rally begun in 2013 and it reached a ...,en,[],['https://www.cnbctv18.com/market/currency/bit...,...,0,4,['bitcoin'],[],https://twitter.com/CNBCTV18News/status/121226...,,False,,[],rally begun reached peak
3,2020-01-01 02:18:48-05:00,1212271940502638593,1212271940502638593,14654494,excellion,Samson Mow,Wishing all #Bitcoin Ultra Enthusiasts a very ...,en,"[{'screen_name': 'blockstream', 'name': 'block...",[],...,17,158,"['bitcoin', 'liquidnetwork']",[],https://twitter.com/Excellion/status/121227194...,,True,https://pbs.twimg.com/media/ENLa5toU0AAvhC1.jpg,[],wishing ultra enthusiasts happy new year much ...
4,2020-01-01 02:39:29-05:00,1212277146401402880,1212277146401402880,1066972567943053312,hindustantimes,Hindustan Times,"Breaking down Bitcoin’s 9,000,000% rise in las...",en,[],['http://www.hindustantimes.com/tech/breaking-...,...,0,0,[],[],https://twitter.com/HindustanTimes/status/1212...,,False,,[],breaking rise last decade left skeptics aghast
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312199,2021-12-31 23:20:12-05:00,1477132523826147330,1477131257301520385,151805556,adeldmeyer,Adel,@Bitcoin__art Fair enough 😅😅😅,en,[],[],...,0,1,[],[],https://twitter.com/AdeldMeyer/status/14771325...,,False,,"[{'screen_name': 'Bitcoin__art', 'name': 'bitc...",fair enough
312200,2021-12-31 23:28:45-05:00,1477134677148323841,1477134677148323841,1414971,jaygould,Jay Gould,Happy New Year #Bitcoin https://t.co/HdUQNDKUUA,en,[],[],...,1,12,['bitcoin'],[],https://twitter.com/jaygould/status/1477134677...,,True,https://pbs.twimg.com/media/FH_WKPYXsAQRYg_.jpg,[],happy new year
312201,2021-12-31 23:38:06-05:00,1477137028894670855,1477132718341337095,10446482,nvk,DETERMINISTIC OPTIMISM,"@rubiconcapital_ Buy bitcoin &amp; chill, defu...",en,[],[],...,2,8,[],[],https://twitter.com/nvk/status/147713702889467...,,False,,"[{'screen_name': 'rubiconcapital_', 'name': 'K...",buy amp chill defund political parasites
312202,2021-12-31 23:43:13-05:00,1477138318982725633,1477138318982725633,22594051,staronline,The Star,Bitcoin faces uncertain 2022 after record year...,en,[],['https://www.thestar.com.my/tech/tech-news/20...,...,4,7,[],[],https://twitter.com/staronline/status/14771383...,,False,,[],faces uncertain record year
