In [1]:
# import packages
import twint
import nest_asyncio
from modules import bittwint
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from tqdm.notebook import tqdm

# Patch asyncio to permit nested event loops. Presumably needed so the twint-based
# scraper can run inside the notebook's already-running event loop — TODO confirm.
nest_asyncio.apply()

In [None]:
# scraping function
# Calls the project helper bittwint.scrape_twitter and keeps whatever it returns in `df`.
# NOTE(review): the helper's signature is not visible from this notebook. The positional
# arguments are presumably the search term, a tweet limit and a date bound — confirm
# against the modules.bittwint source before changing them.
df = bittwint.scrape_twitter(
    "bitcoin", 
    100000, 
    "2021-01-14", 
    verified=True, 
    output="../data/raw/bitcoin_verified_tweets_5.csv"  # presumably the CSV the helper writes — verify
)

# Show the object returned by the helper.
display(df)

In [2]:
# Read the five raw scrape exports (files numbered 1-5) and stack them into one frame.
dfs = []

for i in tqdm(range(1, 6)):
    dfs.append(
        pd.read_csv(f"../data/raw/bitcoin_verified_tweets_{i}.csv", low_memory=False)
    )

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(dfs, ignore_index=True)

  0%|          | 0/5 [00:00<?, ?it/s]

In [3]:
# Keep a single row per tweet id, then drop the columns that are not used.
df = (
    df
    .drop_duplicates(subset=['id'])
    .drop(columns=[
        'date', 'time', 'timezone',
        'place', 'retweet', 'near', 'geo', 'source',
        'user_rt_id', 'user_rt', 'retweet_id', 'retweet_date',
        'translate', 'trans_src', 'trans_dest',
    ])
)

display(df.head())

Unnamed: 0,id,conversation_id,created_at,user_id,username,name,tweet,language,mentions,urls,...,replies_count,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to
0,1488663161209249800,1488663161209249800,2022-02-01 18:58:50 EST,1248996730940448774,ratemyskyperoom,Room Rater,Good bookshelf setup. Love the white over red....,en,"[{'screen_name': 'markyusko', 'name': 'mark w....",[],...,3,1,35,[],[],https://twitter.com/ratemyskyperoom/status/148...,,1,https://pbs.twimg.com/media/FKjLQV8WYAE0hht.jpg,[]
1,1488662322067357696,1488662322067357696,2022-02-01 18:55:30 EST,32318850,calvinayre,Calvin Ayre,This was always where it would land and BSV wa...,en,[],['https://coingeek.com/us-now-defines-bitcoin-...,...,2,5,25,[],[],https://twitter.com/CalvinAyre/status/14886623...,,0,,[]
2,1488661724706316291,1488661724706316291,2022-02-01 18:53:07 EST,14115083,jcastros,JCS 🇨🇴⚡,This is why I am a TCP/IP maxi. #Bitcoin http...,en,[],[],...,3,9,62,['bitcoin'],[],https://twitter.com/JCastroS/status/1488661724...,,1,https://pbs.twimg.com/media/FKjJn0HXsAcwGJn.png,[]
3,1488661503305822208,1488573276775333889,2022-02-01 18:52:15 EST,14110443,muneeb,muneeb.btc,@davidmarcus We have the same thesis 🤝 Buildi...,en,[],[],...,1,0,47,[],[],https://twitter.com/muneeb/status/148866150330...,,0,,"[{'screen_name': 'davidmarcus', 'name': 'David..."
4,1488661430886977536,1488661430886977536,2022-02-01 18:51:57 EST,993530753014054912,decryptmedia,Decrypt,Drew Brees to Get Paid in Bitcoin via NYDIG’s ...,en,[],['https://decrypt.co/91807/drew-brees-bitcoin-...,...,0,3,14,[],[],https://twitter.com/decryptmedia/status/148866...,,1,https://pbs.twimg.com/media/FKjJrpwXsAATX0P.jpg,[]


In [4]:
# Convert columns to explicit dtypes.
# NOTE(review): the raw timestamps carry a timezone abbreviation (e.g. "EST"), and the
# recorded output shows the parsed dtype as datetime64[ns, tzlocal()], i.e. tied to the
# local timezone of the machine that ran the cell. Confirm this is intended before the
# data is shared across machines.
df["created_at"] = pd.to_datetime(df["created_at"])

# Text columns use pandas' nullable 'string' dtype; 'video' becomes a boolean flag.
df = df.astype({
    'username': 'string',
    'name': 'string',
    'tweet': 'string',
    'language': 'string',
    'link': 'string',
    'quote_url': 'string',
    'video': 'bool',
    'thumbnail': 'string',
})

# DataFrame.info() prints its report itself and returns None, so it is called directly;
# wrapping it in display() rendered an extra "None" after the summary.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346456 entries, 0 to 349997
Data columns (total 21 columns):
 #   Column           Non-Null Count   Dtype                    
---  ------           --------------   -----                    
 0   id               346456 non-null  int64                    
 1   conversation_id  346456 non-null  int64                    
 2   created_at       346456 non-null  datetime64[ns, tzlocal()]
 3   user_id          346456 non-null  int64                    
 4   username         346455 non-null  string                   
 5   name             346456 non-null  string                   
 6   tweet            346456 non-null  string                   
 7   language         346456 non-null  string                   
 8   mentions         346456 non-null  object                   
 9   urls             346456 non-null  object                   
 10  photos           346456 non-null  object                   
 11  replies_count    346456 non-null  int64

None

In [5]:
# Keep only tweets created in 2020 or 2021.
# The timestamp is moved into a sorted index so the '2020':'2021' label slice
# (both end years included) can select the range, then restored as a regular column.
df = (
    df
    .set_index('created_at')
    .sort_index()
    .loc['2020':'2021']
    .reset_index()
)

display(df)

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,replies_count,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to
0,2020-01-01 00:08:28-05:00,1212239143687741440,1212239143687741440,20646945,dumbfoundead,dumbfoundead.eth,Bitcoin the worst decision i made this decade,en,[],[],...,9,15,409,[],[],https://twitter.com/dumbfoundead/status/121223...,,False,,[]
1,2020-01-01 00:54:35-05:00,1212250748815208448,1212250748815208448,636023721,neuroecology,Adam J Calhoun,If you ask someone what they'd do if they went...,en,[],[],...,6,0,4,[],[],https://twitter.com/neuroecology/status/121225...,,False,,[]
2,2020-01-01 02:00:25-05:00,1212267316789952512,1212267316789952512,631810714,cnbctv18news,CNBC-TV18,#bitcoin rally begun in 2013 and it reached a ...,en,[],['https://www.cnbctv18.com/market/currency/bit...,...,0,0,4,['bitcoin'],[],https://twitter.com/CNBCTV18News/status/121226...,,False,,[]
3,2020-01-01 02:18:48-05:00,1212271940502638593,1212271940502638593,14654494,excellion,Samson Mow,Wishing all #Bitcoin Ultra Enthusiasts a very ...,en,"[{'screen_name': 'blockstream', 'name': 'block...",[],...,10,17,158,"['bitcoin', 'liquidnetwork']",[],https://twitter.com/Excellion/status/121227194...,,True,https://pbs.twimg.com/media/ENLa5toU0AAvhC1.jpg,[]
4,2020-01-01 02:39:29-05:00,1212277146401402880,1212277146401402880,1066972567943053312,hindustantimes,Hindustan Times,"Breaking down Bitcoin’s 9,000,000% rise in las...",en,[],['http://www.hindustantimes.com/tech/breaking-...,...,0,0,0,[],[],https://twitter.com/HindustanTimes/status/1212...,,False,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312516,2021-12-31 23:20:12-05:00,1477132523826147330,1477131257301520385,151805556,adeldmeyer,Adel,@Bitcoin__art Fair enough 😅😅😅,en,[],[],...,0,0,1,[],[],https://twitter.com/AdeldMeyer/status/14771325...,,False,,"[{'screen_name': 'Bitcoin__art', 'name': 'bitc..."
312517,2021-12-31 23:28:45-05:00,1477134677148323841,1477134677148323841,1414971,jaygould,Jay Gould,Happy New Year #Bitcoin https://t.co/HdUQNDKUUA,en,[],[],...,2,1,12,['bitcoin'],[],https://twitter.com/jaygould/status/1477134677...,,True,https://pbs.twimg.com/media/FH_WKPYXsAQRYg_.jpg,[]
312518,2021-12-31 23:38:06-05:00,1477137028894670855,1477132718341337095,10446482,nvk,DETERMINISTIC OPTIMISM,"@rubiconcapital_ Buy bitcoin &amp; chill, defu...",en,[],[],...,0,2,8,[],[],https://twitter.com/nvk/status/147713702889467...,,False,,"[{'screen_name': 'rubiconcapital_', 'name': 'K..."
312519,2021-12-31 23:43:13-05:00,1477138318982725633,1477138318982725633,22594051,staronline,The Star,Bitcoin faces uncertain 2022 after record year...,en,[],['https://www.thestar.com.my/tech/tech-news/20...,...,0,4,7,[],[],https://twitter.com/staronline/status/14771383...,,False,,[]


In [None]:
# Persist the filtered frame as a pickle under the interim data directory.
df.to_pickle(path="../data/interim/tweets_verified_2020-2021.pkl")

In [6]:
# Round-trip check: load the exported pickle back and inspect its dtypes and rows.
df_test = pd.read_pickle("../data/interim/tweets_verified_2020-2021.pkl")

# A single display() call renders each argument in turn.
display(df_test.dtypes, df_test)

created_at         datetime64[ns, tzlocal()]
id                                     int64
conversation_id                        int64
user_id                                int64
username                              string
name                                  string
tweet                                 string
language                              string
mentions                              object
urls                                  object
photos                                object
replies_count                          int64
retweets_count                         int64
likes_count                            int64
hashtags                              object
cashtags                              object
link                                  string
quote_url                             string
video                                   bool
thumbnail                             string
reply_to                              object
dtype: object

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,replies_count,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to
0,2020-01-01 00:08:28-05:00,1212239143687741440,1212239143687741440,20646945,dumbfoundead,dumbfoundead.eth,Bitcoin the worst decision i made this decade,en,[],[],...,9,15,409,[],[],https://twitter.com/dumbfoundead/status/121223...,,False,,[]
1,2020-01-01 00:54:35-05:00,1212250748815208448,1212250748815208448,636023721,neuroecology,Adam J Calhoun,If you ask someone what they'd do if they went...,en,[],[],...,6,0,4,[],[],https://twitter.com/neuroecology/status/121225...,,False,,[]
2,2020-01-01 02:00:25-05:00,1212267316789952512,1212267316789952512,631810714,cnbctv18news,CNBC-TV18,#bitcoin rally begun in 2013 and it reached a ...,en,[],['https://www.cnbctv18.com/market/currency/bit...,...,0,0,4,['bitcoin'],[],https://twitter.com/CNBCTV18News/status/121226...,,False,,[]
3,2020-01-01 02:18:48-05:00,1212271940502638593,1212271940502638593,14654494,excellion,Samson Mow,Wishing all #Bitcoin Ultra Enthusiasts a very ...,en,"[{'screen_name': 'blockstream', 'name': 'block...",[],...,10,17,158,"['bitcoin', 'liquidnetwork']",[],https://twitter.com/Excellion/status/121227194...,,True,https://pbs.twimg.com/media/ENLa5toU0AAvhC1.jpg,[]
4,2020-01-01 02:39:29-05:00,1212277146401402880,1212277146401402880,1066972567943053312,hindustantimes,Hindustan Times,"Breaking down Bitcoin’s 9,000,000% rise in las...",en,[],['http://www.hindustantimes.com/tech/breaking-...,...,0,0,0,[],[],https://twitter.com/HindustanTimes/status/1212...,,False,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
312516,2021-12-31 23:20:12-05:00,1477132523826147330,1477131257301520385,151805556,adeldmeyer,Adel,@Bitcoin__art Fair enough 😅😅😅,en,[],[],...,0,0,1,[],[],https://twitter.com/AdeldMeyer/status/14771325...,,False,,"[{'screen_name': 'Bitcoin__art', 'name': 'bitc..."
312517,2021-12-31 23:28:45-05:00,1477134677148323841,1477134677148323841,1414971,jaygould,Jay Gould,Happy New Year #Bitcoin https://t.co/HdUQNDKUUA,en,[],[],...,2,1,12,['bitcoin'],[],https://twitter.com/jaygould/status/1477134677...,,True,https://pbs.twimg.com/media/FH_WKPYXsAQRYg_.jpg,[]
312518,2021-12-31 23:38:06-05:00,1477137028894670855,1477132718341337095,10446482,nvk,DETERMINISTIC OPTIMISM,"@rubiconcapital_ Buy bitcoin &amp; chill, defu...",en,[],[],...,0,2,8,[],[],https://twitter.com/nvk/status/147713702889467...,,False,,"[{'screen_name': 'rubiconcapital_', 'name': 'K..."
312519,2021-12-31 23:43:13-05:00,1477138318982725633,1477138318982725633,22594051,staronline,The Star,Bitcoin faces uncertain 2022 after record year...,en,[],['https://www.thestar.com.my/tech/tech-news/20...,...,0,4,7,[],[],https://twitter.com/staronline/status/14771383...,,False,,[]
