In [9]:
# import packages
import twint
import nest_asyncio
from modules import bittwint
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# configure packages
# nest_asyncio.apply() patches asyncio so an event loop can be re-entered;
# presumably required because the scraper starts its own loop inside the
# notebook's already-running one — TODO confirm
nest_asyncio.apply()

In [None]:
# scrape one batch of tweets through the project helper and keep the result
# in `df`; the keyword options are collected first so the call stays short
scrape_options = dict(
    until="2020-3-01",
    verified=False,
    output="../data/raw/bitcoin_unverified_tweets_22.csv",
)
df = bittwint.scrape_twitter("bitcoin", 10000, "2020-01-01", **scrape_options)

# preview the first rows of the batch
df.head()

In [10]:
# read every raw batch file (numbered 1 through 22) and stack them into a
# single frame with a fresh 0..n-1 index
dfs = [
    pd.read_csv(f"../data/raw/bitcoin_unverified_tweets_{batch_no}.csv", low_memory=False)
    for batch_no in tqdm(range(1, 23))
]

df = pd.concat(dfs, ignore_index=True)

  0%|          | 0/22 [00:00<?, ?it/s]

In [11]:
# keep one row per tweet id, then discard the columns that are not used
unused_columns = [
    'date', 'time', 'timezone',
    'place', 'retweet', 'near', 'geo', 'source',
    'user_rt_id', 'user_rt', 'retweet_id', 'retweet_date',
    'translate', 'trans_src', 'trans_dest',
]
df = df.drop_duplicates(subset=['id']).drop(columns=unused_columns)

# preview the trimmed frame
display(df.head())

Unnamed: 0,id,conversation_id,created_at,user_id,username,name,tweet,language,mentions,urls,...,replies_count,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to
0,1477429414568902662,1477429414568902662,2022-01-01 18:59:56 EST,184514961,priconoticias,Daniel A Cohene,0.4MOT TOKENS IN #LATOKEN airdrop and maybe m...,en,[],[],...,0,0,0,"['latoken', 'bitcoin', 'cryptocurrency']",[],https://twitter.com/priconoticias/status/14774...,,0,,[]
1,1477429403042795522,1477429403042795522,2022-01-01 18:59:53 EST,835951609108643841,mmonlineonline,online forex trading,MARA for Bitcoin Exposure: Top Trade Q1 2022 ...,en,[],['http://dlvr.it/SGNDkR'],...,0,0,0,[],[],https://twitter.com/mmonlineonline/status/1477...,,1,https://pbs.twimg.com/media/FIDiNoPVkAAJwhE.jpg,[]
2,1477429359044612099,1477429359044612099,2022-01-01 18:59:43 EST,755434052,coachkevinrose,Kevin Rose,Every Time the Bell Rings The Titano makes MON...,en,[],[],...,0,0,0,['bitcoin'],[],https://twitter.com/CoachKevinRose/status/1477...,,0,,[]
3,1477429325993582597,1477429325993582597,2022-01-01 18:59:35 EST,1286761622967656448,robert54177167,fiatminer4🌽,If your interested in losing money in 2022 kee...,en,"[{'screen_name': 'peterschiff', 'name': 'peter...",[],...,0,2,7,['bitcoin'],[],https://twitter.com/robert54177167/status/1477...,,0,,[]
4,1477429324622016514,1477429324622016514,2022-01-01 18:59:34 EST,2395138046,worldcoinindex,WorldCoinIndex,Bitcoin price index https://t.co/o7UcHJUhC6 #...,en,[],['https://www.worldcoinindex.com/coin/bitcoin'],...,0,1,1,"['usd', 'eur', 'cny', 'gbp', 'rub']",[],https://twitter.com/WorldCoinIndex/status/1477...,,1,https://pbs.twimg.com/media/FIDiI7wXEAce_ow.png,[]


In [12]:
# change data types
# NOTE(review): the raw timestamps carry a timezone abbreviation (e.g. "EST")
# and the resulting dtype is reported as datetime64[ns, tzlocal()], so the
# parsed zone appears to depend on the machine running this cell — confirm
# before sharing the exported data across machines.
df["created_at"] = pd.to_datetime(df["created_at"])

# text columns use pandas' dedicated string dtype; `video` becomes a boolean flag
df = df.astype({
    'username': 'string',
    'name': 'string',
    'tweet': 'string',
    'language': 'string',
    'link': 'string',
    'quote_url': 'string',
    'video': 'bool',
    'thumbnail': 'string',
})

# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in display() — doing so only appends a stray "None" to the output
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7144428 entries, 0 to 7324793
Data columns (total 21 columns):
 #   Column           Dtype                    
---  ------           -----                    
 0   id               int64                    
 1   conversation_id  int64                    
 2   created_at       datetime64[ns, tzlocal()]
 3   user_id          int64                    
 4   username         string                   
 5   name             string                   
 6   tweet            string                   
 7   language         string                   
 8   mentions         object                   
 9   urls             object                   
 10  photos           object                   
 11  replies_count    int64                    
 12  retweets_count   int64                    
 13  likes_count      int64                    
 14  hashtags         object                   
 15  cashtags         object                   
 16  link             s

None

In [13]:
# keep only tweets created in 2020 or 2021: index by timestamp, sort, slice by
# year labels, then turn the timestamp back into a regular column
df = (
    df.set_index('created_at')
    .sort_index()
    .loc['2020':'2021']
    .reset_index()
)

display(df)

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,replies_count,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to
0,2020-01-01 00:00:00-05:00,1212237011928702977,1212237011928702977,872234445499244552,cryptohourly,crypto Hourly,"@ #1, Bitcoin with unit price of $7,212.27, ma...",en,[],[],...,0,0,0,[],[],https://twitter.com/CryptoHourly/status/121223...,,False,,[]
1,2020-01-01 00:00:00-05:00,1212237012985663488,1212237012985663488,1070351212044783616,glassnodealerts,glassnode alerts,📊 Daily On-Chain Exchange Flow #Bitcoin $BTC ...,en,[],['https://glassno.de/netflow'],...,0,0,0,"['bitcoin', 'ethereum', 'tether']","['btc', 'eth', 'usdt']",https://twitter.com/glassnodealerts/status/121...,,False,,[]
2,2020-01-01 00:00:00-05:00,1212237010494271488,1212237010494271488,265207297,agcalculator,SilverCalculatorApp,#USD #bitcoin Index: 13919 satoshi’s = $1 ...,en,[],[],...,0,0,0,"['usd', 'bitcoin', 'pizza']",[],https://twitter.com/agCalculator/status/121223...,,False,,[]
3,2020-01-01 00:00:00-05:00,1212237009776857091,1212237009776857091,24411256,cryptopressnews,Crypto Press,$BTC | #BTC - bitcoin's Current Price: ▼ $7215...,en,[],['https://crypto.press/coins/BTC-bitcoin'],...,0,0,1,['btc'],['btc'],https://twitter.com/CryptoPressNews/status/121...,,False,,[]
4,2020-01-01 00:00:01-05:00,1212237014730465280,1212237014730465280,1147946864081350658,hourlybtcupdate,Hourly BTC Updates,Bitcoin: $7215.9 💔 -8.65 last 1 Hour (-0.12%) ...,en,[],[],...,0,0,0,"['bitcoinpriceupdates', 'bitcoin', 'hourlycryp...",[],https://twitter.com/HourlyBTCUpdate/status/121...,,False,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6904826,2021-12-31 18:59:49-05:00,1477066999486242817,1477066999486242817,15865538,chuckt406,Charlie Taylor,SilasBender13 found #bitcoin in a User vault a...,en,[],['https://coinhunt.gsc.im/D3mmpLgMtl'],...,0,0,0,"['bitcoin', 'coinhuntworld', 'cryptocurrency']",[],https://twitter.com/chuckt406/status/147706699...,,True,https://pbs.twimg.com/media/FH-Ym99VgAEaiVT.jpg,[]
6904827,2021-12-31 18:59:53-05:00,1477067013310885888,1477067013310885888,303332491,rot13maxi,Rijndael,Friendly reminder that there’s no wash sale ru...,en,[],[],...,2,0,7,"['bitcoin', 'notanaccountant']",[],https://twitter.com/rot13maxi/status/147706701...,,False,,[]
6904828,2021-12-31 18:59:54-05:00,1477067019690381320,1477067019690381320,1321968381189410817,pascalabams,Pascal Abams⭕,On a quest to Tweet buy #Bitcoin every day for...,en,[],[],...,2,0,2,"['bitcoin', 'bitcoin', 'crypto']",[],https://twitter.com/PascalAbams/status/1477067...,,False,,[]
6904829,2021-12-31 18:59:55-05:00,1477067022949236737,1477022595593682949,1258757505229832192,domainsflx,Beisbol/,@PeterSchiff There was a time to buy gold and ...,en,[],[],...,0,0,0,[],[],https://twitter.com/DomainsFlx/status/14770670...,,False,,"[{'screen_name': 'PeterSchiff', 'name': 'Peter..."


In [14]:
# persist the filtered frame as a pickle in the interim data folder
df.to_pickle(path="../data/interim/tweets_unverified_2020-2021.pkl")

In [15]:
# sanity check: load the pickle back and show its column dtypes and contents
df_test = pd.read_pickle("../data/interim/tweets_unverified_2020-2021.pkl")

display(df_test.dtypes, df_test)

created_at         datetime64[ns, tzlocal()]
id                                     int64
conversation_id                        int64
user_id                                int64
username                              string
name                                  string
tweet                                 string
language                              string
mentions                              object
urls                                  object
photos                                object
replies_count                          int64
retweets_count                         int64
likes_count                            int64
hashtags                              object
cashtags                              object
link                                  string
quote_url                             string
video                                   bool
thumbnail                             string
reply_to                              object
dtype: object

Unnamed: 0,created_at,id,conversation_id,user_id,username,name,tweet,language,mentions,urls,...,replies_count,retweets_count,likes_count,hashtags,cashtags,link,quote_url,video,thumbnail,reply_to
0,2020-01-01 00:00:00-05:00,1212237011928702977,1212237011928702977,872234445499244552,cryptohourly,crypto Hourly,"@ #1, Bitcoin with unit price of $7,212.27, ma...",en,[],[],...,0,0,0,[],[],https://twitter.com/CryptoHourly/status/121223...,,False,,[]
1,2020-01-01 00:00:00-05:00,1212237012985663488,1212237012985663488,1070351212044783616,glassnodealerts,glassnode alerts,📊 Daily On-Chain Exchange Flow #Bitcoin $BTC ...,en,[],['https://glassno.de/netflow'],...,0,0,0,"['bitcoin', 'ethereum', 'tether']","['btc', 'eth', 'usdt']",https://twitter.com/glassnodealerts/status/121...,,False,,[]
2,2020-01-01 00:00:00-05:00,1212237010494271488,1212237010494271488,265207297,agcalculator,SilverCalculatorApp,#USD #bitcoin Index: 13919 satoshi’s = $1 ...,en,[],[],...,0,0,0,"['usd', 'bitcoin', 'pizza']",[],https://twitter.com/agCalculator/status/121223...,,False,,[]
3,2020-01-01 00:00:00-05:00,1212237009776857091,1212237009776857091,24411256,cryptopressnews,Crypto Press,$BTC | #BTC - bitcoin's Current Price: ▼ $7215...,en,[],['https://crypto.press/coins/BTC-bitcoin'],...,0,0,1,['btc'],['btc'],https://twitter.com/CryptoPressNews/status/121...,,False,,[]
4,2020-01-01 00:00:01-05:00,1212237014730465280,1212237014730465280,1147946864081350658,hourlybtcupdate,Hourly BTC Updates,Bitcoin: $7215.9 💔 -8.65 last 1 Hour (-0.12%) ...,en,[],[],...,0,0,0,"['bitcoinpriceupdates', 'bitcoin', 'hourlycryp...",[],https://twitter.com/HourlyBTCUpdate/status/121...,,False,,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6904826,2021-12-31 18:59:49-05:00,1477066999486242817,1477066999486242817,15865538,chuckt406,Charlie Taylor,SilasBender13 found #bitcoin in a User vault a...,en,[],['https://coinhunt.gsc.im/D3mmpLgMtl'],...,0,0,0,"['bitcoin', 'coinhuntworld', 'cryptocurrency']",[],https://twitter.com/chuckt406/status/147706699...,,True,https://pbs.twimg.com/media/FH-Ym99VgAEaiVT.jpg,[]
6904827,2021-12-31 18:59:53-05:00,1477067013310885888,1477067013310885888,303332491,rot13maxi,Rijndael,Friendly reminder that there’s no wash sale ru...,en,[],[],...,2,0,7,"['bitcoin', 'notanaccountant']",[],https://twitter.com/rot13maxi/status/147706701...,,False,,[]
6904828,2021-12-31 18:59:54-05:00,1477067019690381320,1477067019690381320,1321968381189410817,pascalabams,Pascal Abams⭕,On a quest to Tweet buy #Bitcoin every day for...,en,[],[],...,2,0,2,"['bitcoin', 'bitcoin', 'crypto']",[],https://twitter.com/PascalAbams/status/1477067...,,False,,[]
6904829,2021-12-31 18:59:55-05:00,1477067022949236737,1477022595593682949,1258757505229832192,domainsflx,Beisbol/,@PeterSchiff There was a time to buy gold and ...,en,[],[],...,0,0,0,[],[],https://twitter.com/DomainsFlx/status/14770670...,,False,,"[{'screen_name': 'PeterSchiff', 'name': 'Peter..."
