In [1]:
import os
import time

import pandas as pd
import numpy as np
import datetime as dt

from tqdm.notebook import tqdm

### Daily Prices of Cryptocurrencies, Stocks, Commodities, and World Indices
https://github.com/ranaroussi/yfinance <br/>
https://github.com/JECSand/yahoofinancials <br/>

_daily_ values of **Top-100 Coins (as of Nov 1st), Stocks, Commodities (e.g., gold, silver, and oil), and World Indices for OHLCV prices** <br/>
_extracted_ changes and returns

In [2]:
import yfinance as yf
import requests as requests
import bs4 as bs

from yahoofinancials import YahooFinancials

In [3]:
# interval: 1m,5m,15m,30m,60m,1h,1d,1wk,1mo,
# *Close price adjusted for splits. **Adjusted close price adjusted for splits and dividend and/or capital gain distributions.

In [4]:
# Symbols of the top coins, stored as a NumPy array in top_symbols.npy.
with open("top_symbols.npy", "rb") as symbols_file:
    top_coins = np.load(symbols_file)

# Commodity futures: crude oil, gold, silver.
commodities = [
    'CL=F',
    'GC=F',
    'SI=F',
]

# Technology stocks (Microsoft, Apple, NVIDIA, etc.); set() removes accidental duplicates.
tech_stocks = list(set([
    'MSFT', 'AAPL', 'NVDA', 'GOOGL', 'FB', 'NFLX', 'AMZN', 'TSLA',
    'BABA', 'TSM', 'ASML', 'ADBE', 'CRM', 'ORCL', 'CSCO', 'ACN',
    'AVGO', 'SHOP', 'INTC', 'QCOM', 'AMD', 'INTU', 'TXN', 'SAP',
    'SONY', 'AMAT', 'NOW', 'SNOW', 'TEAM', 'IBM', 'SQ',
]))

# World indices: S&P 500, Dow 30, Nasdaq, NYSE Composite (DJ), etc.
world_indices = [
    '^GSPC', '^DJI', '^IXIC', '^NYA', '^XAX', '^BUK100P', '^RUT',
    '^VIX', '^FTSE', '^GDAXI', '^FCHI', '^STOXX50E', '^N100', '^BFX',
    'IMOEX.ME', '^N225', '^HSI', '000001.SS', '399001.SZ', '^STI', '^AXJO',
    '^AORD', '^BSESN', '^JKSE', '^KLSE', '^NZ50', '^KS11', '^TWII',
    '^GSPTSE', '^BVSP', '^MXX', '^IPSA', '^MERV', '^TA125.TA', '^JN0U.JO',
]

# Individual S&P 500 constituents, scraped from the Wikipedia constituents table.
resp = requests.get('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies', timeout=30)
resp.raise_for_status()  # stop here on an HTTP error instead of parsing an error page
soup = bs.BeautifulSoup(resp.text, 'lxml')
table = soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    # Without this guard the loop below fails with an opaque AttributeError on None.
    raise RuntimeError("S&P 500 table ('wikitable sortable') not found; the page layout may have changed")

sp500 = []
for row in table.find_all('tr')[1:]:  # [1:] skips the header row
    cells = row.find_all('td')
    if not cells:
        # Rows made only of <th> cells carry no ticker.
        continue
    ticker = cells[0].text
    # Tickers are stored with '-' in place of '.' (e.g. 'BRK.B' becomes 'BRK-B').
    sp500.append(ticker.strip().replace('.', '-'))

In [5]:
top_coins

array(['BTC-USD', 'ETH-USD', 'BNB-USD', 'USDT-USD', 'ADA-USD', 'SOL1-USD',
       'XRP-USD', 'DOT1-USD', 'HEX-USD', 'SHIB-USD', 'DOGE-USD',
       'USDC-USD', 'LUNA1-USD', 'UNI3-USD', 'AVAX-USD', 'LINK-USD',
       'LTC-USD', 'MATIC-USD', 'ALGO-USD', 'BCH-USD', 'XLM-USD',
       'VET-USD', 'AXS-USD', 'ATOM1-USD', 'ICP1-USD', 'FIL-USD',
       'TRX-USD', 'FTT1-USD', 'THETA-USD', 'ETC-USD', 'FTM-USD',
       'DAI1-USD', 'CRO-USD', 'HBAR-USD', 'EGLD-USD', 'XTZ-USD',
       'MANA-USD', 'XMR-USD', 'GRT2-USD', 'EOS-USD', 'CAKE-USD',
       'FLOW1-USD', 'AAVE-USD', 'RUNE-USD', 'MIOTA-USD', 'KSM-USD',
       'MKR-USD', 'QNT-USD', 'ONE2-USD', 'NEO-USD', 'BSV-USD', 'SAND-USD',
       'HNT1-USD', 'WAVES-USD', 'CHZ-USD', 'BTT1-USD', 'AR-USD',
       'STX1-USD', 'AMP1-USD', 'ENJ-USD', 'COMP-USD', 'HOT1-USD',
       'CELO-USD', 'ZEC-USD', 'DASH-USD', 'OMG-USD', 'TFUEL-USD',
       'XEM-USD', 'CRV-USD', 'LRC-USD', 'QTUM-USD', 'SUSHI-USD',
       'ICX-USD', 'DCR-USD', 'BAT-USD', 'CTC1-USD', 'SNX-USD',

In [5]:
def save_csv(loc, symbols, yf_data, interval):
    """Write one CSV file per symbol to ``datasets/{interval}/{loc}/{symbol}.csv``.

    Parameters
    ----------
    loc : str
        Sub-directory name below ``datasets/{interval}/``.
    symbols : iterable of str
        Keys looked up in ``yf_data``; each one is also used as the file name.
    yf_data : mapping-like
        Indexed as ``yf_data[symbol]``; every value must provide ``to_csv``.
    interval : str
        Directory name directly below ``datasets/``.

    The target directory is created when it does not exist yet, so the
    function also works on a fresh checkout.
    """
    out_dir = f'datasets/{interval}/{loc}'
    os.makedirs(out_dir, exist_ok=True)
    for symbol in symbols:
        yf_data[symbol].to_csv(f'{out_dir}/{symbol}.csv')

In [None]:
# Daily bars covering 2018-01-01 through 2021-10-31 (end_date is the following day).
start_date, end_date = '2018-01-01', '2021-11-01' # UTC datetime

# Keyword arguments shared by every daily download below.
daily_kwargs = dict(start=start_date, end=end_date, group_by='ticker', actions=True)

yf_coins = yf.download(top_coins.tolist(), **daily_kwargs)
yf_tech = yf.download(tech_stocks, **daily_kwargs)
yf_commo = yf.download(commodities, **daily_kwargs)
yf_indices = yf.download(world_indices, **daily_kwargs)
yf_sp500 = yf.download(sp500, **daily_kwargs)

[*********************100%***********************]  100 of 100 completed
[*********************100%***********************]  31 of 31 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  35 of 35 completed
[*********************100%***********************]  505 of 505 completed


In [7]:
# Monthly bars up to 2021-10-31.
# NOTE(review): the window starts at 2017-12-01, one month before the 2018-01-01 start of the
# daily data - presumably so January 2018 has a preceding month to diff against; confirm.
start_date, end_date = '2017-12-01', '2021-11-01' # UTC datetime

# Keyword arguments shared by every monthly download below.
monthly_kwargs = dict(start=start_date, end=end_date, group_by='ticker', actions=True, interval='1mo')

yfm_coins = yf.download(top_coins.tolist(), **monthly_kwargs)
yfm_tech = yf.download(tech_stocks, **monthly_kwargs)
yfm_commo = yf.download(commodities, **monthly_kwargs)
yfm_indices = yf.download(world_indices, **monthly_kwargs)
yfm_sp500 = yf.download(sp500, **monthly_kwargs)

[*********************100%***********************]  100 of 100 completed
[*********************100%***********************]  31 of 31 completed
[*********************100%***********************]  3 of 3 completed
[*********************100%***********************]  35 of 35 completed

1 Failed download:
- ^IPSA: No data found for this date range, symbol may be delisted
[*********************100%***********************]  505 of 505 completed


In [8]:
# coin_tickers = yf.Tickers(top_coins.tolist())
# tech_tickers = yf.Tickers(tech_stocks)
# commo_tickers = yf.Tickers(commodities)
# indices_tickers = yf.Tickers(world_indices)

In [9]:
# Persist every downloaded group: all daily frames first, then all monthly frames.
daily_groups = [
    ('coins', top_coins, yf_coins),
    ('tech', tech_stocks, yf_tech),
    ('commo', commodities, yf_commo),
    ('indices', world_indices, yf_indices),
    ('S&P_500', sp500, yf_sp500),
]
monthly_groups = [
    ('coins', top_coins, yfm_coins),
    ('tech', tech_stocks, yfm_tech),
    ('commo', commodities, yfm_commo),
    ('indices', world_indices, yfm_indices),
    ('S&P_500', sp500, yfm_sp500),
]

for interval, groups in (('daily', daily_groups), ('monthly', monthly_groups)):
    for loc, group_symbols, frames in groups:
        save_csv(loc, group_symbols, frames, interval)

In [10]:
# Column-oriented accumulator: one independent, initially empty list per output column.
ticker_info = {
    column: []
    for column in ('symbol', 'name', 'description', 'type', 'sector', 'industry')
}
# All symbols of the five groups, concatenated in this fixed order.
symbols = [*top_coins.tolist(), *commodities, *tech_stocks, *world_indices, *sp500]
all_tickers = yf.Tickers(symbols)

In [11]:
# Collect descriptive metadata for every symbol (one `.info` lookup per ticker).
for sym in tqdm(symbols):
    info = all_tickers.tickers[sym].info
    # Fall back to the symbol when no short name is reported, so a single incomplete
    # record does not abort the whole loop with a KeyError.
    short_name = info.get('shortName') or sym
    ticker_info['symbol'].append(sym)
    # Strip the ' USD' / ' Dec 21' suffixes from the reported short name.
    ticker_info['name'].append(short_name.replace(' USD', '').replace(' Dec 21', ''))
    # Every optional field defaults to an empty string when it is absent.
    ticker_info['description'].append(info.get('description', ''))
    ticker_info['type'].append(info.get('quoteType', ''))
    ticker_info['sector'].append(info.get('sector', ''))
    ticker_info['industry'].append(info.get('industry', ''))

  0%|          | 0/652 [00:00<?, ?it/s]

In [12]:
pd.DataFrame(ticker_info).to_csv('datasets/ticker_info.csv', index=False)

In [121]:
# Price change OR returns % = (V2 - V1) / V1 * 100 
# compute daily changes and % changes
# Volatility is often measured as either the standard deviation or variance between returns from that same security or market index.

def _add_change_columns(frame):
    """Add row-over-row `change`, `percent_change` and `returns` columns from `Close`, in place."""
    close = frame['Close']
    previous = close.shift(1)  # NaN on the first row, which therefore gets NaN in every column
    frame['change'] = close - previous
    frame['percent_change'] = (close - previous) / previous * 100
    frame['returns'] = (close / previous - 1) * 100


def compute_change(qtype, symbol):
    """Compute price changes, returns and volatility for one symbol.

    Reads ``datasets/daily/{qtype}/{symbol}.csv`` and
    ``datasets/monthly/{qtype}/{symbol}.csv`` and adds these columns to both frames:

    * ``change``: ``Close`` minus the previous row's ``Close``.
    * ``percent_change`` and ``returns``: the same difference as a percentage
      of the previous ``Close``.
    * ``volatility``:
      - daily: population standard deviation (ddof=0) of ``Close`` over the
        previous, current and next row (only the rows that exist);
      - monthly: population standard deviation of the daily ``Close`` values
        whose ``Date`` falls in the same ``YYYY-MM`` month.

    The first row of each frame has no previous row, so all four columns are
    NaN there. The daily frame also gets a ``Month`` column (``Date[:7]``).

    Parameters
    ----------
    qtype : str
        Sub-directory name (e.g. the ``loc`` used when the CSVs were saved).
    symbol : str
        File name without the ``.csv`` extension.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The enriched daily and monthly frames. Earlier versions computed the
        columns but returned nothing, so the results were lost.
    """
    yf_data = pd.read_csv(f'datasets/daily/{qtype}/{symbol}.csv')
    yfm_data = pd.read_csv(f'datasets/monthly/{qtype}/{symbol}.csv')

    # --- daily ---
    _add_change_columns(yf_data)
    daily_close = yf_data['Close']
    # Put the previous/current/next close side by side and take the row-wise std;
    # missing neighbours (e.g. after the last row) are skipped.
    neighbours = pd.concat(
        [daily_close.shift(1), daily_close, daily_close.shift(-1)],
        axis=1,
        keys=['previous', 'current', 'next'],
    )
    daily_volatility = neighbours.std(axis=1, ddof=0)
    daily_volatility.iloc[:1] = np.nan  # first row has no previous close
    yf_data['volatility'] = daily_volatility

    yf_data['Month'] = yf_data['Date'].str[:7]

    # --- monthly ---
    _add_change_columns(yfm_data)
    std_by_month = yf_data.groupby('Month')['Close'].std(ddof=0)
    monthly_volatility = yfm_data['Date'].str[:7].map(std_by_month)
    monthly_volatility.iloc[:1] = np.nan  # first row has no previous close
    yfm_data['volatility'] = monthly_volatility

    return yf_data, yfm_data

### Reddit Posts
https://github.com/gabrielpreda/reddit_extract_content + https://pushshift.io/<br/>

/r/cryptocurrency and /r/* for each coin name

In [131]:
import praw
import snscrape.modules.reddit as snreddit

In [142]:
start_unix, end_unix = 1514764800, 1635724799

# Search scraper for 'cryptocurrency', limited to the window given by the two timestamps above.
search_scraper = snreddit.RedditSearchScraper('cryptocurrency', comments = False, before=end_unix, after=start_unix)

results = []
for post in tqdm(search_scraper.get_items()):
    record = dict(
        id=post.id,
        text=post.selftext,
        topic=post.subreddit,
        title=post.title,
        date=post.created.strftime('%Y-%m-%d'),
    )
    results.append(record)
    # Unconditional break: at most one post is ever collected by this cell.
    break

0it [00:00, ?it/s]

In [133]:
pd.DataFrame(results).to_csv('datasets/reddit/cryptocurrency.csv', index=False)

In [None]:
start_unix, end_unix = 1514764800, 1635724799
day_unix = 86400

# Walk backwards from the last day, one 86400-second window per iteration.
# end_unix - start_unix + 1 is exactly 1400 days, so every window starts one
# second before a day boundary (at start_unix - 1 + k * day_unix).
current = end_unix - day_unix
results = []

# The bound is `start_unix - 1`, not `start_unix`: the window covering the very first
# day starts at start_unix - 1, and comparing against start_unix stopped one
# iteration early, so that day was never requested.
while current >= start_unix - 1:
    start = current
    end = current + day_unix

    daily_results = []
    for post in snreddit.RedditSubredditScraper('cryptocurrency', comments = False, before=end, after=start).get_items():
        daily_results.append({
            'id': post.id,
            'text': post.selftext,
            'topic': post.subreddit,
            'title': post.title,
            'date': post.created.strftime('%Y-%m-%d')
        })

        # Keep at most 100 submissions per window.
        if len(daily_results) >= 100:
            break

    # Extend in place instead of `results = results + daily_results`, which copied
    # the whole accumulated list on every iteration.
    results.extend(daily_results)
    # Rewrite the full CSV after every window so an interrupted run keeps its progress.
    pd.DataFrame(results).to_csv('datasets/reddit/cryptocurrency.csv', index=False)
    print(f'saved reddit {len(results)} submissions')
    current = current - day_unix

In [10]:


reddit = praw.Reddit(client_id=personal_use_script, client_secret=client_secret, user_agent=user_agent, username=username, password=password)

In [11]:
# MaticNetwork + 0xPolygon -> polygon
# XRP + Ripple -> XRP

# reddit topic (subreddit) by coinmarketcap.com
topics = ['cryptocurrency', 
          'Bitcoin', 'ethereum', 'binance', 'Tether', 'cardano', 'Solana', 'XRP', 'Ripple', 'Polkadot', 'HEXcrypto', 
          'SHIBArmy', 'dogecoin', 'USDC', 'terraluna', 'UniSwap', 'Avax', 'Chainlink', 'Litecoin', 'MaticNetwork', '0xPolygon', 
          'AlgorandOfficial', 'BitcoinCash', 'stellar', 'vechain', 'AxieInfinity', 'cosmosnetwork', 'dfinity', 'Filecoin', 'Tronix', 'FTXToken', 
          'theta_network', 'EthereumClassic', 'FantomFoundation', 'DAItoken', 'Crypto_com', 'Hedera', 'elrondnetwork', 'Tezos', 'decentraland', 'Monero', 
          'thegraph', 'EOS', 'PancakeSwap', 'FlowBlockchain', 'Aave_Official', 'thorchain', 'Iota', 'Kusama', 'makerdao', 'QuantNetwork',
          'harmony_one', 'NEO', 'bitcoinsv', 'TheSandboxGaming', 'HeliumNetwork', 'Wavesplatform', 'chiliZ', 'BittorrentToken', 'Arweave', 'stacks',
          'AMPToken', 'EnjinCoin', 'Compound', 'holochain', 'celo', 'zec', 'dashpay', 'OMGnetwork', 'thetafuel', 'nem', 
          'CurveDAO', 'loopringorg', 'Qtum', 'SushiSwap', 'helloicon', 'decred', 'BATProject', 'Creditcoin', 'synthetix_io', 'Ravencoin',
          'zilliqa', 'yearn_finance', 'kadena', 'TUSD', 'xinfin', 'ECOMI', 'BitcoinGoldHQ', 'ProjectSerum', '0xProject', 'Bancor',
          'ANKR', 'CelsiusNetwork', 'Siacoin', 'Counosplatform', 'Horizen', 'IOStoken', 'Raydium', 'OntologyNetwork', 'SKALEnetwork', 'WAX_io',
          'Digibyte', 'nanocurrency'
         ]

df_dict = { 
    'date': [],
    'topic': [],
    'title': [],
    'selftext': [],
    'score': [],
    'upvote_ratio': [],
    'num_comments': [],
    'created_utc': []
}

In [36]:


# for topic in topics:
#     df_dict['topic'].append(topic)
#     subreddit = reddit.subreddit(topic)
#     last_sub = {'name': None, 'created_utc': start_unix}
    
#     while last_sub['created_utc'] >= start_unix:
#         for sub in subreddit.new(limit=1000, params={'after': last_sub['name']}):
#             if sub.created_utc >= start_unix and sub.created_utc <= end_unix:
#                 print(sub.title, sub.created_utc)
#         last_sub['name'] = sub.name
#         last_sub['created_utc'] = sub.created_utc
#         print(sub.created_utc)
#     break
        

#     it_start = start_unix
#     it_end = start_unix + 86400
#     while it_end <= end_unix:
#         print(it_start, it_end)
#         for sub in subreddit.new(limit=None):
#             print(sub.created_utc)
# #             if sub.created_utc >= start_unix and sub.created_utc <= end_unix:
# #                 print(sub.created_utc)
# #                 break
#             break
#         it_start = it_end
#         it_end = it_end + 86400
#     break
    
#     for submission in reddit.subreddit('redditdev').submissions(1475280000, 1480550400):
#     print(submission.created_utc)

In [22]:
test[5].created_utc

1636914047.0

In [82]:
f'timestamp:{start_unix}..{end_unix}'

'timestamp:1514764800..1635724799'

In [16]:
new_subreddit = subreddit.hot()

In [25]:
for submission in tqdm(new_subreddit):
    print(submission.comments)
    break
#     topics_dict["title"].append(submission.title)
#     topics_dict["score"].append(submission.score)
#     topics_dict["id"].append(submission.id)
#     topics_dict["url"].append(submission.url)
#     topics_dict["comms_num"].append(submission.num_comments)
#     topics_dict["created"].append(submission.created)
#     topics_dict["body"].append(submission.selftext)

0it [00:00, ?it/s]

<praw.models.comment_forest.CommentForest object at 0x7fbeb4091dc0>


### Tweets
https://github.com/itsayushisaxena/Get_Old_Tweets-Python <br/>
https://www.kaggle.com/davidwallach/financial-tweets <br/>
https://github.com/dwallach1/Stocker <br/>
https://towardsdatascience.com/an-extensive-guide-to-collecting-tweets-from-twitter-api-v2-for-academic-research-using-python-3-518fcb71df2a

In [2]:
import re
import tweepy
import snscrape.modules.twitter as sntwitter

In [3]:
with open("top_symbols.npy", "rb") as f:
    top_coins = np.load(f)

symbols = [re.sub(r'\d', '', coin.split('-')[0]) for coin in top_coins]

# tags = ['crypto', 'cryptocurrency', 'bitcoin']

In [6]:
symbols[1:2]

['ETH']

In [5]:
# auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# auth.set_access_token(access_token, access_token_secret)
# api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# sntwitter.TwitterHashtagScraper
# sntwitter.TwitterSearchScraper

In [87]:
symbols = [re.sub(r'\d', '', coin.split('-')[0]) for coin in top_coins]

In [93]:
keywords = ['Cryptocurrency', 'Bitcoin', 'Ethereum', 'Binance Coin', 'Tether', 'Cardano', 'Solana', 'XRP', 'Ripple', 'Polkadot', 
           'HEX cryptocurrency', 'SHIBA INU coin', 'Dogecoin', 'USDCoin', 'Terra Luna', 'Uniswap', 'Avalanche coin', 'Chainlink', 'Litecoin', 'Matic Network', 'Polygon cryptocurrency', 
           'Algorand', 'Bitcoin Cash', 'Stellar', 'VeChain', 
           'Axie Infinity coin', 'cosmos coin', 'Dfinity', 'Internet Computer coin', 'Filecoin', 'TRON cryptocurrency', 'FTX Token', 'THETA Token', 'Ethereum Classic', 
           'Fantom cryptocurrency', 'Dai cryptocurrency', 'Crypto.com Coin', 'Hedera Hashgraph', 
           'Elrond cryptocurrency', 'Tezos', 'Decentraland coin', 'Monero', 'The Graph coin', 'EOSIO', 'PancakeSwap', 'Flow coin', 'Aave', 'THORChain', 'IOTA', 'Kusama coin',
           'Maker cryptocurrency', 'Quant network', 'Harmony coin', 'NEO cryptocurrency', 
           'Bitcoin SV', 'The Sandbox coin', 'Helium coin', 'Waves coin', 'Chiliz', 'BitTorrent Token', 'Arweave', 'Stacks coin', 'Amp token', 'Enjin Coin', 
           'Compound token', 'Holo token', 'Celo cryptocurrency', 'Zcash', 'Dash cryptocurrency', 'OmiseGO', 
           'Theta Fuel', 'NEM coin', 'Curve DAO Token', 'Loopring', 'Qtum', 'SushiSwap', 'ICON cryptocurrency', 'Decred', 'Basic Attention Token', 'Creditcoin', 
           'Synthetix', 'Synthetix Network Token', 'Ravencoin', 'Zilliqa', 
           'yearn.finance', 'Kadena coin', 'TrueUSD', 'XinFin Network', 'ECOMI', 'BitcoinGold', 'Serum coin', 'Project Serum', '0x', 'Bancor', 'Ankr', 
           'Celsius cryptocurrency', 'Siacoin', 'Counos X', 'Horizen', 'IOST', 'Raydium cryptocurrency', 
           'Ontology coin', 'SKALE', 'SKALE Network', 'WAX cryptocurrency', 'DigiByte', 'Nano cryptocurrency']

In [125]:
maxTweets = 10
tweets = []

# for kw in []

# NOTE(review): `words` is not assigned in any cell visible in this notebook; it must exist
# in the kernel before this cell runs (possibly `keywords` was intended - confirm).
query = f'{words[0]} since:2021-10-31 until:2021-11-01 -filter:retweets'

# Scrape the search results and keep only tweets with at least 10 combined interactions.
for i, tweet in tqdm(enumerate(sntwitter.TwitterSearchScraper(query).get_items())):
    engagement = tweet.replyCount + tweet.retweetCount + tweet.likeCount + tweet.quoteCount
    if engagement < 10:
        continue
    tweets.append(dict(
        id=tweet.id,
        date=tweet.date.strftime('%Y-%m-%d'),
        text=tweet.content,
        reply_count=tweet.replyCount,
        retweet_count=tweet.retweetCount,
        like_count=tweet.likeCount,
        quote_count=tweet.quoteCount,
        hashtags=tweet.hashtags,
        cashtags=tweet.cashtags,
    ))
# The maxTweets cap is currently disabled:
#     if len(tweets) > maxTweets:
#         break

0it [00:00, ?it/s]

In [None]:
# NOTE(review): `start_year`, `start_mon`, `stop_year`, `stop_mon` and `kw` are not assigned in
# this cell, and `GoogleNews` is imported only in a later cell - confirm they are defined
# before this cell is run.
start_date = dt.date(start_year, start_mon, 1) 
stop_date = dt.date(stop_year, stop_mon, 1) # get_last_date_of_month(stop_year, stop_mon)

current = start_date
results = []

# One single-day search per iteration, from start_date through stop_date inclusive.
while current <= stop_date:
    day_label = current.strftime('%m/%d/%Y')
    gnews = GoogleNews(lang='en', start=day_label, end=day_label)
    gnews.search(kw)

    # title, media, datetime, desc, link
#         for page in tqdm(range(1, 31)):
#             page_results = gnews.page_at(page)

#             if page_results == []:
#                 break
#             else:
#                 results = results + page_results

#             time.sleep(np.random.randint(7, 18))

    results = results + gnews.results()
    print(f"{kw}:{day_label}")
    current = current + dt.timedelta(days=1)
    gnews.clear()

In [146]:
(dt.date(2018, 1, 31) - dt.timedelta(days = 1)).strftime('%Y-%m-%d')

'2018-01-30'

### Coin Features and Internal Characteristics and Wikipedia Page View 
https://wikipedia.readthedocs.io/en/latest/quickstart.html <br/>
https://en.wikipedia.org/wiki/List_of_cryptocurrencies <br/>
https://en.wikipedia.org/wiki/Stablecoin <br/>
https://www.investopedia.com/tech/most-important-cryptocurrencies-other-than-bitcoin/ <br/>
https://www.laptopmag.com/best-picks/best-cryptocurrency-to-buy-in-2021 <br/>
https://levelup.gitconnected.com/the-7-types-of-cryptocurrencies-you-must-know-3b26b2ce0eb8 <br/>
https://www.sofi.com/learn/content/understanding-the-different-types-of-cryptocurrency/ <br/>

Coin characteristics, country of origin, utility, etc.

In [9]:
import wikipedia as wp

from mwviews.api import PageviewsClient

In [25]:
# coin features from coinmarketcap.com property and tags
coin_features = pd.read_csv('datasets/coin_features.csv')
coin_names = pd.read_csv('datasets/ticker_info.csv')[:100]['name'].to_list()

In [10]:
# Sends a descriptive User-Agent header with every request
p = PageviewsClient(user_agent="<patara.t@kaist.ac.kr> Cryptocurrency-related articles pageview")

#N/A: Loopring, DigiByte, IOST, WAX, SKALE Network, Horizon, raydium, Ontology, Ankr, Celsius, Serum, Siacoin, CounosX, XinFin Network, Kadena, yearn finance, Zilliqa, Synthetix Network Token, Ravencoin, CreditCoin, ICON, Decred, Sushi, Qtum, NEM, TFUEL, Omise GO, celo, HEX, Terra, Polygon (Matic Network), VeChain, Cosmos, Theta, Fantom, Elrond, The Graph, pancake swap, Flow, THOR, Kusama, Maker, Quant, Harmony, Bitcoin SV, Helium, Waves, Chiliz, Arweave, Amp, EnjinCoin, Compound, Holo
article_names = ['Cryptocurrency', 'Cryptocurrency_exchange', 'Cryptocurrency_wallet', 'Bitcoin', 'Ethereum', 'Binance', 'Tether_(cryptocurrency)', 'Cardano_(blockchain_platform)', 'Solana_(blockchain_platform)', 'Ripple_(payment_protocol)', 
                'Polkadot_(cryptocurrency)', 'Shiba_Inu_(cryptocurrency)', 'Dogecoin', 'USD_Coin', 'Uniswap', 'Avalanche_(cryptocurrency)', 'Chainlink_(blockchain)', 'Litecoin', 
                'Algorand', 'Bitcoin_Cash', 'Stellar_(payment_network)', 'Axie_Infinity', 'Dfinity', 'Filecoin', 'Tron_(cryptocurrency)', 'FTX_(company)', 'Ethereum_Classic',
                'Dai_(cryptocurrency)', 'Crypto.com', 'Hashgraph', 'Tezos', 'Decentraland', 'Monero', 'EOS.IO', 'IOTA_(technology)', 'NEO_(cryptocurrency)', 'The_Sandbox_(video_game)',
                'Rainberry,_Inc.', 'Stacks_blockchain', 'Zcash', 'Dash_(cryptocurrency)', 'The_DAO_(organization)', 'Brave_(web_browser)', 'Stablecoin', 'Bitcoin_Gold', 
                '0x_(decentralized_exchange_infrastructure)', 'Bancor_(cryptocurrency)', 'Nano_(cryptocurrency)']

article_views = p.article_views('en.wikipedia', article_names, granularity='daily', start='20180101', end='20211031')

In [12]:
# Turn the date-keyed mapping into a list of records, tagging each record with its date
# as a 'YYYY-MM-DD' string (the stored records themselves are modified).
views = []
for day, day_views in article_views.items():
    day_views['Date'] = day.strftime('%Y-%m-%d')
    views.append(day_views)

In [18]:
# pd.DataFrame(views).to_csv('datasets/wiki_pageviews.csv', index=False)

### Google Trends and News
https://github.com/GeneralMills/pytrends <br/>
https://github.com/Iceloof/GoogleNews <br/>
cryptocoinsnews.com

In [2]:
from pytrends.request import TrendReq
from pytrends.dailydata import get_daily_data, get_last_date_of_month, convert_dates_to_timeframe

In [3]:
keywords = ['Cryptocurrency', 'Cryptocurrency exchange', 'Cryptocurrency wallet', 'Bitcoin', 'Ethereum', 'Binance Coin', 'Tether', 'Cardano', 'Solana', 'XRP', 'Ripple', 'Polkadot', 
           'HEX cryptocurrency', 'SHIBA INU coin', 'Dogecoin', 'USDCoin', 'Terra Luna', 'Uniswap', 'Avalanche coin', 'Chainlink', 'Litecoin', 'Matic Network', 'Polygon cryptocurrency', 
           'Algorand', 'Bitcoin Cash', 'Stellar', 'VeChain', 
           'Axie Infinity coin', 'cosmos coin', 'Dfinity', 'Internet Computer coin', 'Filecoin', 'TRON cryptocurrency', 'FTX Token', 'THETA Token', 'Ethereum Classic', 
           'Fantom cryptocurrency', 'Dai cryptocurrency', 'Crypto.com Coin', 'Hedera Hashgraph', 
           'Elrond cryptocurrency', 'Tezos', 'Decentraland coin', 'Monero', 'The Graph coin', 'EOSIO', 'PancakeSwap', 'Flow coin', 'Aave', 'THORChain', 'IOTA', 'Kusama coin',
           'Maker cryptocurrency', 'Quant network', 'Harmony coin', 'NEO cryptocurrency', 
           'Bitcoin SV', 'The Sandbox coin', 'Helium coin', 'Waves coin', 'Chiliz', 'BitTorrent Token', 'Arweave', 'Stacks coin', 'Amp token', 'Enjin Coin', 
           'Compound token', 'Holo token', 'Celo cryptocurrency', 'Zcash', 'Dash cryptocurrency', 'OmiseGO', 
           'Theta Fuel', 'NEM coin', 'Curve DAO Token', 'Loopring', 'Qtum', 'SushiSwap', 'ICON cryptocurrency', 'Decred', 'Basic Attention Token', 'Creditcoin', 
           'Synthetix', 'Synthetix Network Token', 'Ravencoin', 'Zilliqa', 
           'yearn.finance', 'Kadena coin', 'TrueUSD', 'XinFin Network', 'ECOMI', 'BitcoinGold', 'Serum coin', 'Project Serum', '0x', 'Bancor', 'Ankr', 
           'Celsius cryptocurrency', 'Siacoin', 'Counos X', 'Horizen', 'IOST', 'Raydium cryptocurrency', 
           'Ontology coin', 'SKALE', 'SKALE Network', 'WAX cryptocurrency', 'DigiByte', 'Nano cryptocurrency']

In [None]:
# One Google Trends CSV per keyword; spaces in the keyword become underscores in the file name.
for kw in tqdm(keywords):
    file_stem = kw.replace(' ', '_')
    df = get_daily_data(kw, start_year=2018, start_mon=1, stop_year=2021, stop_mon=10, geo='')
    df.to_csv(f"datasets/gtrends/{file_stem}.csv")

In [4]:
import nltk
import requests

from GoogleNews import GoogleNews
from newspaper import Article
from newspaper import Config
 
from gnews import GNews, utils

In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/patara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [37]:
# NOTE(review): an API token was committed here in plain text. It is replaced by a
# placeholder below; the original token should be revoked and the value read from the
# environment (e.g. os.environ) if this request is ever re-enabled.
# params = {
#     'token': '<GNEWS_API_TOKEN>',
#     'q': 'Cryptocurrency',
#     'lang': 'en',
#     'from': '2021-10-31T00:00:00Z',
#     'to': '2021-10-31T23:59:59Z',
#     'sortby': 'relevance'
# }
# res = requests.get('https://gnews.io/api/v4/search', params)

In [4]:
# df = {'Cryptocurrency': [], 'Cryptocurrency exchange': [], 'Cryptocurrency market': []}

In [19]:
cryptocurrency = pd.read_csv(f"datasets/news/Cryptocurrency.csv")
cryptocurrency_exchange = pd.read_csv(f"datasets/news/Cryptocurrency_exchange.csv")
cryptocurrency_market = pd.read_csv(f"datasets/news/Cryptocurrency_market.csv")

### Global Economic Policy Uncertainty Index (GEPU) and Geopolitical Risk Index (GPR)
http://www.policyuncertainty.com <br/>
https://www.policyuncertainty.com/global_monthly.html <br/>
https://www.matteoiacoviello.com/gpr.htm <br/>

The Caldara and Iacoviello GPR index reflects automated text-search results of the electronic archives of 10 newspapers: Chicago Tribune, the Daily Telegraph, Financial Times, The Globe and Mail, The Guardian, the Los Angeles Times, The New York Times, USA Today, The Wall Street Journal, and The Washington Post. Caldara and Iacoviello calculate the index by counting the number of articles related to adverse geopolitical events in each newspaper for each month (as a share of the total number of news articles).

The search is organized in eight categories: War Threats (Category 1), Peace Threats (Category 2), Military Buildups (Category 3), Nuclear Threats (Category 4), Terror Threats (Category 5), Beginning of War (Category 6), Escalation of War (Category 7), Terror Acts (Category 8). Based on the search groups above, Caldara and Iacoviello also construct two subindexes. The Geopolitical Threats (GPRT) index includes words belonging to categories 1 to 5 above. The Geopolitical Acts (GPRA) index includes words belonging to categories 6 to 8.

In [22]:
gepu = pd.read_csv('datasets/Global_Policy_Uncertainty_Data.csv')
gepu_country = pd.read_csv('datasets/GEPU_Country_Data.csv')

In [23]:
twu = pd.read_csv('datasets/Twitter_Economic_Uncertainty.csv')

In [24]:
gpr_country = pd.read_csv('datasets/gpr_export.csv')
gpr_daily = pd.read_csv('datasets/gpr_daily_recent.csv')