## Setup and Initialization

In [1]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import requests
from bs4 import BeautifulSoup
from datetime import datetime, date

# Download the VADER lexicon
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\windows\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


## Define Function to Scrape News

In [2]:
def get_news(ticker, pages=1):
    base_url = f"https://cointelegraph.com/tags/{ticker}"
    all_news = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }

    for page in range(1, pages + 1):
        response = requests.get(base_url + f"?page={page}", headers=headers)
        soup = BeautifulSoup(response.content, 'html5lib')
        articles = soup.find_all('li', class_='group-[.inline]:mb-8 col-span-1')

        for article in articles:
            try:
                commercial_name = ticker
                title = article.find('span', class_='post-card-inline__title').text.strip()
                date_element = article.find('time', class_='post-card-inline__date')
                date = date_element['datetime'] if date_element else None

                all_news.append({
                    'commercial_name': commercial_name,
                    'title': title,
                    'date': date
                })
            except Exception as e:
                continue

    return pd.DataFrame(all_news)

## Define Cryptocurrency List and Mapping

In [3]:
# List of cryptocurrencies
cryptocurrency_names = ['bitcoin', 'ethereum', 'cardano']
crypto_code_mapping = {
    'bitcoin': 'BTC-USD',
    'ethereum': 'ETH-USD',
    'cardano': 'ADA-USD'
}

## News for Each Cryptocurrency

In [4]:
# List to store all news data
all_news = []

In [5]:
for ticker in cryptocurrency_names:
    news_df = get_news(ticker, pages=50)
    all_news.append(news_df)
    print(f'News for {ticker} successfully downloaded.')

News for bitcoin successfully downloaded.
News for ethereum successfully downloaded.
News for cardano successfully downloaded.


In [6]:
# Combine all news DataFrames into a single DataFrame
combined_news_df = pd.concat(all_news)
combined_news_df['crypto_code'] = np.where(
    combined_news_df['commercial_name'].isin(crypto_code_mapping.keys()),
    combined_news_df['commercial_name'].map(crypto_code_mapping),
    np.nan  # Assign NaN if the symbol is not in the dictionary
)

## Preprocessing and Tokenization

In [7]:
# Preprocessing and tokenization
combined_news_df['cleaned_title'] = combined_news_df['title'].apply(lambda x: ' '.join(re.findall(r'\b\w+\b', x.lower())))
# Remove apostrophes
combined_news_df['cleaned_title'] = combined_news_df['cleaned_title'].str.replace(r"n't", " not")

In [8]:
combined_news_df

Unnamed: 0,commercial_name,title,date,crypto_code,cleaned_title
0,bitcoin,"How 1,500 new Bitcoin millionaires per day dea...",2024-05-30,BTC-USD,how 1 500 new bitcoin millionaires per day dea...
1,bitcoin,Here’s what happened in crypto today,2024-05-30,BTC-USD,here s what happened in crypto today
2,bitcoin,Traders say Bitcoin price fights ‘last resista...,2024-05-30,BTC-USD,traders say bitcoin price fights last resistan...
3,bitcoin,Bitcoin price aims for $69K as ‘hot’ US macro ...,2024-05-30,BTC-USD,bitcoin price aims for 69k as hot us macro dat...
4,bitcoin,"Exclusive: Joe Lubin unpacks SEC battle, Ether...",2024-05-30,BTC-USD,exclusive joe lubin unpacks sec battle ethereu...
...,...,...,...,...,...
745,cardano,President Joe Biden is trying hard to ‘kill cr...,2024-05-10,ADA-USD,president joe biden is trying hard to kill cry...
746,cardano,"Price analysis 5/8: BTC, ETH, BNB, SOL, XRP, D...",2024-05-08,ADA-USD,price analysis 5 8 btc eth bnb sol xrp doge to...
747,cardano,"Price analysis 5/6: SPX, DXY, BTC, ETH, BNB, S...",2024-05-06,ADA-USD,price analysis 5 6 spx dxy btc eth bnb sol xrp...
748,cardano,Why is Cardano (ADA) price up this week?,2024-05-06,ADA-USD,why is cardano ada price up this week


## Sentiment Analysis

In [9]:
# Calculate sentiment using VADER
combined_news_df['sentiment_SIA'] = combined_news_df['cleaned_title'].apply(lambda x: sid.polarity_scores(x)['compound'])
combined_news_df.to_csv('data/combined_crypto_news_with_sentiment.csv', sep=';', index=False)
print('Combined and analyzed news saved to combined_crypto_news_with_sentiment.csv')

Combined and analyzed news saved to combined_crypto_news_with_sentiment.csv


In [10]:
combined_news_df

Unnamed: 0,commercial_name,title,date,crypto_code,cleaned_title,sentiment_SIA
0,bitcoin,"How 1,500 new Bitcoin millionaires per day dea...",2024-05-30,BTC-USD,how 1 500 new bitcoin millionaires per day dea...,0.5574
1,bitcoin,Here’s what happened in crypto today,2024-05-30,BTC-USD,here s what happened in crypto today,0.0000
2,bitcoin,Traders say Bitcoin price fights ‘last resista...,2024-05-30,BTC-USD,traders say bitcoin price fights last resistan...,-0.4019
3,bitcoin,Bitcoin price aims for $69K as ‘hot’ US macro ...,2024-05-30,BTC-USD,bitcoin price aims for 69k as hot us macro dat...,-0.3182
4,bitcoin,"Exclusive: Joe Lubin unpacks SEC battle, Ether...",2024-05-30,BTC-USD,exclusive joe lubin unpacks sec battle ethereu...,-0.2732
...,...,...,...,...,...,...
745,cardano,President Joe Biden is trying hard to ‘kill cr...,2024-05-10,ADA-USD,president joe biden is trying hard to kill cry...,-0.7269
746,cardano,"Price analysis 5/8: BTC, ETH, BNB, SOL, XRP, D...",2024-05-08,ADA-USD,price analysis 5 8 btc eth bnb sol xrp doge to...,0.0000
747,cardano,"Price analysis 5/6: SPX, DXY, BTC, ETH, BNB, S...",2024-05-06,ADA-USD,price analysis 5 6 spx dxy btc eth bnb sol xrp...,0.0000
748,cardano,Why is Cardano (ADA) price up this week?,2024-05-06,ADA-USD,why is cardano ada price up this week,0.0000
