In [50]:
import pandas as pd
import requests
import json

## Scraping News Using NewsCatcher

Credits: https://docs.newscatcherapi.com/knowledge-base/guides-and-tutorials/export-news-into-a-csv-with-python

In [55]:
# URL of our News API
base_url = 'https://api.newscatcherapi.com/v2/search'

# Your API key
X_API_KEY = open(r'api_newscatcher.key').read()


In [139]:
def get_news_articles(filter):
    # Put your API key to headers in order to be authorized to perform a call
    headers = {'x-api-key': X_API_KEY}

    articles = []
    # Define your desired parameters
    for i in range(1,101):
        try:
            params = {
                'q': filter,
                'lang': 'en',
                'to_rank': 1000,
                'page_size': 100,
                'page': i
                }

            # Make a simple call with both headers and params
            response = requests.get(base_url, headers=headers, params=params)

            # Encode received results
            results = json.loads(response.text.encode())
            articles = articles + results['articles']
        except:
            pass
        news = pd.DataFrame(articles)[['title','excerpt','summary','published_date']].fillna('')
        news['full_content'] = news.apply(lambda x: x['title']+' '+x['summary']+' '+x['excerpt'],axis=1)

    return news

In [140]:
filter = 'Bitcoin OR Ethereum OR Dogecoin OR Ukraine OR Russia OR war OR blockchain OR money'
news_catcher = get_news_articles(filter=filter)

# Data Cleaning

In [146]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))

In [147]:
def clean_text(text):
    word_tokens = word_tokenize(text)
    filtered_sentence = [w.lower() for w in word_tokens if not w.lower() in stop_words]
    return ' '.join(filtered_sentence)

In [148]:
news_catcher['full_content'] = news_catcher['full_content'].apply(clean_text)

In [149]:
[x for x in news_catcher['full_content'].sample(5)]

["rockets strike ukraine 's lviv biden says putin ' remain power ' warsaw/lviv , ukraine , march 27 ( reuters ) - u.s. president joe biden described russian leader vladimir putin butcher `` remain power '' meeting ukrainian refugees poland , kremlin forces stepped attacks across ukraine , including western city lviv.biden 's comments , escalation u.s. rhetoric towards moscow invasion ukraine , call regime change russia , white house official said later , meant prepare world 's democracies extended conflict . u.s. president joe biden described russian leader vladimir putin butcher `` remain power '' meeting ukrainian refugees poland , kremlin forces stepped attacks across…",
 "hackers swipe nearly $ 600 million 'play earn ' crypto game digital thieves pulled another major crypto heist . motherboard learned hackers stole 173,600 ethereum ( $ 591.2 million ) ronin blockchain powers axie infinity , popular `` play earn '' game players receive crypto exchange playing paying starting costs .

# Using BERTopic to identify trends

In [94]:
from bertopic import BERTopic

In [150]:
model = BERTopic(nr_topics=10,
                 verbose=True,
                 n_gram_range=(1,3),
                 min_topic_size=5,
                 embedding_model='all-MiniLM-L6-v2')


In [151]:
topics, probabilities = model.fit_transform(news_catcher['full_content'].values)

Batches: 100%|██████████| 57/57 [02:42<00:00,  2.85s/it]
2022-03-31 21:07:36,146 - BERTopic - Transformed documents to Embeddings
2022-03-31 21:07:44,426 - BERTopic - Reduced dimensionality with UMAP
2022-03-31 21:07:44,606 - BERTopic - Clustered UMAP embeddings with HDBSCAN
2022-03-31 21:07:50,396 - BERTopic - Reduced number of topics from 85 to 11


In [152]:
news_catcher['topic'] = topics

In [153]:
news_catcher = pd.merge(news_catcher, model.get_topic_info(), how='left',left_on='topic',right_on='Topic').drop(['topic','Count'],axis=1).rename(columns={'Name':'TopicName'})

In [154]:
news_catcher.head()

Unnamed: 0,title,excerpt,summary,published_date,full_content,Topic,TopicName
0,Crypto prices rise by more than 5%; Bitcoin up...,Cryptocurrency prices were higher early Monday...,Cryptocurrency prices were higher early Monday...,2022-03-28 08:57:31,crypto prices rise 5 % ; bitcoin 12 % since ma...,0,0_bitcoin_crypto_ethereum_cryptocurrency
1,Ethereum's major upgrade is coming. Should you...,A weekly look at the most important moves and ...,"Hello! Welcome back to Distributed Ledger, our...",2022-03-31 19:27:00,ethereum 's major upgrade coming . bullish bit...,0,0_bitcoin_crypto_ethereum_cryptocurrency
2,"Stocks lower as inflation fears grow, gold dow...",FOX Business is providing real-time updates on...,6PostsSort BySort by NewestSort by OldestCrypt...,2022-03-28 09:20:10,"stocks lower inflation fears grow , gold , gas...",-1,-1_ukraine_russia_russian_war
3,Mobile phones bring direct aid to Ukraine,"Tech solutions like cryptocurrencies, Airbnb, ...",Mobile phone users are donating to Ukraine via...,2022-03-31 13:00:42,mobile phones bring direct aid ukraine mobile ...,-1,-1_ukraine_russia_russian_war
4,Panels at Quinnipiac University's Virtual GAME...,Press release content from PR Newswire. The AP...,"HAMDEN, Conn., March 28, 2022 /PRNewswire/ -- ...",2022-03-28 19:07:54,panels quinnipiac university 's virtual game x...,-1,-1_ukraine_russia_russian_war


In [155]:
model.get_topic_info()['Name'].values

array(['-1_ukraine_russia_russian_war',
       '0_bitcoin_crypto_ethereum_cryptocurrency',
       '1_kyiv_ukraine_forces_russia', '2_stocks_talks_ukraine_russia',
       '3_refugees_million_ukraine_fled', '4_putin_ukraine_war_russian',
       '5_putin_russian_intelligence_ukraine',
       '6_gas_russian_germany_russia',
       '7_crypto_cryptocurrency_ethereum_nft',
       '8_biden_putin_president_power',
       '9_russian_ukraine_crimes_ukrainian'], dtype=object)

In [156]:
model.visualize_barchart()


In [157]:
model.visualize_heatmap()

In [159]:
topics_over_time = model.topics_over_time(news_catcher['full_content'].values,topics, list(news_catcher['published_date'].values))

1563it [08:51,  2.94it/s]


In [160]:
topics_over_time

Unnamed: 0,Topic,Words,Frequency,Timestamp
0,-1,"europe, ukraine, acted, outage, europe acted",2,2022-03-25 00:00:00
1,0,"arw, arowana, bittrex, bittrex global, exchange",1,2022-03-25 00:00:00
2,9,"prosecutor general, prosecutor, rape, invading...",1,2022-03-25 00:00:53
3,-1,"tutor, ukraine panto, panto, glasgow universit...",2,2022-03-25 00:01:00
4,-1,"india, shaky response, somewhat shaky response...",1,2022-03-25 00:09:30
...,...,...,...,...
1638,6,"gas, deadline, resume, ending five week, war s...",1,2022-04-01 02:15:00
1639,-1,"canberra, australia, house, parliament, via",1,2022-04-01 02:47:51
1640,4,"sputnik afp, information war, sputnik, afp, uk...",1,2022-04-01 03:43:55
1641,3,"visa, scheme, visa scheme, born, wife",1,2022-04-01 03:45:00


In [161]:
model.visualize_topics_over_time(topics_over_time, top_n_topics=4)

In [162]:
topics_over_time_cum = topics_over_time.sort_values('Timestamp').drop('Words',axis=1) \
                                       .query('Topic != -1').groupby(['Topic', 'Name','Timestamp'])['Frequency'].sum() \
                                       .groupby(level=0).cumsum().reset_index() \
                                       .merge(topics_over_time[["Topic",'Words','Timestamp','Name']])

In [163]:
model.visualize_topics_over_time(topics_over_time_cum, top_n_topics=20)

In [165]:
topics_over_time.query('Topic == 3').sort_values('Timestamp',ascending=False)

Unnamed: 0,Topic,Words,Frequency,Timestamp,Name
1641,3,"visa, scheme, visa scheme, born, wife",1,2022-04-01 03:45:00,3_refugees_million_ukraine_fled
1590,3,"refugees, przemysl, million, ukraine, forced",1,2022-03-31 19:45:00,3_refugees_million_ukraine_fled
1574,3,"border, southern border, moldova, ukraine, sou...",1,2022-03-31 18:45:00,3_refugees_million_ukraine_fled
1559,3,"miroshnychenko, fleeing, refugees, war, family...",1,2022-03-31 17:15:19,3_refugees_million_ukraine_fled
1552,3,"petryk, philadelphia, star, refuge philadelphi...",1,2022-03-31 16:58:27,3_refugees_million_ukraine_fled
1538,3,"jolie, humanitarian, refugees, hospital amid r...",1,2022-03-31 15:52:00,3_refugees_million_ukraine_fled
1496,3,"refugees, find jobs, slows, jobs, number",1,2022-03-31 13:30:00,3_refugees_million_ukraine_fled
1474,3,"family, hawaii, started, money, nowhere go march",1,2022-03-31 11:53:00,3_refugees_million_ukraine_fled
1374,3,"visas, refugees, shambolic, visas granted, gra...",1,2022-03-31 02:02:00,3_refugees_million_ukraine_fled
1329,3,"fled, million, ukraine, nazarov, four million",1,2022-03-30 20:58:06,3_refugees_million_ukraine_fled
