# Playing Around With News Data

This notebook has two parts.<br>
First part: news from Google News, a news aggregator (a perfect candidate for bulk news retrieval), is scraped and visualized as a network to identify any correlation between trending news across a few countries.

Implemented:<br>

1. [Google News Scraper](#gnscraper)<br>
1. [Using scraper for bulk scraping](#bulk_scraping)<br>
1. [Plotting in network](#news_network)<br>

Second part: instead of analysing across multiple news media brands, I use Twitter to scrape exactly one media outlet and investigate it further.

1. [Getting Tweets](#tweets)<br>
1. [Trending Topic Coverage](#trending_topic)<br>
1. [Topic Connectivity And Continuity](#topic_modelling)<br>
1. [Sentiment Analysis](#sentiment)<br>
1. [WordCloud of Entity](#wcentity)<br>

[Dependencies](#dependencies)

## Google News Scraper<a id='gnscraper'></a>

In [None]:
import copy
import typing
import urllib

import feedparser
import requests
from bs4 import BeautifulSoup
from dateparser import parse as parse_date

In [None]:
class Scraper:
    """Small client for the Google News RSS feeds.

    Feed URLs are built for one country/language edition, downloaded with
    ``requests``, parsed with ``feedparser`` and reduced to parallel lists of
    titles, publishers, publication times, summaries and links.
    """

    # Text that precedes every headline inside an entry's summary HTML
    # (``<a href="..." target="_blank">Headline</a>``).
    _ANCHOR_MARKER = 'target="_blank">'

    def __init__(self, language='en', country='MY') -> None:
        """
        :param str country: two string country code, example: 'MY', 'US'
        :param str language: news language
        """
        self.lang = language.lower()
        self.country = country.upper()
        self.BASE_URL = 'https://news.google.com/rss'

    def __news_parser(self, text) -> typing.Union[list, str]:
        """Extract the related articles listed in an entry's summary HTML.

        :param str text: summary HTML of a feed entry
        :return: list of ``{'url', 'title', 'publisher'}`` dicts, one per
            ``<li>`` that has both a link and a ``<font>`` tag; the raw
            ``text`` is returned unchanged if the HTML cannot be parsed
        """
        try:
            list_items = BeautifulSoup(text, 'html.parser').find_all('li')
        except Exception:
            return text

        sub_articles = []
        for li in list_items:
            try:
                sub_articles.append(
                    {
                        'url': li.a['href'],
                        'title': li.a.text,
                        'publisher': li.font.text,
                    }
                )
            except (AttributeError, KeyError, TypeError):
                # <li> without a link, an href or a publisher tag: skip it.
                continue
        return sub_articles

    def __ceid(self) -> str:
        """Return the query string selecting this country/language edition."""
        return '?ceid={}:{}&hl={}&gl={}'.format(
            self.country, self.lang, self.lang, self.country
        )

    def __add_sub_articles(self, entries) -> list:
        """Add a ``'sub_articles'`` key to every entry (in place) and return them.

        The value is the parsed summary, or ``None`` when the entry has no
        summary.
        """
        for entry in entries:
            if 'summary' in entry.keys():
                entry['sub_articles'] = self.__news_parser(entry['summary'])
            else:
                entry['sub_articles'] = None

        return entries

    def __parse_feed(self, feed_url, proxies=None) -> dict:
        """Download and parse one RSS feed.

        :param str feed_url: full feed URL
        :param dict proxies: optional ``requests`` proxies mapping
        :return dict: the ``'feed'`` and ``'entries'`` parts of the parsed feed
        :raises Exception: when Google redirects to its "unsupported" feed
        """
        # Exactly one request, so that a configured proxy is actually used.
        r = requests.get(feed_url, proxies=proxies)

        if 'https://news.google.com/rss/unsupported' in r.url:
            raise Exception('This feed is not available')

        d = feedparser.parse(r.text)

        # Without a proxy, let feedparser fetch the URL itself as a fallback.
        if not proxies and len(d['entries']) == 0:
            d = feedparser.parse(feed_url)

        return dict((k, d[k]) for k in ('feed', 'entries'))

    def __search_helper(self, query):
        """URL-encode a search query."""
        return urllib.parse.quote_plus(query)

    def __from_to_helper(self, validate=None) -> typing.Optional[str]:
        """Normalise a free-form date to ``YYYY-MM-DD``.

        :raises Exception: when the date cannot be parsed
        """
        try:
            return str(parse_date(validate).strftime('%Y-%m-%d'))
        except Exception as exc:
            raise Exception('Could not parse your date') from exc

    def __extract_summary(self, text: str) -> typing.Union[str, list]:
        """Pull the headline text(s) out of an entry's summary HTML.

        :param str text: summary HTML (may be ``None`` or empty)
        :return: for a summary with several links, one string with the
            headlines joined by ``'. '`` (the last link is left out, as in
            the original logic); for a summary with a single link, a
            one-element list; otherwise an empty list
        """
        if not text:
            return []

        pieces = text.split(self._ANCHOR_MARKER)

        if len(pieces) > 2:
            headlines = [
                piece.split('</a>')[0] for piece in pieces[:-1] if '</a>' in piece
            ]
            return '. '.join(headlines)

        if len(pieces) == 2 and '</a>' in text:
            return [pieces[1].split('</a>')[0]]

        return []

    def __clean_news(self, r, n: int, show: bool) -> dict:
        """Reduce a parsed feed to parallel lists of the first ``n`` entries.

        :param dict r: parsed feed with an ``'entries'`` list
        :param int n: maximum number of entries to keep
        :param bool show: print every kept entry
        :return dict: keys ``titles``, ``publishers``, ``published_times``,
            ``summaries`` and ``links``
        """
        entries = r.get('entries') or []
        # min() also covers n == len(entries), which used to yield nothing.
        required = max(0, min(n, len(entries)))

        titles, publishers, published_times, summaries, links = [], [], [], [], []

        for entry in entries[:required]:
            # Titles look like "<headline> - <publisher>": split on the last
            # dash. Without a dash both values are the whole title.
            title_parts = (entry.get('title') or '').rsplit('-', 1)
            title = title_parts[0]
            publisher = title_parts[-1].strip()
            published_time = entry.get('published')
            summary = self.__extract_summary(entry.get('summary'))
            link = entry.get('link')

            titles.append(title)
            publishers.append(publisher)
            published_times.append(published_time)
            summaries.append(summary)
            links.append(link)

            if show:
                print('Title: ', title)
                print('Publisher: ', publisher)
                print('Published Time: ', published_time)
                print('Summary: ', summary)
                print('Link: ', link)
                print('\n')

        return {
            'titles': titles,
            'publishers': publishers,
            'published_times': published_times,
            'summaries': summaries,
            'links': links,
        }

    def get_news(self, nums: int, show: bool, proxies=None) -> dict:
        """
        :param int nums: number of news to retrieve
        :param bool show: print searched results
        :param dict proxies: optional ``requests`` proxies mapping
        :return dict d: dictionary of curated results
        """
        d = self.__parse_feed(self.BASE_URL + self.__ceid(), proxies=proxies)
        d['entries'] = self.__add_sub_articles(d['entries'])
        d = self.__clean_news(d, nums, show)

        return d

    def get_news_by_topics(
        self, topic: str, nums: int, show: bool, proxies=None
    ) -> typing.Optional[dict]:
        """
        :param str topic: news topic to query
        :param int nums: number of news to retrieve
        :param bool show: print searched results
        :param dict proxies: optional ``requests`` proxies mapping
        :return dict d: dictionary of curated results
        :raises Exception: when the feed has no entries
        """
        headline_topics = (
            'WORLD',
            'NATION',
            'BUSINESS',
            'TECHNOLOGY',
            'ENTERTAINMENT',
            'SCIENCE',
            'SPORTS',
            'HEALTH',
        )
        if topic.upper() in headline_topics:
            path = '/headlines/section/topic/{}'.format(topic.upper())
        else:
            # Anything else is passed through unchanged as a topic id.
            path = '/topics/{}'.format(topic)

        d = self.__parse_feed(self.BASE_URL + path + self.__ceid(), proxies=proxies)

        d['entries'] = self.__add_sub_articles(d['entries'])
        if len(d['entries']) > 0:
            d = self.__clean_news(d, nums, show)
            return d
        else:
            raise Exception('unsupported topic')

    def search(
        self,
        query: str,
        nums: int,
        show: bool,
        helper=True,
        when=None,
        from_=None,
        to_=None,
        proxies=None,
    ) -> dict:
        """
        :param str query: news title to query
        :param int nums: number of news to retrieve
        :param bool show: print searched results
        :param bool helper: URL-encode the query before sending it
        :param str when: results in an article published in last _, example: '30m', '1h', '7d'
        :param str from_: earliest publication date; ignored when ``when`` is given
        :param str to_: latest publication date; ignored when ``when`` is given
        :param dict proxies: optional ``requests`` proxies mapping
        :return dict d: dictionary of curated results
        """
        if when:
            query += ' when:' + when

        if from_ and not when:
            from_ = self.__from_to_helper(validate=from_)
            query += ' after:' + from_

        if to_ and not when:
            to_ = self.__from_to_helper(validate=to_)
            query += ' before:' + to_

        if helper:
            query = self.__search_helper(query)

        # The edition parameters follow "?q=...", so they must start with "&".
        search_ceid = self.__ceid().replace('?', '&')

        d = self.__parse_feed(
            self.BASE_URL + '/search?q={}'.format(query) + search_ceid, proxies=proxies
        )
        d['entries'] = self.__add_sub_articles(d['entries'])
        d = self.__clean_news(d, nums, show)

        return d

In [None]:
## Sample: one headline each from the default feed, a topic feed and a keyword
## search. Every call performs a live request against Google News.
scraper = Scraper()
news = scraper.get_news(nums=1, show=True)
news_topics = scraper.get_news_by_topics(topic='science', nums=1, show=True)
news_searched = scraper.search(query='5G', nums=1, show=True)

## Bulk Scraping Google News<a id='bulk_scraping'></a>

In [None]:
import datetime
import re

import pandas as pd
import tqdm
from deep_translator import GoogleTranslator
from langdetect import detect

In [None]:
def clean_detect_translate(
    text: str, source_language: str = None, trg_language: str = None
) -> typing.Optional[str]:
    """Strip punctuation from ``text`` and translate it into ``trg_language``.

    :param str text: raw text, e.g. a news title
    :param str source_language: translator language code of ``text``; when
        omitted the language is detected with ``langdetect``
    :param str trg_language: translator language code to translate into
    :return: the cleaned text (already in the target language, or empty), the
        translated text, or ``None`` when the detected language is not one of
        the supported ones
    """
    text = re.sub(r'[^\w\s]', '', text).strip()

    # Nothing left to detect or translate.
    if not text:
        return text

    if source_language is None:
        # langdetect code -> translator code, for the supported languages.
        lang_dict = {
            'en': 'en',
            'zh-cn': 'zh-CN',
            'zh-tw': 'zh-TW',
            'de': 'de',
            'fr': 'fr',
            'ko': 'ko',
            'ja': 'ja',
            'id': 'id',
        }
        source_language = lang_dict.get(detect(text))

        if source_language is None:
            return None

    if source_language == trg_language:
        return text

    # A single translation request per text (it used to be sent twice).
    translator = GoogleTranslator(source=source_language, target=trg_language)
    return translator.translate(text)


def detect_cjk(text: str) -> bool:
    """Return ``True`` when ``text`` contains no Chinese, Korean or Japanese characters.

    Despite the name, ``False`` means such characters WERE found; callers keep
    a text only when this returns ``True``.

    :param str text: text to inspect (converted with ``str()`` first)
    """
    text = str(text)
    # The text must be searched as-is: stripping non-ASCII characters first
    # (as was done before) makes every check below pass trivially.
    cjk_patterns = (
        '[\u4e00-\u9FFF]',  # CJK unified ideographs
        '[\uac00-\ud7a3]',  # Hangul syllables
        '[\u3040-\u30ff]',  # Hiragana and Katakana
    )
    return not any(re.search(pattern, text) for pattern in cjk_patterns)


def preprocessing_news(country_list: list) -> pd.DataFrame:
    """Scrape, clean and translate the 'world' headlines of several countries.

    :param list country_list: two-letter country codes
    :return pd.DataFrame: one column per country holding its English titles;
        shorter columns are padded with NaN, and a country whose news could
        not be fetched yields an all-NaN column
    """
    # Feed language requested from Google News for every supported country.
    country_lang_dict = {
        'MY': 'en',
        'US': 'en',
        'GB': 'en',
        'IN': 'en',
        'CN': 'CN',
        'DE': 'de',
        'FR': 'fr',
        'TW': 'TW',
        'HK': 'TW',
        'AU': 'en',
        'KR': 'ko',
        'JP': 'ja',
        'CA': 'en',
        'SG': 'en',
        'ID': 'en',
        'NZ': 'en',
        'IE': 'en',
        'IL': 'en',
        'PK': 'en',
        'ZA': 'en',
        'CH': 'de',
        'IT': 'it',
    }
    # Translator source codes that differ from the feed language code.
    translator_lang_overrides = {'HK': 'zh-TW', 'TW': 'zh-TW', 'CN': 'zh-CN'}

    news_dict = dict()

    for c in tqdm.tqdm(country_list):
        # Start empty so that a failure never reuses another country's news.
        extracted_news = list()
        news_dict[c] = extracted_news

        language_use = country_lang_dict.get(c)
        if language_use is None:
            print('Unable to get news for {}'.format(c))
            continue

        scraper = Scraper(language=language_use, country=c)
        try:
            news = scraper.get_news_by_topics(topic='world', nums=50, show=False)
        except Exception:
            print('Unable to get news for {}'.format(c))
            continue

        lang = translator_lang_overrides.get(c, language_use)

        for title in news.get('titles'):
            cleaned_text = clean_detect_translate(
                title, source_language=lang, trg_language='en'
            )
            # Skip titles that could not be translated, or that still contain
            # CJK characters after translation.
            if cleaned_text is not None and detect_cjk(text=cleaned_text):
                extracted_news.append(cleaned_text)

    # Build all columns first and concatenate once.
    frames = [pd.DataFrame({c: news_dict.get(c)}) for c in country_list]
    if not frames:
        return pd.DataFrame()

    return pd.concat(frames, axis=1)

In [None]:
# Country editions to scrape (two-letter country codes passed to the scraper).
COUNTRIES = [
    'MY',
    'US',
    'GB',
    'IN',
    'CN',
    'DE',
    'FR',
    'TW',
    'HK',
    'AU',
    'KR',
    'JP',
    'CA',
    'SG',
    'ID',
    'NZ',
    'IE',
    'IL',
    'PK',
    'ZA',
    'CH',
    'IT',
]

# One column of translated titles per country; this performs live requests
# for every country in the list.
news_data = preprocessing_news(country_list=COUNTRIES)
news_data

## News meets Network<a id='news_network'></a>

In [None]:
from string import punctuation

import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import spacy
from sklearn import feature_extraction, metrics

### Metrics

In [None]:
## Levenshtein Distance
def levenshtein_dis(seq1: str, seq2: str) -> int:
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-item insertions, deletions
    and substitutions needed to turn ``seq1`` into ``seq2``.

    :param str seq1: first sequence
    :param str seq2: second sequence
    :return int: edit distance (``0`` for equal sequences)
    """
    # Only the previous row of the dynamic-programming table is needed.
    previous_row = list(range(len(seq2) + 1))

    for x, item1 in enumerate(seq1, start=1):
        current_row = [x]
        for y, item2 in enumerate(seq2, start=1):
            substitution_cost = 0 if item1 == item2 else 1
            current_row.append(
                min(
                    previous_row[y] + 1,  # deletion
                    current_row[y - 1] + 1,  # insertion
                    previous_row[y - 1] + substitution_cost,
                )
            )
        previous_row = current_row

    return previous_row[-1]


## Cosine Similarity and TFIDF
def cosimilarity_tfidf(text1, text2) -> float:
    """Return the cosine similarity of the TF-IDF vectors of two texts.

    The vectorizer is fitted on just these two texts.

    :param str text1: first text
    :param str text2: second text
    :return float: cosine similarity; ``0.0`` when the texts yield no
        vocabulary at all (e.g. only punctuation)
    """
    vectorizer = feature_extraction.text.TfidfVectorizer()
    try:
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
    except ValueError:
        # The vectorizer raises when it cannot build a vocabulary; treat such
        # texts as sharing nothing.
        return 0.0

    similarity = metrics.pairwise.cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return similarity.flatten()[0]

### News Similarity from Single Country as Sources

In [None]:
%%time


def single_news_network(
    country: str, search_keywords: str, nums: int, when: str, formula: int
) -> None:
    """Search one country's news and draw a network linking related titles.

    Every pair of titles is scored with the chosen metric; pairs of different
    titles with a score above zero are connected by an edge.

    :param str country: two-letter country code
    :param str search_keywords: search query
    :param int nums: number of news to retrieve
    :param str when: look-back window, example: '12h', '7d'
    :param int formula: ``1`` for TF-IDF cosine similarity, ``2`` for
        Levenshtein distance
    :raises ValueError: when ``formula`` is neither 1 nor 2
    """
    metric_by_formula = {1: cosimilarity_tfidf, 2: levenshtein_dis}
    if formula not in metric_by_formula:
        raise ValueError(
            'formula must be 1 (cosine TF-IDF) or 2 (Levenshtein), got {!r}'.format(
                formula
            )
        )
    metric = metric_by_formula[formula]

    scraper = Scraper(country=country)
    searched_news = scraper.search(
        query=search_keywords, nums=nums, show=False, when=when
    )
    titles_list = searched_news.get('titles')

    if not titles_list:
        print('No news found for: {}'.format(search_keywords))
        return

    # Square title-by-title score table: column t holds the scores of t
    # against every title.
    sources_distances_df = pd.DataFrame(index=titles_list)
    for t in titles_list:
        sources_distances_df[t] = [metric(t, other) for other in titles_list]

    stacked_df = sources_distances_df.stack().reset_index()
    stacked_df.columns = ['Source_1', 'Source_2', 'Distances']

    filtered_stacked_df = stacked_df.loc[
        (stacked_df['Distances'] > 0)
        & (stacked_df['Source_1'] != stacked_df['Source_2'])
    ]

    G = nx.from_pandas_edgelist(
        filtered_stacked_df, source='Source_1', target='Source_2'
    )

    plt.figure(figsize=(100, 50), dpi=200)
    plt.title(
        'News Similarity under the same keywords: {}'.format(search_keywords),
        fontsize=75,
    )
    nx.draw(
        G,
        with_labels=True,
        node_color='orange',
        node_size=400,
        edge_color='grey',
        style='dashed',
        linewidths=1,
        font_size=30,
    )
    plt.show()


single_news_network(
    country='MY', search_keywords='blockchain', nums=20, when='12h', formula=2
)  # formula: 1 = TF-IDF cosine similarity, 2 = Levenshtein distance

### News Similarity Multiple Country as Sources

In [None]:
%%time


def multi_news_networks(
    country: list, search_keywords: str, nums: int, when: str
) -> None:
    """Draw a network linking countries to the news titles found in each.

    The same search runs for every country; each country node is connected
    to its titles, so a title returned for several countries links them.

    :param list country: two-letter country codes
    :param str search_keywords: search query
    :param int nums: number of news to retrieve per country
    :param str when: look-back window, example: '12h', '7d'
    """
    country = list(country)

    titles_list_of_list = [
        Scraper(country=c)
        .search(query=search_keywords, nums=nums, show=False, when=when)
        .get('titles')
        for c in country
    ]

    # Long-form (country, title) edge list. Building it directly avoids the
    # None padding a wide table gets when countries return different numbers
    # of titles; those None values would otherwise end up as graph nodes.
    titles_df = pd.DataFrame(
        [
            (c, title)
            for c, titles in zip(country, titles_list_of_list)
            for title in titles
        ],
        columns=['country', 'title'],
    )

    # Unique titles, in first-seen order.
    titles_list = list(
        dict.fromkeys(t for sublist in titles_list_of_list for t in sublist)
    )

    # Countries carry their own code as node type, titles share type 't'.
    node_characteristic = pd.DataFrame(
        {'ID': country + titles_list, 'type': country + ['t'] * len(titles_list)}
    )

    plt.figure(figsize=(50, 50), dpi=150)
    plt.title('Multicountry News Similarity: The significant', fontsize=50)

    G = nx.from_pandas_edgelist(
        titles_df, source='country', target='title', create_using=nx.Graph()
    )

    # Align the node attributes with the graph's node order before drawing.
    node_characteristic = node_characteristic.set_index('ID')
    node_characteristic = node_characteristic.reindex(G.nodes())
    node_characteristic['type'] = pd.Categorical(node_characteristic['type'])

    cmap = mpl.colors.ListedColormap(
        ['yellow', 'C0', 'green', 'red', 'darkorange', 'thistle']
    )

    node_sizes = [4000 if entry != 't' else 300 for entry in node_characteristic.type]

    nx.draw(
        G,
        with_labels=True,
        node_size=node_sizes,
        node_color=node_characteristic['type'].cat.codes,
        cmap=cmap,
        edge_color='grey',
        style='dashed',
        linewidths=1,
        font_size=20,
    )
    plt.show()


# Compare 'blockchain' coverage of the last 12 hours across five country
# editions; performs one live search per country.
multi_news_networks(
    country=['MY', 'US', 'GB', 'SG', 'IN'],
    search_keywords='blockchain',
    nums=20,
    when='12h',
)

## Getting Tweets<a id='tweets'></a>

In [None]:
import datetime
import json
import os
import time

import pandas as pd
import tweepy

In [None]:
# Twitter API credentials are read from the environment so that secrets never
# have to be typed into (and saved with) the notebook. Set the four variables
# below before starting the kernel; each falls back to an empty string.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
# NOTE(review): `wait_on_rate_limit_notify` appears to exist only in Tweepy 3.x;
# confirm the installed Tweepy version before upgrading.
api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

In [None]:
def query_timeline(username: str, count: int, to_date) -> tuple:
    """Fetch a user's timeline, paging backwards until ``to_date`` is reached.

    Uses the module-level ``api`` client. Pages of ``count`` tweets are
    requested until the oldest tweet of a page is not newer than ``to_date``
    or a page comes back empty. Whole pages are kept, so the result can
    contain tweets older than ``to_date``.

    :param str username: account whose timeline is queried
    :param int count: number of tweets per request
    :param to_date: datetime to page back to
    :return: ``(tweets, tweets_df)``: the raw status objects and a DataFrame
        with the columns ``created_at``, ``tweet_id`` and ``text``
    """
    tweets = list()

    result = api.user_timeline(id=username, count=count, tweet_mode='extended')
    tweets.extend(result)

    try:
        # `result and ...` stops cleanly on an empty page.
        while result and result[-1].created_at > to_date:
            # time.sleep(2)
            print('Last tweet @', result[-1].created_at, 'querying more')
            result = api.user_timeline(
                id=username,
                count=count,
                max_id=result[-1].id - 1,
                tweet_mode='extended',
            )
            tweets.extend(result)
    except Exception as exc:
        # Keep what was collected so far, but say why paging stopped instead
        # of hiding the error.
        print('Stopped querying the timeline:', repr(exc))

    rows = [
        [t._json['created_at'], t._json['id'], t._json['full_text']] for t in tweets
    ]
    tweets_df = pd.DataFrame(rows, columns=['created_at', 'tweet_id', 'text'])
    return tweets, tweets_df

In [None]:
# Page back to midnight (local time) at the start of yesterday.
yesterday = datetime.datetime.now() - datetime.timedelta(1)
to_date = datetime.datetime(yesterday.year, yesterday.month, yesterday.day)
USERNAME = 'TheEconomist'
count = 10  # tweets requested per API call

tweets, tweets_df = query_timeline(USERNAME, count, to_date)
tweets_df

## Helper Functions

In [None]:
def lemmatize_process(word_token: list) -> list:
    """Lemmatize a list of tokens using their part-of-speech tags.

    :param list word_token: tokens of one sentence
    :return list: the lemmatized tokens, in the same order
    """

    def wordnet_pos(tag: str) -> str:
        # Nouns and verbs get their own WordNet class; everything else is
        # lemmatized as an adjective.
        if tag.startswith('NN'):
            return 'n'
        if tag.startswith('VB'):
            return 'v'
        return 'a'

    lemmatizer = WordNetLemmatizer()
    return [
        lemmatizer.lemmatize(word, wordnet_pos(tag)) for word, tag in pos_tag(word_token)
    ]

In [None]:
def clean_the_text(text: str) -> str:
    """Aggressively normalise a tweet for keyword extraction.

    Removes URLs, lower-cases, removes @mentions and HTML tags, replaces every
    non-alphanumeric character with a space, tokenizes and drops English stop
    words plus the retweet marker.

    :param str text: raw tweet text
    :return str: the remaining tokens joined by single spaces
    """
    text = re.sub(r'http\S+', '', text)
    text = str(text).lower()
    # \w also covers "_", so handles such as @the_economist go away entirely.
    text = re.sub(r'@\w+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = text.strip()

    tokenizer = TweetTokenizer()
    word_token = tokenizer.tokenize(text)

    new_words = ['rt', 'RT']
    stop_words = set(corpus.stopwords.words('english') + new_words)
    word_token = [w for w in word_token if w not in stop_words]

    return ' '.join(word_token)

In [None]:
def clean_the_text2(text: str) -> str:
    """Lightly clean a tweet while keeping its casing and word order.

    Removes @mentions, URLs and the standalone retweet marker ``RT``, and
    replaces every non-alphanumeric character with a space (runs of spaces
    are not collapsed).

    :param str text: raw tweet text
    :return str: cleaned text without leading/trailing whitespace
    """
    # \w also covers "_", so handles such as @The_Economist go away entirely.
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'http\S+', r'', text)
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    # Whole word only, so that e.g. "SMART" or "ART" stay intact.
    text = re.sub(r'\bRT\b', '', text)
    return text.strip()

## Trending Topic Coverage<a id='trending_topic'></a>

Keywords are extracted from each tweet [^1],[^2],[^3],[^4],[^5], and Google Trends [^6] is used as the benchmark (everyone Googles) to evaluate how hot (relevant) each topic is. A Matcher [^7] with predefined word patterns is used to filter the extracted keywords. To penalise lengthy tweets, the score has two components, a keyword score and a length score, weighted 90|10 so that the maximum is one hundred percent. The optimal tweet length in this project is defined as 85 characters (about the midpoint of the 71–100 character range). The keyword score is the average Google Trends interest of the keywords in a single tweet.

References: <br>
Keyword extraction -
[1](https://towardsdatascience.com/keyword-extraction-a-benchmark-of-7-algorithms-in-python-8a905326d93f)
[2](https://www.kaggle.com/akhatova/extract-keywords)<br>
Google trends -
[1](https://github.com/GeneralMills/pytrends)
[2](https://github.com/pat310/google-trends-api/wiki/Google-Trends-Categories)<br>
Optimal length -
[1](https://buffer.com/library/optimal-length-social-media/#:~:text=The%20optimal%20length%20of%20a%20tweet%20%E2%80%94%2071%20to%20100%20characters)
[2](https://influencermarketinghub.com/best-length-for-social-media-posts/)<br>

As of 2022-03-15

In [None]:
import contextlib
import csv
import math
import random
import re
import time
import typing

import gensim
import matplotlib.pyplot as plt
import numpy as np
import requests
import spacy
import yake
from keybert import KeyBERT
from nltk import corpus
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import TweetTokenizer
from pytrends.request import TrendReq as UTrendReq
from rake_nltk import Rake
from scipy import stats
from spacy.matcher import Matcher
from tqdm import tqdm

# Shared model instances, loaded once here and used by the extractor and
# matcher functions defined in later cells.
nlp = spacy.load("en_core_web_sm")
bert = KeyBERT()

### Define Extractor

In [None]:
def spacy_extractor(text: str) -> typing.Optional[list]:
    """Return the entities spaCy finds in ``text`` as plain strings."""
    return [str(entity) for entity in nlp(text).ents]


def rake_extractor(text: str) -> typing.Optional[list]:
    """Return up to five top-ranked RAKE phrases (at most two words each)."""
    rake = Rake(max_length=2, include_repeated_phrases=False)
    rake.extract_keywords_from_text(text)
    ranked_phrases = rake.get_ranked_phrases()
    return ranked_phrases[:5]


def gensim_extractor(text: str) -> typing.Optional[list]:
    """Return the keywords gensim extracts from ``text`` as a list.

    NOTE(review): relies on ``gensim.summarization``, which newer gensim
    releases reportedly no longer ship. Confirm the installed version.
    """
    return gensim.summarization.keywords(
        text, split=True, lemmatize=True, deacc=True
    )


def yake_extractor(text: str) -> typing.Optional[list]:
    """Return up to five YAKE keywords (n-grams of at most two words)."""
    extractor = yake.KeywordExtractor(lan='en', n=2, dedupLim=0.9)
    scored_keywords = extractor.extract_keywords(text)
    # The first item of each result is the keyword.
    keywords = [scored[0] for scored in scored_keywords]
    return keywords[:5]


def bert_extractor(text: str) -> typing.Optional[list]:
    """Return up to five KeyBERT keyphrases of one or two words."""
    scored_keywords = bert.extract_keywords(
        text, keyphrase_ngram_range=(1, 2), use_mmr=True, diversity=0.7
    )
    # The first item of each result is the keyphrase.
    keyphrases = [scored[0] for scored in scored_keywords]
    return keyphrases[:5]


def extract_aggregator(raw_text: str, preprocessed_text: str) -> typing.Optional[list]:
    """Run the RAKE, YAKE and KeyBERT extractors and pool their keywords.

    RAKE receives ``raw_text``; YAKE and KeyBERT receive
    ``preprocessed_text``. Keywords are returned in that extractor order and
    are not de-duplicated.
    """
    extractor_inputs = (
        (rake_extractor, raw_text),
        (yake_extractor, preprocessed_text),
        (bert_extractor, preprocessed_text),
    )

    all_keywords = []
    for extract, text in extractor_inputs:
        all_keywords.extend(extract(text))
    return all_keywords

### Pattern Matching for Filtering

In [None]:
def match(keyword: str) -> bool:
    """Tell whether ``keyword`` contains one of the accepted two-token POS pairs."""
    pos_pairs = [
        ('PROPN', 'VERB'),
        ('PROPN', 'NOUN'),
        ('PROPN', 'PROPN'),
        ('NOUN', 'NOUN'),
        ('NOUN', 'VERB'),
        ('ADJ', 'NOUN'),
        ('ADJ', 'PROPN'),
    ]
    # One two-token pattern per accepted (first POS, second POS) pair.
    patterns = [[{'POS': first}, {'POS': second}] for first, second in pos_pairs]

    pos_matcher = Matcher(nlp.vocab)
    pos_matcher.add('pos-matcher', patterns)

    found = pos_matcher(nlp(keyword), as_spans=True)
    return len(found) > 0

In [None]:
def pattern_matcher(keywords_list: list) -> typing.Optional[list]:
    """Keep the keywords accepted by ``match``, without duplicates.

    The first occurrence of each keyword decides its position in the result.
    """
    accepted = (kw for kw in keywords_list if match(kw))
    # dict keys preserve insertion order, which drops later duplicates.
    return list(dict.fromkeys(accepted))

### Interest Trends

In [None]:
# Supress printing from a function with @supress_stdout decorator
def supress_stdout(func):
    """Decorator that discards everything ``func`` prints to stdout.

    The wrapped function's return value and exceptions pass through
    unchanged, and its name and docstring are preserved on the wrapper.
    """
    import functools

    @functools.wraps(func)
    def wrapper(*a, **ka):
        with open(os.devnull, 'w') as devnull:
            with contextlib.redirect_stdout(devnull):
                return func(*a, **ka)

    return wrapper

In [None]:
# https://stackoverflow.com/questions/50571317/pytrends-the-request-failed-google-returned-a-response-with-code-429
# https://github.com/GeneralMills/pytrends/issues/369

# Browser-like request headers sent with every Google Trends request made
# through the TrendReq subclass below.
headers = {
    'authority': 'trends.google.com',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-language': 'en-US,en;q=0.9',
    # Requests sorts cookies= alphabetically
    # A commented-out 'cookie' header copied from a browser session used to sit
    # here. It was removed: session cookies are credentials and must not be
    # stored in the notebook. Supply one at runtime if it is ever needed.
    'referer': 'https://trends.google.com/trends/?geo=MY',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="100", "Google Chrome";v="100"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'document',
    'sec-fetch-mode': 'navigate',
    'sec-fetch-site': 'same-origin',
    'sec-fetch-user': '?1',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36',
    'x-client-data': 'CLC1yQEIiLbJAQijtskBCMG2yQEIqZ3KAQiM0soBCISDywEIlqHLAQjr8ssBCJ75ywEI54TMAQiljswBCJqPzAEImaHMAQjPoswBCIGkzAEIsqTMAQ==',
}


# HTTP method forced on every request by the TrendReq subclass below.
GET_METHOD = 'GET'


class TrendReq(UTrendReq):
    """pytrends client that adds the module-level ``headers`` to its requests."""

    def _get_data(self, url, method=GET_METHOD, trim_chars=0, **kwargs):
        # The `method` argument is accepted for signature compatibility but is
        # not forwarded: the parent is always called with GET_METHOD, together
        # with the browser-like headers (see the links above this cell).
        return super()._get_data(
            url, method=GET_METHOD, trim_chars=trim_chars, headers=headers, **kwargs
        )

In [None]:
def interest_retriever(keywords_list: list, tweet_date) -> float:
    '''
    Average Google Trends interest of a tweet's keywords on a single day.
    ...

    The keywords are queried in batches of at most five. The first ten
    characters of `tweet_date` (YYYY-MM-DD) are used as both the start and the
    end of the queried time frame. `keywords_list` is not modified.

    Attributes:
    -----------
    keywords_list: list of string
        A list containing keywords as string from a single tweet
    tweet_date: str
        Date string starting with YYYY-MM-DD

    Return:
    -------
    interest_score: float
        Mean interest over all keywords that returned data, rounded to four
        decimals; 0 when no batch returned any data

    '''
    from pytrends.exceptions import ResponseError

    pytrend = TrendReq()

    day = tweet_date[:10]
    date = '{} {}'.format(day, day)
    max_n = 5  # keywords sent per request
    results_interest = list()

    # Walk the list batch by batch. A failed or empty batch is skipped, so the
    # following batches are still queried.
    for start in range(0, len(keywords_list), max_n):
        batch = keywords_list[start : start + max_n]
        try:
            pytrend.build_payload(kw_list=batch, cat=16, timeframe=date)
            # One request per batch; the frame is reused below.
            interest = pytrend.interest_over_time()
        except (ResponseError, requests.exceptions.Timeout):
            continue

        if interest.shape[0] != 0:
            # First row, without the trailing column.
            results_interest.extend(interest.iloc[0, :-1].values.tolist())

    if len(results_interest) != 0:
        interest_score = sum(results_interest) / len(results_interest)
        return round(interest_score, 4)
    else:
        return 0

### Pipeline

In [None]:
def _tweet_length_score(text_length: int, optimal_length: int = 85) -> float:
    """Score a tweet's length in [0, 1]; 1 at ``optimal_length``, 0 beyond twice that."""
    if text_length > 2 * optimal_length:
        return 0.0
    if text_length <= optimal_length:
        return text_length / optimal_length
    return abs(text_length / optimal_length - 2)


def trending_topic_coverage(
    df, text_col: str, date_col: str, keyword_weight: float = 0.9
) -> dict:
    """Score every tweet by the trend interest of its keywords and by its length.

    The score is ``keyword_weight * keyword_score`` plus
    ``(1 - keyword_weight) * 100 * length_score``, i.e. a weighted sum with a
    maximum of 100. The keyword score is looked up for the day three days
    before the tweet; a tweet without qualifying keywords gets a keyword
    score of 0.

    :param df: DataFrame with one tweet per row
    :param str text_col: column holding the tweet text
    :param str date_col: column holding the tweet creation date
    :param float keyword_weight: share of the keyword score in the total; the
        length score receives the remainder
    :return dict: tweet text -> score (identical texts share one entry)
    """
    # Plain lists, so rows are addressed by position whatever the index is.
    texts = df[text_col].tolist()
    date_list = pd.to_datetime(df[date_col]).tolist()
    s_docs = [clean_the_text2(t) for t in texts]
    f_docs = [clean_the_text(t) for t in texts]

    length_weight = 1 - keyword_weight
    score_dict = dict()

    for i in tqdm(range(len(texts))):
        raw_text, preprocessed_text = s_docs[i], f_docs[i]
        all_keywords = extract_aggregator(raw_text, preprocessed_text)
        qualified_keywords_list = pattern_matcher(all_keywords)
        tweet_date = date_list[i] - datetime.timedelta(3)

        # Always computed, so a tweet without keywords can never reuse the
        # score of the previous tweet.
        keyword_score = 0
        if len(qualified_keywords_list) != 0:
            keyword_score = interest_retriever(qualified_keywords_list, str(tweet_date))

        length_score = _tweet_length_score(len(texts[i]))
        score = (keyword_score * keyword_weight) + (length_score * 100 * length_weight)

        print('Text: ', texts[i])
        print('Total Score: ', score)
        print('\n')

        score_dict[texts[i]] = score

    return score_dict

In [None]:
# Score every scraped tweet; returns a dict mapping tweet text to its score.
ttc_results = trending_topic_coverage(
    df=tweets_df, text_col='text', date_col='created_at'
)

### Plot Result

In [None]:
# Turn the {text: score} dict into a two-column frame.
# NOTE: this rebinds `ttc_results` from a dict to a DataFrame, so the cell is
# not meant to be run twice without re-running the scoring cell first.
ttc_results = pd.DataFrame(ttc_results, index=['score']).T.reset_index()
ttc_results.columns = ['text', 'score']

In [None]:
def ecdf(data: pd.Series) -> typing.Tuple[np.ndarray, np.ndarray]:
    """
    Compute the ECDF for a one-dimensional array of measurements.

    :param data: one-dimensional measurements (Series, list or array)
    :return: tuple ``(x, y)`` where ``x`` is ``data`` sorted ascending and
        ``y[i]`` is ``(i + 1) / n``, the fraction of points at or before ``x[i]``
    """
    x = np.sort(data)
    n = len(x)
    y = np.arange(1, n + 1) / n
    return x, y

In [None]:
# Left panel: histogram of the scores with median / 75th-quantile markers.
# Right panel: the same scores as an ECDF.
fig, ax = plt.subplots(1, 2, figsize=(8, 3), dpi=200)
hist_ax, ecdf_ax = ax
ttc_scores = ttc_results['score']

hist_ax.axvline(x=ttc_scores.median(), label='Median', color='red')
hist_ax.axvline(x=ttc_scores.quantile(0.75), label='75th quantile', color='orange')
ttc_scores.plot(kind='hist', ax=hist_ax, title='Score Distributions')
hist_ax.legend()
hist_ax.set_xlabel('Score')

# In an ECDF the x-axis spans the observed values; for any x, the matching y
# is the proportion of data points less than or equal to that x.
ecdf_x, ecdf_y = ecdf(ttc_scores)
ecdf_ax.plot(ecdf_x, ecdf_y, marker='.', linestyle='none')
ecdf_ax.set_title('With ECDF')
ecdf_ax.set_xlabel('Score')

fig.tight_layout()
plt.show()

## Topic Modelling for Topic Connectivity And Continuity<a id='topic_modelling'></a>

Topic modelling is used to discover possible abstract topics among the tweets. A pretrained SentenceTransformer [^8] extracts embeddings that capture the context of each tweet, Uniform Manifold Approximation and Projection (UMAP) [^9] reduces their dimensionality, HDBSCAN [^10] clusters the reduced embeddings, and c-TF-IDF is then used to create the topics.

References: <br>
[1](https://towardsdatascience.com/topic-modeling-with-bert-779f7db187e6)
[2](https://towardsdatascience.com/dynamic-topic-modeling-with-bertopic-e5857e29f872)<br>

As of 2022-03-15

In [None]:
import hdbscan
import numpy as np
import umap
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### C-TF-IDF

In [None]:
def c_tf_idf(documents: np.ndarray, m: int, ngram_range=(1, 1)):
    """
    Compute class-based TF-IDF over ``documents``.

    :param documents: iterable of strings, one (joined) document per class/topic
    :param int m: numerator of the idf term ``log(m / term_count)``
    :param tuple ngram_range: n-gram range forwarded to ``CountVectorizer``
    :return: tuple ``(tf_idf, count)``: ``tf_idf`` has one row per vocabulary
        term and one column per document; ``count`` is the fitted
        ``CountVectorizer``
    """
    count = CountVectorizer(ngram_range=ngram_range, stop_words='english').fit(
        documents
    )
    t = count.transform(documents).toarray()
    w = t.sum(axis=1)
    # A document with no counted tokens (e.g. only stop words) has w == 0;
    # give it an all-zero tf column instead of dividing 0 by 0 into NaN.
    tf = np.divide(t.T, w, out=np.zeros(t.T.shape, dtype=float), where=w != 0)
    sum_t = t.sum(axis=0)
    idf = np.log(np.divide(m, sum_t)).reshape(-1, 1)
    tf_idf = np.multiply(tf, idf)

    return tf_idf, count

### Top Words for Topic

In [None]:
def extract_top_n_words_per_topic(tf_idf, count, docs_per_topic_topic, n=20) -> dict:
    """
    Pick the ``n`` highest-scoring words for every topic.

    :param tf_idf: 2-D array with one row per vocabulary term and one column
        per topic
    :param count: fitted vectorizer exposing ``get_feature_names_out()`` (or
        the older ``get_feature_names()``)
    :param docs_per_topic_topic: iterable of topic labels, in the same order as
        the columns of ``tf_idf``
    :param int n: number of words to keep per topic
    :return: dict mapping each topic label to a list of ``(word, score)``
        tuples ordered from highest to lowest score
    """
    # get_feature_names() was removed from newer scikit-learn releases; use the
    # replacement when it exists and fall back for older installations.
    if hasattr(count, 'get_feature_names_out'):
        words = count.get_feature_names_out()
    else:
        words = count.get_feature_names()
    labels = list(docs_per_topic_topic)
    tf_idf_transposed = tf_idf.T
    # argsort is ascending, so the last n columns are the n largest scores.
    indices = tf_idf_transposed.argsort()[:, -n:]
    top_n_words = {
        label: [(words[j], tf_idf_transposed[i][j]) for j in indices[i]][::-1]
        for i, label in enumerate(labels)
    }
    return top_n_words

### Pipeline

In [None]:
def topic_modelling(df, text_col: str, n_merges: int = 10) -> pd.DataFrame:
    """
    Cluster the texts into topics and report the size of each topic.

    Pipeline: clean text -> SentenceTransformer embeddings -> UMAP reduction
    -> HDBSCAN clustering -> repeated merging of the smallest topic into its
    most similar topic (by cosine similarity of the c-TF-IDF vectors).

    :param df: DataFrame holding the texts
    :param str text_col: name of the column containing the raw text
    :param int n_merges: maximum number of smallest-topic merges to perform
        (defaults to the previously hard-coded 10)
    :return: DataFrame with columns ``topic`` and ``size``, sorted by ``size``
        descending
    """
    raw_text = df[text_col].values
    # encode() is given a plain list so it does not depend on the Series index.
    docs = df[text_col].apply(lambda x: clean_the_text(x)).tolist()
    model = SentenceTransformer("distilbert-base-nli-mean-tokens")
    embeddings = model.encode(docs, show_progress_bar=True)
    umap_embeddings = umap.UMAP(
        n_neighbors=15, n_components=5, metric='cosine', random_state=7, verbose=True
    ).fit_transform(embeddings)
    print('\n Perform Clustering..')
    cluster = hdbscan.HDBSCAN(
        min_cluster_size=2, metric='euclidean', cluster_selection_method='eom'
    ).fit(umap_embeddings)
    docs_df = pd.DataFrame(raw_text, columns=['doc'])
    docs_df['topic'] = cluster.labels_
    docs_df['doc_id'] = range(len(docs_df))
    docs_per_topic = docs_df.groupby(['topic'], as_index=False).agg({'doc': ' '.join})
    tf_idf, count = c_tf_idf(docs_per_topic.doc.values, m=len(raw_text))

    for _ in range(n_merges):
        # Column k of tf_idf belongs to row k of docs_per_topic.
        topic_labels = docs_per_topic['topic'].to_numpy()
        if len(topic_labels) < 2:
            # Nothing left to merge into.
            break

        similarities = cosine_similarity(tf_idf.T)
        np.fill_diagonal(similarities, 0)

        topic_sizes = (
            docs_df.groupby(['topic'])
            .count()
            .sort_values('doc', ascending=False)
            .reset_index()
        )
        topic_to_merge = topic_sizes.iloc[-1].topic
        # Look the smallest topic up by position. The previous `topic + 1` /
        # `argmax - 1` arithmetic was only correct when the labels were exactly
        # -1, 0, 1, ... (i.e. a -1 label was present).
        merge_row = int(np.flatnonzero(topic_labels == topic_to_merge)[0])
        topic_to_merge_into = topic_labels[np.argmax(similarities[merge_row])]

        docs_df.loc[docs_df.topic == topic_to_merge, 'topic'] = topic_to_merge_into
        # Relabel the remaining topics contiguously, starting at -1.
        # NOTE(review): the lowest remaining label always becomes -1, whether or
        # not it was the original -1 cluster label -- confirm this is intended.
        old_topics = docs_df.sort_values(by='topic').topic.unique()
        map_topics = {
            old_topic: index - 1 for index, old_topic in enumerate(old_topics)
        }
        docs_df.topic = docs_df.topic.map(map_topics)
        docs_per_topic = docs_df.groupby(['topic'], as_index=False).agg(
            {'doc': ' '.join}
        )

        tf_idf, count = c_tf_idf(docs_per_topic.doc.values, len(raw_text))

    topic_sizes = (
        docs_df.groupby(['topic'])
        .doc.count()
        .reset_index()
        .rename({'doc': 'size'}, axis='columns')
        .sort_values('size', ascending=False)
    )

    score = round(sum(topic_sizes['size'].values) / topic_sizes.shape[0], 4)
    print(
        '\nA total of {} topic(s) found, with an average of {} tweet(s)'.format(
            len(topic_sizes['topic']), score
        )
    )

    return topic_sizes

In [None]:
# Cluster the tweets' 'text' column into topics and keep the per-topic sizes.
topic_sizes = topic_modelling(tweets_df, text_col='text')

### Plot Results

In [None]:
# Keep only the rows whose absolute z-score is below 3 in every column of
# topic_sizes; the right-hand panel shows this trimmed view.
abs_z_scores = np.abs(stats.zscore(topic_sizes))
topic_sizes_of = topic_sizes[(abs_z_scores < 3).all(axis=1)]

fig, ax = plt.subplots(1, 2, figsize=(10, 3), dpi=100)
all_topics_ax, trimmed_ax = ax

# Left: every topic.
all_topics_ax.bar(topic_sizes['topic'], topic_sizes['size'])
all_topics_ax.set_title('Without Normalization')
all_topics_ax.set_xlabel('N Topics')
all_topics_ax.set_ylabel('Sizes of Topics')

# Right: topics surviving the z-score filter, with their median size marked.
trimmed_ax.bar(topic_sizes_of['topic'], topic_sizes_of['size'])
trimmed_ax.axhline(y=topic_sizes_of['size'].median(), label='Median', color='red')
trimmed_ax.set_title('With Normalization')
trimmed_ax.set_xlabel('N Topics')
trimmed_ax.set_ylabel('Sizes of Topics')
trimmed_ax.legend()

fig.suptitle('Topics Sizes Overview', y=1.02)
plt.show()

## Sentiment Analysis<a id='sentiment'></a>

In [None]:
from flair.data import Sentence
from flair.models import TextClassifier
from transformers import pipeline

In [None]:
def get_sentiment(df, text_col: str) -> pd.DataFrame:
    """
    Classify the sentiment of every text with two pretrained models.

    The text is cleaned with ``clean_the_text2`` and then scored by the
    ``cardiffnlp/twitter-roberta-base-sentiment`` transformers pipeline and by
    flair's ``en-sentiment`` classifier.

    :param df: DataFrame holding the texts
    :param str text_col: name of the column containing the raw text
    :return: ``df`` with four extra columns: ``bert_label``, ``bert_score``,
        ``flair_label`` and ``flair_score`` (scores are floats)
    """
    text_data = df[text_col].apply(lambda x: clean_the_text2(x))

    sentiment_pipeline = pipeline(model="cardiffnlp/twitter-roberta-base-sentiment")
    sentiment_dict = {
        'LABEL_0': 'Negative',
        'LABEL_1': 'Neutral',
        'LABEL_2': 'Positive',
    }
    # Build the result frames on df.index so the final axis=1 concat lines rows
    # up even when df does not have a default 0..n-1 index.
    bert_results = pd.DataFrame(
        sentiment_pipeline(list(text_data.values)),
        columns=['label', 'score'],
        index=df.index,
    )
    bert_results['label'] = bert_results['label'].map(sentiment_dict)
    bert_results = bert_results.rename(
        columns={'label': 'bert_label', 'score': 'bert_score'}
    )

    flair_rows = list()
    classifier = TextClassifier.load("en-sentiment")
    for sentence in text_data:
        s = Sentence(sentence)
        classifier.predict(s)
        # Read the prediction from the label object instead of parsing
        # str(s.labels): the parsed score was a string and the parsing relied
        # on the exact repr format.
        if s.labels:
            top_label = s.labels[0]
            flair_rows.append((top_label.value, top_label.score))
        else:
            flair_rows.append((None, float('nan')))

    flair_results = pd.DataFrame(
        flair_rows, columns=['flair_label', 'flair_score'], index=df.index
    )

    sentiments = pd.concat([df, bert_results, flair_results], axis=1)
    return sentiments

In [None]:
# Attach both models' sentiment labels and scores to the tweets.
sentiments = get_sentiment(tweets_df, text_col='text')

### Plot Results

In [None]:
def _plot_sentiment_panels(frame, label_col, score_col, model_name, count_ax, score_ax):
    """Draw one model's label counts (count_ax) and median score per label (score_ax)."""
    frame[label_col].value_counts().plot(
        kind='barh', ax=count_ax, title='Sentiment Distributions'
    )
    count_ax.set_xlabel('Count')
    count_ax.set_ylabel(model_name)

    # Coerce to numbers first: the median cannot be taken over scores that are
    # stored as strings.
    numeric_scores = pd.to_numeric(frame[score_col], errors='coerce')
    median_scores = numeric_scores.groupby(frame[label_col]).median().to_frame(score_col)
    median_scores.plot(
        kind='barh',
        ax=score_ax,
        legend=False,
        xlim=[0, 1],
        title='Median score of sentiment class',
    )
    score_ax.set_xlabel('Score')
    score_ax.set_ylabel('Sentiment')


# One row of panels per model: label counts on the left, median score on the right.
fig, ax = plt.subplots(2, 2, figsize=(10, 5), dpi=200)
_plot_sentiment_panels(
    sentiments, 'bert_label', 'bert_score', 'Twitter-roBERTa-base Model',
    ax[0, 0], ax[0, 1],
)
_plot_sentiment_panels(
    sentiments, 'flair_label', 'flair_score', 'sentiment-curated-distilbert Model',
    ax[1, 0], ax[1, 1],
)

fig.tight_layout()
plt.show()

## WordCloud of Named Entity Recognition<a id='wcentity'></a>

In [None]:
import string

from nltk.corpus import stopwords
from tqdm import tqdm
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline
from wordcloud import WordCloud

In [None]:
def get_frequent_entity(df, text_col: str) -> None:
    """
    Plot a word cloud of the organisation and location entities in the texts.

    Entities are extracted with the ``dslim/bert-base-NER`` token
    classification model; only the ORG and LOC groups are kept.

    :param df: DataFrame holding the texts
    :param str text_col: name of the column containing the raw text
    :return: None; the word cloud is shown with matplotlib
    """
    print('Preparing model...')
    tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
    model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
    # aggregation_strategy='simple' groups adjacent tokens of the same entity
    # and joins their word pieces, so the cloud is built from words rather than
    # '##'-prefixed sub-word fragments.
    ner_model = pipeline(
        'ner', model=model, tokenizer=tokenizer, aggregation_strategy='simple'
    )

    selected_groups = {'ORG', 'LOC'}
    ner_list = list()

    print('Extracting...')
    for line in tqdm(list(df[text_col].values)):
        ner_list.extend(
            entity.get('word')
            for entity in ner_model(line)
            if entity.get('entity_group') in selected_groups
        )

    if not ner_list:
        # WordCloud.generate raises on text without any words.
        print('No ORG/LOC entities found; nothing to plot.')
        return

    print('Concatenating...')
    single_line_ner = ' '.join(ner_list)

    print('Building word cloud...')
    sws = stopwords.words('english') + list(string.punctuation)
    wc = WordCloud(
        stopwords=sws, background_color='white', width=1000, height=1000
    ).generate(single_line_ner)

    print('Plotting...')
    fig, ax = plt.subplots(figsize=(20, 20))
    ax.imshow(wc)
    ax.axis('off')
    fig.tight_layout()
    plt.show()

In [None]:
# Draw the ORG / LOC entity word cloud for the tweets' 'text' column.
get_frequent_entity(tweets_df, text_col='text')

# Dependencies<a id='dependencies'></a>

[^1]: https://spacy.io/usage/linguistic-features#named-entities
[^10]: https://hdbscan.readthedocs.io/en/latest/
[^2]: https://csurfer.github.io/rake-nltk/_build/html/index.html
[^3]: https://radimrehurek.com/gensim_3.8.3/summarization/keywords.html
[^4]: https://github.com/LIAAD/yake
[^5]: https://github.com/MaartenGr/KeyBERT
[^6]: https://trends.google.com/trends/
[^7]: https://spacy.io/api/matcher
[^8]: https://github.com/UKPLab/sentence-transformers
[^9]: https://github.com/lmcinnes/umap

In [None]:
## For Google News Scraper
# !pip install dateparser
# !pip install feedparser==6.0.8
# !pip install requests==2.26.0
# !pip install bs4==0.0.1

In [None]:
## For plotting in network and bulk scraping
# !pip install numpy==1.21.4
# !pip install pandas==1.3.5
# !pip install networkx==2.6.3
# !pip install matplotlib==3.5.1
# !pip install scikit-learn==1.0.1
# !pip install spacy==3.2.1
# !pip install langdetect
# !pip install deep_translator
# !pip install tqdm==4.62.3

In [None]:
## For tweets related
# !pip install tweepy==3.10.0
# !pip install pandas==1.3.4
# !pip install nltk==3.7
# !pip install spacy==3.2.0
# !pip install rake-nltk==1.0.6
# !pip install gensim==3.8.3
# !pip install yake==0.4.8
# !pip install keybert==0.5.0
# !pip install pytrends==4.8.0
# !pip install tqdm==4.62.3
# !pip install numpy==1.19.5
# !pip install scikit-learn==0.23.2
# !pip install sentence_transformers==2.2.0
# !pip install umap-learn==0.5.2
# !pip install hdbscan==0.8.27
# !pip install matplotlib==3.5.1
# !pip install scipy==1.5.4
# !pip install transformers==4.16.2
# !pip install wordcloud==1.8.1