## Set-up

In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from pprint import pprint
import re
from tqdm import tqdm
tqdm.pandas()
import numpy as np

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [2]:
#!pwd
import os

# Google Bucket
# file name checkpoint_0512_sent_split.parquet
# NOTE(review): the comment above names checkpoint_0512_sent_split.parquet, but the
# path built below points at df_cleaned_0514.parquet -- confirm which file is intended.
path_bucket = 'gs://msca-sp23-bucket/nlp_data'
path_bucket_df_cleaned = path_bucket + '/' + 'df_cleaned_0514.parquet'
# Absolute path used as the working directory for the rest of the notebook.
runtime_path = '/home/jupyter/data/nlp_final'

# Switch the process working directory and echo it for confirmation.
os.chdir(runtime_path)
print(os.getcwd())

/home/jupyter/data/nlp_final


In [3]:
import pandas as pd
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 500)

import warnings
warnings.filterwarnings("ignore")

In [4]:
import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

In [5]:
import nltk as nltk
from nltk.corpus import stopwords

import multiprocessing

In [6]:
import spacy
import spacy.cli

# Load the medium English pipeline. Only download it when it is not already
# installed, so re-running the notebook does not fetch the package every time.
try:
    nlp = spacy.load("en_core_web_md")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_md")
    nlp = spacy.load("en_core_web_md")

Collecting en-core-web-md==3.5.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.5.0/en_core_web_md-3.5.0-py3-none-any.whl (42.8 MB)
[2K     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 42.8/42.8 MB 15.4 MB/s eta 0:00:00
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [7]:
# Fetch the NLTK resources used by this notebook.
# Only the 'stopwords' corpus is referenced by the code visible in this section;
# the other resources are presumably needed elsewhere -- TODO confirm.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/anthony/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/anthony/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/anthony/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /home/anthony/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/anthony/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
# read data
# Reads the cleaned parquet file from the bucket path configured earlier.
df_raw = pd.read_parquet(path_bucket_df_cleaned, engine='pyarrow')

# take a sample
# Fixed-size sample with a fixed random_state so the same rows are drawn on every run.
df = df_raw.sample(10000, random_state=42)
print(df.shape)
df.head(1)

(10000, 5)


Unnamed: 0,url,date,title,text_split,text
158935,https://cio.economictimes.indiatimes.com/news/business-analytics/anil-kumar-aays-analytics-on-how-large-enterprises-are-democratising-data-science-ml-in-the-corporate-finance-space/96214109,2022-12-14,"Anil Kumar, Aays Analytics on how large enterprises are democratising data science, ML in the corporate finance space, CIO News, ET CIO","[Accept the updated privacy cookie policy Dear user, ET CIO privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy our cookie policy. We use cookies to ensure the best experience for you on our website.If you choose to ignore this message, we'll assume that you are happy to receive all cookies on ET CIO. AnalyticsNecessaryNewsletter Name...","Accept the updated privacy cookie policy Dear user, ET CIO privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy our cookie policy. We use cookies to ensure the best experience for you on our website.If you choose to ignore this message, we'll assume that you are happy to receive all cookies on ET CIO. AnalyticsNecessaryNewsletter NameP..."


## LDA, on overall topics

Helper Functions

In [10]:
%%time

def sent_to_words(sentences):
    """Lazily tokenize every item of ``sentences``.

    Each item is coerced to ``str`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``. One token list is
    yielded per input item, in input order.
    """
    for raw_sentence in sentences:
        as_text = str(raw_sentence)
        tokens = gensim.utils.simple_preprocess(as_text, deacc=True)
        yield tokens

# Define functions for stopwords, bigrams, trigrams and lemmatization
stop_words = stopwords.words('english')
def remove_stopwords(texts):
    """Tokenize each document in ``texts`` and drop English stop words.

    Every document is coerced to ``str``, tokenized with ``simple_preprocess``
    and filtered against the module-level ``stop_words`` list.

    Args:
        texts: iterable of documents (anything ``str()`` accepts).

    Returns:
        A list with one list of remaining tokens per input document.

    Note:
        A later cell in this notebook defines another ``remove_stopwords`` that
        takes a single row; once that cell has run, it shadows this function.
    """
    # Build the lookup once per call: set membership is O(1), whereas testing
    # every token against the list scans it linearly.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

def make_bigrams(texts):
    """Run every document in ``texts`` through the global ``bigram_mod``.

    ``bigram_mod`` is looked up when the function is called, so it must have
    been created before the first call. Returns one transformed document per
    input document, in input order.
    """
    with_bigrams = []
    for tokens in texts:
        with_bigrams.append(bigram_mod[tokens])
    return with_bigrams

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document.

    Both models are globals resolved at call time. Returns one transformed
    document per input document, in input order.
    """
    with_trigrams = []
    for tokens in texts:
        bigram_doc = bigram_mod[tokens]
        with_trigrams.append(trigram_mod[bigram_doc])
    return with_trigrams

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Each document (a sequence of string tokens) is joined with single spaces,
    run through the module-level spaCy pipeline ``nlp``, and reduced to the
    lemmas of the tokens whose ``token.pos_`` is in ``allowed_postags``.
    Tag inventory: https://spacy.io/api/annotation

    Args:
        texts: iterable of token sequences, one per document.
        allowed_postags: container of POS tags to keep. The default is a
            tuple rather than a list so the default object cannot be mutated
            between calls; callers may still pass a list.

    Returns:
        A list with one list of lemma strings per input document, in order.
    """
    # nlp.pipe streams the documents through the pipeline in batches instead
    # of paying the per-call overhead of nlp(text) for every document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

CPU times: user 457 µs, sys: 175 µs, total: 632 µs
Wall time: 438 µs


### Text Prep

I want to use parallelization as much as possible to save time.

In [13]:
# select the text
# - dropna() first: re.sub raises TypeError on a missing (None/NaN) text value.
# - copy(): make df_text an independent frame so that adding a column below
#   does not assign into a slice of df (chained-assignment warning).
df_text = df[['text']].dropna().copy()
#df_title = df['title']

# remove punctuation and numbers using parallel_apply
# The pattern replaces every run of characters other than ASCII letters with a
# single space, so accented / non-Latin letters are removed as well.
df_text['text_cleaned'] = df_text['text'].parallel_apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))
#df_title['title_cleaned'] = df_title.parallel_apply(lambda x: re.sub('[^A-Za-z]+', ' ', x))

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1250), Label(value='0 / 1250'))), …

In [16]:
# drop na and duplicates
# Rows are compared on all columns of df_text when looking for duplicates.
df_text = df_text.dropna().drop_duplicates()
# convert to str type
df_text['text_cleaned'] = df_text['text_cleaned'].astype(str)
# Summary of row count, columns and dtypes after cleaning.
df_text.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9990 entries, 158935 to 307
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          9990 non-null   object
 1   text_cleaned  9990 non-null   object
dtypes: object(2)
memory usage: 234.1+ KB


In [17]:
# save a copy
df_text_before = df_text.copy()

# define a function to handle errors
def handle_errors(func):
    """Decorator that turns exceptions from a one-argument function into NaN.

    The wrapped function is called with its single argument ``x``. If it raises
    any ``Exception``, the offending input and the error are printed and
    ``np.nan`` is returned instead, so one bad row does not abort a whole
    ``apply``.

    Args:
        func: callable taking exactly one positional argument.

    Returns:
        A wrapper with the same name and docstring as ``func``.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(func)  # keep func's __name__/__doc__ on the wrapper
    def wrapper(x):
        try:
            return func(x)
        except Exception as e:
            # Report the cause as well as the row; previously the exception
            # was caught and discarded, hiding why the row failed.
            print(f"Error processing row: {x} ({type(e).__name__}: {e})")
            return np.nan
    return wrapper

# define the remove_stopwords function with the handle_errors decorator
@handle_errors
def remove_stopwords(row):
    """Tokenize one text row and drop English stop words.

    Args:
        row: a single document string, passed to ``simple_preprocess``.

    Returns:
        The list of tokens that are not English stop words. Because of the
        ``handle_errors`` decorator, ``np.nan`` is returned if processing
        raises.

    Note:
        This redefines the ``remove_stopwords(texts)`` helper declared in an
        earlier cell, which operated on a collection of documents.
    """
    # Evaluate the stop-word list once per row instead of once per token, and
    # keep it in a set for O(1) membership tests.
    english_stops = set(stopwords.words('english'))
    return [i for i in simple_preprocess(row) if i not in english_stops]

# apply remove_stopwords function with try/except
df_text['text_cleaned'] = df_text['text_cleaned'].parallel_apply(remove_stopwords)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1249), Label(value='0 / 1249'))), …

In [20]:
df_text.head()

Unnamed: 0,text,text_cleaned
158935,"Accept the updated privacy cookie policy Dear user, ET CIO privacy and cookie policy has been updated to align with the new data regulations in European Union. Please review and accept these changes below to continue using the website.You can see our privacy policy our cookie policy. We use cookies to ensure the best experience for you on our website.If you choose to ignore this message, we'll assume that you are happy to receive all cookies on ET CIO. AnalyticsNecessaryNewsletter NameP...","['accept', 'updated', 'privacy', 'cookie', 'policy', 'dear', 'user', 'et', 'cio', 'privacy', 'cookie', 'policy', 'updated', 'align', 'new', 'data', 'regulations', 'european', 'union', 'please', 'review', 'accept', 'changes', 'continue', 'using', 'website', 'see', 'privacy', 'policy', 'cookie', 'policy', 'use', 'cookies', 'ensure', 'best', 'experience', 'website', 'choose', 'ignore', 'message', 'assume', 'happy', 'receive', 'cookies', 'et', 'cio', 'google', 'analyticsgoogle', 'yearhttpsto', '..."
77297,"Skip to contentWeatherCOVID 19 CoverageWho's HiringLivestreamElection PodcastHomeElection ResultsNewsAPContestsNationalWildlife WatchWCAX News Livestream WeatherClosingsRadarSki Board ReportWeather CamsWCAX Weather AppWeather FAQWeather FeaturesAstronomySportsCommunity CalendarContact UsMeet the TeamAdvertise With UsCommunity PartnersGot a Story Idea?Order Channel 3 News StoriesWCAX JobsPaid InternshipsStream News Live and On DemandNewsletterECHO, Leahy Center for Lake ChamplainJumpOnITVid...","['skip', 'coveragewho', 'watchwcax', 'news', 'livestream', 'board', 'reportweather', 'camswcax', 'weather', 'appweather', 'faqweather', 'calendarcontact', 'usmeet', 'teamadvertise', 'uscommunity', 'partnersgot', 'story', 'idea', 'order', 'channel', 'news', 'storieswcax', 'jobspaid', 'news', 'live', 'leahy', 'center', 'lake', 'resultscovid', 'mapfoodbankshow', 'watch', 'listingssubmit', 'photos', 'videoscircle', 'country', 'music', 'lifestylegray', 'dc', 'fencebuy', 'vermont', 'firstmade', 'v..."
99045,Nvidia puts AI at center of latest GeForce graphics card upgrade The Seattle Times Traffic Lab Law Justice Local Politics Education Education Lab Eastside Environment Health Data Mental Health Project Homeless Times Watchdog Boeing Aerospace Amazon Microsoft Technology Real Estate Economy Artificial Intelligence Seahawks Mariners Huskies Cougars Storm Sounders Kraken Reign High School Sports On TV/Radio Movies Books Music Theater Classical Music TV/Streaming Comics Games Puzzles H...,"['nvidia', 'puts', 'ai', 'center', 'latest', 'geforce', 'graphics', 'card', 'upgrade', 'seattle', 'times', 'traffic', 'lab', 'law', 'justice', 'local', 'politics', 'education', 'education', 'lab', 'eastside', 'environment', 'health', 'data', 'mental', 'health', 'project', 'homeless', 'times', 'watchdog', 'boeing', 'aerospace', 'amazon', 'microsoft', 'technology', 'real', 'estate', 'economy', 'artificial', 'intelligence', 'seahawks', 'mariners', 'huskies', 'cougars', 'storm', 'sounders', 'kra..."
79651,"BusinessTechnologyWorldNationalPoliticsMedia CultureOpinionSportsSocial CapitalCryptoListings MoreSpotlightGlossaryEditionsAustralia EditionIndia EditionInternational EditionSingapore EditionUnited KingdomUnited StatesNEWSLETTERFollow Us KEY POINTSThe beta version of SK 's ""A."" pronounced ""A dot"" was launched in South Korea in May 2022A. is reportedly based on generative AI just like ChatGPTSKT will reportedly integrate various services it owns into the chatbotSouth Korea's SK is ...","['editionindia', 'editionunited', 'kingdomunited', 'us', 'key', 'pointsthe', 'beta', 'version', 'sk', 'pronounced', 'dot', 'launched', 'south', 'korea', 'may', 'reportedly', 'based', 'generative', 'ai', 'like', 'chatgptskt', 'reportedly', 'integrate', 'various', 'services', 'owns', 'chatbotsouth', 'korea', 'sk', 'planning', 'year', 'fully', 'launch', 'artificial', 'intelligence', 'chatbot', 'executive', 'pany', 'said', 'offer', 'lot', 'booming', 'chatgpt', 'mobile', 'world', 'congress', 'bar..."
4568,Skip to contentAsk the ExpertContestsDay Trippin'Healthcare TodayHeroes Among UsThe ShoppesProud to be a FarmerWatch LiveNewsWeatherSportsLocker Room Report2022 ElectionsLatest VideoCoronavirusContact UsHomeWatch WALB LiveWALB on RokuNewsBinge It!Civil Rights LegacyCrimeEducationEditorialsHealthGood NewsMultimediaInvestigateNationalPoliticsStateSweet Tea History PodcastWeatherFirst View 10 CamerasHurricane CoverageClosingsRadarCoronavirusFind out where the COVID 19 vaccine is available in ...,"['skip', 'contentask', 'trippin', 'healthcare', 'todayheroes', 'among', 'usthe', 'shoppesproud', 'farmerwatch', 'room', 'report', 'electionslatest', 'ushomewatch', 'walb', 'livewalb', 'rokunewsbinge', 'civil', 'rights', 'tea', 'history', 'view', 'covid', 'vaccine', 'available', 'areaarea', 'hospitals', 'daily', 'covid', 'numbersphoebe', 'covid', 'numbers', 'sportsfish', 'game', 'forecasthigh', 'school', 'sportssports', 'talklocker', 'room', 'reportlocker', 'room', 'report', 'weekplayer', 'we..."


### LDA on article text

In [22]:
%%time
# tokenize the text
# NOTE(review): 'text_cleaned' was replaced by the output of remove_stopwords in an
# earlier cell, so each element here is presumably already a token list (or NaN for
# rows that failed). sent_to_words calls str() on each element and tokenizes that
# string again -- confirm whether this second pass is needed.
data_list = df_text['text_cleaned'].tolist()
data_tokens = list(sent_to_words(data_list))

CPU times: user 24.7 s, sys: 195 ms, total: 24.8 s
Wall time: 24.8 s


In [23]:
%%time
# create bigrams & trigrams
# The trigram model is trained on the bigram-transformed tokens.
# NOTE(review): min_count=1 / threshold=1 are very permissive settings; the sample
# output of the lemmatization cell shows long joined tokens such as
# 'cio_privacy_cookie_policy' -- confirm these values are intended.
bigram = gensim.models.Phrases(data_tokens, min_count=1, threshold=1)
trigram = gensim.models.Phrases(bigram[data_tokens], threshold=1)

# Phraser objects are the globals that make_bigrams / make_trigrams index into.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

CPU times: user 38.2 s, sys: 387 ms, total: 38.6 s
Wall time: 38.6 s


In [24]:
%%time
# Remove Stop Words
# (disabled here; stop words were already removed from 'text_cleaned' in an earlier cell)
#data_tokens_nostops = remove_stopwords(data_tokens)

# Create n-grams
# NOTE(review): data_words_bigrams is not used again in the visible part of the
# notebook; only the trigram output feeds the next step.
data_words_bigrams = make_bigrams(data_tokens)
data_words_trigrams = make_trigrams(data_tokens)

# Combine tokens and n-grams
# data_tokens_cobnined = data_tokens_nostops + data_words_bigrams + data_words_trigrams
# Currently only the trigram-transformed documents are kept.
data_tokens_cobnined = data_words_trigrams

# Lemmatize text keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_tokens_cobnined, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas of the first document as a sanity check.
print(*data_lemmatized[:1])

['cio_privacy_cookie_policy', 'updated_align_new', 'data_regulations_european_union', 'please_review_accept_change', 'use_cookies_ensure_b', 'experience_website_choose_ignore', 'message_assume_happy_receive', 'cookies_et_cio_google', 'analyticsgoogle_yearhttpsto_track_visitor', 'site_origin', 'track_article_yearhttpsto_track', 'visitors_site', 'behaviour_optouttime', 'dayhttpsstores_user', 'internet_hourshttpsto_serve_content', 'relevant_internet', 'identify_userostidtime', 'yearhttpsoauth_user', 'browsers_namepurpose_daily_daily', 'list_important', 'newspromo_mailersreceive', 'read_accepted_terms_condition', 'news_energy_new', 'real_estate_news_brand', 'equity_cfo_new', 'news_government_news_hospitality', 'news_et_travelworld_new', 'infra_new', 'news_hrsea_news_hrme', 'news_ing_event', 'cfo_meet_discussion_revise', 'driven_supply', 'chain', 'transformation', 'hr', 'agorajesh', 'communication', 'strategic_shift', 'cloud_first', 'cloud', 'smart', 'day', 'tech', 'apprentice', 'increase',

In [25]:
%%time

# Creating the term dictionary of our corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(data_lemmatized)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
# Each document becomes the bag-of-words representation returned by dictionary.doc2bow.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in data_lemmatized]

CPU times: user 6.59 s, sys: 95.9 ms, total: 6.68 s
Wall time: 6.68 s


In [None]:
# Leave one core free for the main process, but never drop below one worker:
# on a single-core machine cpu_count() - 1 would be 0.
num_processors = multiprocessing.cpu_count()
workers = max(1, num_processors - 1)
print(f'Using {workers} workers')

In [None]:
# supporting function
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train an LDA model and return its c_v coherence score.

    Args:
        corpus: bag-of-words corpus to train on.
        dictionary: gensim dictionary mapping token ids to words.
        k: number of topics.
        a: value passed as the ``alpha`` prior.
        b: value passed as the ``eta`` prior.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.

    Note:
        ``workers`` and ``data_lemmatized`` are read from module globals.
    """
    # Train on the corpus that was passed in. The previous version ignored the
    # `corpus` argument and always used the global doc_term_matrix, so calls
    # with a different corpus silently trained on the same data.
    lda_model = LdaMulticore(corpus=corpus,
                       id2word=dictionary,
                       num_topics=k,
                       random_state=100,
                       passes=10,
                       alpha=a,
                       eta=b,
                       workers=workers)

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=dictionary, coherence='c_v')

    return coherence_model_lda.get_coherence()