In [49]:
#!pip install pandarallel

In [50]:
from pandarallel import pandarallel
import multiprocessing

num_processors = multiprocessing.cpu_count()
print(f'Available CPUs: {num_processors}')

Available CPUs: 16


In [51]:
pandarallel.initialize(nb_workers=num_processors-1, use_memory_fs=False, progress_bar = True)

INFO: Pandarallel will run on 15 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [1]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

In [2]:
import warnings

# warnings.simplefilter('once')
warnings.simplefilter('ignore')
# warnings.filterwarnings("ignore", category=FutureWarning)
# warnings.filterwarnings("ignore", category=DeprecationWarning)
# warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

In [3]:
%%time
df = pd.read_parquet("filtered_data.parquet")

CPU times: user 29.6 s, sys: 13 s, total: 42.5 s
Wall time: 32.4 s


In [4]:
df.head()

Unnamed: 0,url,date,title,text,main_text,relevant,Tokens,cleaned_tokens
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,"Photo taken on July 1, 2019, shows a sign for ...",1,"[Photo, taken, on, July, 1, ,, 2019, ,, shows,...","[Photo, taken, July, show, sign, electronic, t..."
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,Children With Autism Saw Their Learning and So...,1,"[Children, With, Autism, Saw, Their, Learning,...","[Children, With, Autism, Saw, Their, Learning,..."
2,http://www.dataweek.co.za/12835r,2021-03-26,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...","Forget ML, AI and Industry 4.0 – obsolescence ...",1,"[Forget, ML, ,, AI, and, Industry, 4.0, –, obs...","[Forget, ML, AI, Industry, obsolescence, focus..."
3,http://www.homeoffice.consumerelectronicsnet.c...,2021-03-10,Strategy Analytics: 71% of Smartphones Sold Gl...,\n\nStrategy Analytics: 71% of Smartphones Sol...,Strategy Analytics: 71% of Smartphones Sold Gl...,1,"[Strategy, Analytics, :, 71, %, of, Smartphone...","[Strategy, Analytics, Smartphones, Sold, Globa..."
4,http://www.itbusinessnet.com/2020/10/olympus-t...,2020-10-20,Olympus to Support Endoscopic AI Diagnosis Edu...,\n\nOlympus to Support Endoscopic AI Diagnosis...,Olympus to Support Endoscopic AI Diagnosis Edu...,1,"[Olympus, to, Support, Endoscopic, AI, Diagnos...","[Olympus, Support, Endoscopic, AI, Diagnosis, ..."


In [5]:
df.shape

(127739, 8)

## LDA with gensim

In [None]:
!pip install textblob
!pip install nltk
!pip install gensim
!pip install pyLDAvis

In [None]:
import time
import math
import re
from textblob import TextBlob


import nltk as nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

import string


import gensim
from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from nltk.stem.wordnet import WordNetLemmatizer

# import pyLDAvis.gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

import multiprocessing

num_processors = multiprocessing.cpu_count()
num_processors
workers=num_processors-1

from tqdm import tqdm

In [None]:
# define helper function
def dict_doc_term(text_df):
    
    doc_clean = text_df.to_list()
    
    # Creating the term dictionary of our courpus, where every unique term is assigned an index. 
    dictionary = corpora.Dictionary(doc_clean)

    # Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
    
    return doc_clean, dictionary, doc_term_matrix


def n_topic(doc_clean, dictionary, doc_term_matrix, topic_ls):
    
    coherence_score = []
    
    for n in tqdm(topic_ls):

        lda_model = LdaMulticore(corpus = doc_term_matrix,
                   id2word = dictionary,
                   num_topics = n,
                   random_state = 100,
                   passes = 10,
                   alpha = 'symmetric',
                   eta = 'auto',
                   workers = workers)
        
        coherence_model_lda = CoherenceModel(model = lda_model, texts = doc_clean, dictionary = dictionary, coherence = 'c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        coherence_score.append(coherence_lda)
        
        print(f'\nWith {n} topics:')
        print(*lda_model.print_topics(num_topics = n, num_words = 10), sep='\n')
        
    return pd.DataFrame({"n": topic_ls, "coherence_score": coherence_score})

In [None]:
%%time
df_cleaned, df_dict, df_matrix = dict_doc_term(df['cleaned_tokens'])

CPU times: user 2min 21s, sys: 1.96 s, total: 2min 22s
Wall time: 2min 23s


In [None]:
#%%time
df_score = n_topic(df_cleaned, df_dict, df_matrix, [16])

100%|██████████| 1/1 [1:03:06<00:00, 3786.06s/it]


With 16 topics:
(0, '0.023*"Paid" + 0.021*"Program" + 0.019*"Best" + 0.017*"BrandVoice" + 0.014*"AI" + 0.012*"Forbes" + 0.010*"Richest" + 0.008*"The" + 0.007*"Insurance" + 0.006*"Credit"')
(1, '0.038*"market" + 0.029*"Market" + 0.021*"Artificial" + 0.021*"Intelligence" + 0.018*"report" + 0.015*"The" + 0.014*"AI" + 0.010*"Global" + 0.009*"growth" + 0.008*"analysis"')
(2, '0.020*"Finance" + 0.014*"Screener" + 0.013*"Yahoo" + 0.010*"News" + 0.009*"AI" + 0.009*"Fund" + 0.008*"ETF" + 0.008*"Stocks" + 0.007*"Markets" + 0.007*"stock"')
(3, '0.013*"patient" + 0.010*"The" + 0.010*"AI" + 0.008*"healthcare" + 0.008*"health" + 0.006*"medical" + 0.005*"data" + 0.005*"disease" + 0.005*"study" + 0.005*"care"')
(4, '0.012*"AI" + 0.012*"The" + 0.009*"said" + 0.006*"country" + 0.006*"people" + 0.006*"We" + 0.005*"government" + 0.005*"China" + 0.005*"technology" + 0.004*"also"')
(5, '0.012*"AI" + 0.011*"The" + 0.010*"technology" + 0.007*"system" + 0.007*"intelligence" + 0.006*"artificial" + 0.005*"state




In [None]:
df_score

Unnamed: 0,n,coherence_score
0,16,0.463993


In [None]:
n_topics = [4, 6, 8, 10, 12, 14, 16]
coherence_scores = [0.525972, 0.491855, 0.483164, 0.481007, 0.490952, 0.477404, 0.463993]
time = ["10:30", "13:46", "16:37", "26:21", "36:22", "48:30", "1:03:06"]

gensim_res = pd.DataFrame({"n_topic": n_topics, "coherence_score": coherence_scores, "time": time})
gensim_res

Unnamed: 0,n_topic,coherence_score,time
0,4,0.525972,10:30
1,6,0.491855,13:46
2,8,0.483164,16:37
3,10,0.481007,26:21
4,12,0.490952,36:22
5,14,0.477404,48:30
6,16,0.463993,1:03:06


## LDA with K-Train

In [11]:
#!pip install bokeh 
#!pip install ktrain 

In [12]:
import bokeh
import ktrain
print('Bokeh Version: ' + bokeh.__version__)
print('Ktrain Version: ' + ktrain.__version__)

Bokeh Version: 2.4.3
Ktrain Version: 0.37.0


In [13]:
texts = df['main_text'].tolist()

In [14]:
%%time
tm = ktrain.text.get_topic_model(
    texts=texts, 
    n_topics=20, 
    n_features=10000, 
    min_df=5, 
    max_df=0.5, 
    stop_words='english', 
    model_type='lda', 
    lda_max_iter=5, 
    verbose=1)

lang: en
preprocessing texts...
fitting model...
iteration: 1 of max_iter: 5
iteration: 2 of max_iter: 5
iteration: 3 of max_iter: 5
iteration: 4 of max_iter: 5
iteration: 5 of max_iter: 5
done.
CPU times: user 2h 10min 21s, sys: 4h 58min 1s, total: 7h 8min 23s
Wall time: 34min 39s


In [15]:
%%time
tm.build(texts, threshold=0.25)

done.
CPU times: user 20min 28s, sys: 43min 24s, total: 1h 3min 52s
Wall time: 6min 25s


In [16]:
tm.print_topics(show_counts=True)

topic:0 | count:18040 | solutions platform business cloud software learning company applications machine customers
topic:15 | count:14281 | market report analysis global growth industry key research players forecast
topic:5 | count:13395 | like human learning time make people model machine just used
topic:7 | count:7731 | health healthcare medical care patients clinical drug research patient cancer
topic:17 | count:7659 | customer experience content platform customers business marketing digital company conversational
topic:11 | count:7603 | india news said world china global live indian government covid-19
topic:10 | count:7553 | video content app news users opens music art like image
topic:14 | count:6816 | google microsoft openai said search chatgpt company bing tech users
topic:6 | count:6398 | security said use systems government people law privacy work rights
topic:12 | count:5932 | said people says images musk like news company media years
topic:9 | count:4879 | chatgpt text open

In [37]:
%%time
topic_doc = []

for i in range(20):
    topic_doc.append([text['text'] for text in tm.get_docs(topic_ids=[i], rank=True)])

CPU times: user 130 ms, sys: 0 ns, total: 130 ms
Wall time: 128 ms


In [38]:
def assign_topic(text):
    
    for idx, topic in enumerate(topic_doc):
        if text in topic:
            return str(idx)

In [39]:
df['ktrain_topic'] = df['main_text'].parallel_apply(assign_topic)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=8516), Label(value='0 / 8516'))), …

In [40]:
df.head()

Unnamed: 0,url,date,title,text,main_text,relevant,Tokens,cleaned_tokens,ktrain_topic
0,http://en.people.cn/n3/2021/0318/c90000-983012...,2021-03-18,Artificial intelligence improves parking effic...,\n\nArtificial intelligence improves parking e...,"Photo taken on July 1, 2019, shows a sign for ...",1,"[Photo, taken, on, July, 1, ,, 2019, ,, shows,...","[Photo, taken, July, show, sign, electronic, t...",5
1,http://newsparliament.com/2020/02/27/children-...,2020-02-27,Children With Autism Saw Their Learning and So...,\nChildren With Autism Saw Their Learning and ...,Children With Autism Saw Their Learning and So...,1,"[Children, With, Autism, Saw, Their, Learning,...","[Children, With, Autism, Saw, Their, Learning,...",5
2,http://www.dataweek.co.za/12835r,2021-03-26,"Forget ML, AI and Industry 4.0 – obsolescence ...","\n\nForget ML, AI and Industry 4.0 – obsolesce...","Forget ML, AI and Industry 4.0 – obsolescence ...",1,"[Forget, ML, ,, AI, and, Industry, 4.0, –, obs...","[Forget, ML, AI, Industry, obsolescence, focus...",5
3,http://www.homeoffice.consumerelectronicsnet.c...,2021-03-10,Strategy Analytics: 71% of Smartphones Sold Gl...,\n\nStrategy Analytics: 71% of Smartphones Sol...,Strategy Analytics: 71% of Smartphones Sold Gl...,1,"[Strategy, Analytics, :, 71, %, of, Smartphone...","[Strategy, Analytics, Smartphones, Sold, Globa...",0
4,http://www.itbusinessnet.com/2020/10/olympus-t...,2020-10-20,Olympus to Support Endoscopic AI Diagnosis Edu...,\n\nOlympus to Support Endoscopic AI Diagnosis...,Olympus to Support Endoscopic AI Diagnosis Edu...,1,"[Olympus, to, Support, Endoscopic, AI, Diagnos...","[Olympus, Support, Endoscopic, AI, Diagnosis, ...",7


In [41]:
df['ktrain_topic'].value_counts()

0     18040
15    14281
5     13395
7      7731
17     7659
11     7603
10     7553
14     6816
6      6398
12     5932
9      4879
3      4828
2      4543
4      3207
18     2445
13     2219
8      1670
1      1530
16     1381
19     1301
Name: ktrain_topic, dtype: int64

In [42]:
import pyarrow as pa
import pyarrow.parquet as pq
table = pa.Table.from_pandas(df)
pq.write_table(table, './data_topic.parquet')

In [7]:
#doc_topics=tm.get_doctopics()
#tm.visualize_documents(doc_topics=tm.get_doctopics())

## BERTopic

In [7]:
#!pip install bertopic

In [8]:
from bertopic import BERTopic

2023-05-25 23:48:53.804977: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-25 23:48:55.189069: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-05-25 23:48:55.189279: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such 

### Cleaned_tokens

In [9]:
%%time
articles = df['cleaned_tokens'].tolist()
text = [' '.join(tokens) for tokens in articles]

CPU times: user 4.86 s, sys: 331 ms, total: 5.19 s
Wall time: 5.2 s


In [10]:
%%time
# Create an instance of BERTopic
#topic_model = BERTopic(embedding_model = "bert-base-nli-mean-tokens",  min_topic_size = 150)
topic_model = BERTopic(embedding_model = "bert-base-nli-mean-tokens",  min_topic_size = 50)
# Fit the BERTopic model on your data
topics, _ = topic_model.fit_transform(text)

# Get the most frequent topics
top_topics = topic_model.get_topic_freq()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [11]:
# Print the top N topics
N = 20

for topic in top_topics.head(N)['Topic']:
    topic_words = [word[0] for word in topic_model.get_topic(topic)]
    print(f"Topic {topic}: {', '.join(topic_words)}")

Topic -1: ai, new, model, company, technology, the, data, said, like, use
Topic 0: market, report, growth, analysis, global, forecast, artificial, key, size, intelligence
Topic 1: patient, cancer, clinical, health, medical, disease, healthcare, care, protein, treatment
Topic 2: market, analysis, report, growth, forecast, global, key, players, corporation, size
Topic 3: china, chinese, baidu, beijing, ernie, alibaba, military, chip, government, us
Topic 4: watch, india, live, says, to, from, telecast, after, updates, day
Topic 5: stock, investor, trading, billion, year, company, investment, fool, million, quarter
Topic 6: vehicle, car, road, traffic, driver, driving, safety, automotive, autonomous, fleet
Topic 7: customer, solution, nvidia, platform, edge, enterprise, ddn, supermicro, performance, capability
Topic 8: und, zu, auf, die, sie, im, bewerten, stoxx, nicht, al
Topic 9: trends, nyse, forecast, reports, dagoretti, times, opportunities, size, growth, analysis
Topic 10: student, 

In [12]:
# Save model
topic_model.save('bertopic2')

In [13]:
res = topic_model.get_document_info(text)
res['Name'].value_counts()

-1_ai_new_model_company                      53124
0_market_report_growth_analysis               7625
1_patient_cancer_clinical_health              4522
2_market_analysis_report_growth               2735
3_china_chinese_baidu_beijing                 1517
                                             ...  
389_aeye_diagnosable_retinopathy_diabetic       51
385_deepmotion_namco_bandai_forms               51
384_kheiron_areas_stanford_oncology             51
392_crypto_under_defi_brokersstock              50
391_chief_north_america_canada                  50
Name: Name, Length: 394, dtype: int64

### Title 

In [12]:
#!pip install nltk

In [13]:
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords

In [14]:
stop_words = stopwords.words('english')

In [15]:
title_ls = df['title'].tolist()
#convert to lowercase
title_ls = [title.lower() for title in title_ls]
#remove stopwords
cleaned_title = []
for title in title_ls: 
    tokens = title.split(' ')
    cleaned_tokens = []
    for token in tokens:
        if token not in stop_words:
            cleaned_tokens.append(token)
            
    cleaned_title.append(' '.join(cleaned_tokens))

In [16]:
%%time
# Create an instance of BERTopic
title_topic_model = BERTopic(embedding_model = "bert-base-nli-mean-tokens",  min_topic_size = 150)

# Fit the BERTopic model on your data
title_topics, _ = title_topic_model.fit_transform(cleaned_title)

# Get the most frequent topics
title_top_topics = title_topic_model.get_topic_freq()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [17]:
# Print the top N topics
N = 20

for topic in title_top_topics.head(N)['Topic']:
    topic_words = [word[0] for word in title_topic_model.get_topic(topic)]
    print(f"Topic {topic}: {', '.join(topic_words)}")

Topic -1: ai, new, chatgpt, data, platform, com, news, intelligence, launches, technology
Topic 0: market, growth, analysis, 2020, global, forecast, ibm, 2026, players, corporation
Topic 1: million, billion, raises, american, banking, usd, cagr, reach, news, 10
Topic 2: chatgpt, breaking, news, ai, breakinglatest, google, musk, latest, says, elon
Topic 3: cancer, health, healthcare, medical, breast, drug, patients, news, care, predict
Topic 4: google, bard, search, chatbot, rival, exbulletin, chatgpt, engineer, sentient, cloud
Topic 5: healthcare, medical, market, medicine, growth, corporation, 2020, drug, global, analysis
Topic 6: china, chinese, us, morning, chatgpt, baidu, south, frenzy, alibaba, post
Topic 7: automotive, cars, ford, driving, car, argo, market, self, corporation, autonomous
Topic 8: artificial, intelligence, future, new, human, art, need, com, news, digital
Topic 9: microsoft, bing, search, azure, chatgpt, openai, bakes, engine, word, excel
Topic 10: department, def

In [18]:
# Save model
title_topic_model.save('title_bertopic')

In [23]:
res = title_topic_model.get_document_info(cleaned_title)
res['Name'].value_counts()

-1_ai_new_chatgpt_data                           57733
0_market_growth_analysis_2020                    12755
1_million_billion_raises_american                10426
2_chatgpt_breaking_news_ai                        7706
3_cancer_health_healthcare_medical                3846
                                                 ...  
74_pen_channels_history_union                      164
75_ensure_safe_must_biden                          154
77_rokit_bioprinting_osteoarthritis_radiology      153
76_coursera_data_science_scientist                 153
78_israel_israeli_jerusalem_weizmann               150
Name: Name, Length: 80, dtype: int64

### Identify topics

In [17]:
import re

In [14]:
saved_model = BERTopic.load('bertopic2')

In [64]:
%%time
# Get the most frequent topics
new_top_topics = saved_model.get_topic_freq()
# Print the top N topics
N = 40

for topic in new_top_topics.head(N)['Topic']:
    topic_words = [word[0] for word in saved_model.get_topic(topic)]
    print(f"Topic {topic}: {', '.join(topic_words)}")

Topic -1: ai, new, model, company, technology, the, data, said, like, use
Topic 0: market, report, growth, analysis, global, forecast, artificial, key, size, intelligence
Topic 1: patient, cancer, clinical, health, medical, disease, healthcare, care, protein, treatment
Topic 2: market, analysis, report, growth, forecast, global, key, players, corporation, size
Topic 3: china, chinese, baidu, beijing, ernie, alibaba, military, chip, government, us
Topic 4: watch, india, live, says, to, from, telecast, after, updates, day
Topic 5: stock, investor, trading, billion, year, company, investment, fool, million, quarter
Topic 6: vehicle, car, road, traffic, driver, driving, safety, automotive, autonomous, fleet
Topic 7: customer, solution, nvidia, platform, edge, enterprise, ddn, supermicro, performance, capability
Topic 8: und, zu, auf, die, sie, im, bewerten, stoxx, nicht, al
Topic 9: trends, nyse, forecast, reports, dagoretti, times, opportunities, size, growth, analysis
Topic 10: student, 

In [44]:
#assign industry tag to each topic
industry_name = ['Others']

for i in range(len(new_top_topics['Topic'])-1):
    
    sample = [word[0] for word in saved_model.get_topic(i)]
    
    if re.search(r"crypto(currency)?|blockchain|finance|btc|bank(ing)?|loan|fintech", ' '.join(sample), flags=re.I): industry_name.append('Finance')
    elif re.search(r"patient|health(care)?|cancer|clinical|drug|medicine|genomics", ' '.join(sample), flags=re.I): industry_name.append('Healthcare&Biotech')
    elif re.search(r"student|classroom|education|coursera|essay|school|academic", ' '.join(sample), flags=re.I): industry_name.append('Education')
    elif re.search(r"music|song|photoshop|painting|artist|photography|film", ' '.join(sample), flags=re.I): industry_name.append('Media Creation')
    elif re.search(r"automative|vehicle|tesla|elon|car|energy|automobile", ' '.join(sample), flags=re.I): industry_name.append('Automotive')
    elif re.search(r"telecom|telecast|5g|IoT|mobile|satellite|fiber optics", ' '.join(sample), flags=re.I): industry_name.append('Telecommunication')
    elif re.search(r"agriculture|farm(ing)?|agribotix|farmer|livestock|pest|soil", ' '.join(sample), flags=re.I): industry_name.append('Agriculture')
    elif re.search(r"writer|publication(s)?|book(s)?|publishing|magazine|author|newspapers", ' '.join(sample), flags=re.I): industry_name.append('Publishing')
    elif re.search(r"google|ibm|microsoft|facebook|chip(s)|meta|chatgpt", ' '.join(sample), flags=re.I): industry_name.append('Technology')
    elif re.search(r"military|defense|aircraft|space|cybersecurity|drone|army", ' '.join(sample), flags=re.I): industry_name.append('Military & Defense')
    else: industry_name.append('Others')
    

In [45]:
topic_cnt = saved_model.get_document_info(text).groupby('Name').count().sort_values(by = 'Document', ascending = False).reset_index()
topic_cnt['industry'] = industry_name
topic_cnt.head()

Unnamed: 0,Name,Document,Topic,Top_n_words,Probability,Representative_document,industry
0,-1_ai_new_model_company,53124,53124,53124,53124,53124,Others
1,0_market_report_growth_analysis,7625,7625,7625,7625,7625,Others
2,1_patient_cancer_clinical_health,4522,4522,4522,4522,4522,Healthcare&Biotech
3,2_market_analysis_report_growth,2735,2735,2735,2735,2735,Others
4,3_china_chinese_baidu_beijing,1517,1517,1517,1517,1517,Military & Defense


In [46]:
# map the industry label to the dataframe
industry_label = {'Key': topic_cnt['Name'].tolist(),
                  'Value': topic_cnt['industry'].tolist()}

industry_label_df = pd.DataFrame(industry_label)

result_dict = industry_label_df.set_index('Key')['Value'].to_dict()

In [42]:
doc_info = saved_model.get_document_info(text)
doc_info.head()

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document
0,Photo taken July show sign electronic toll col...,6,6_vehicle_car_road_traffic,vehicle - car - road - traffic - driver - driv...,0.618081,False
1,Children With Autism Saw Their Learning Social...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False
2,Forget ML AI Industry obsolescence focus Febru...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False
3,Strategy Analytics Smartphones Sold Globally A...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False
4,Olympus Support Endoscopic AI Diagnosis Educat...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False


In [55]:
%%time
doc_info['Industry'] = doc_info['Name'].apply(lambda x: result_dict[x])

CPU times: user 26.8 ms, sys: 3.8 ms, total: 30.6 ms
Wall time: 28.7 ms


In [56]:
doc_info.head()

Unnamed: 0,Document,Topic,Name,Top_n_words,Probability,Representative_document,Industry
0,Photo taken July show sign electronic toll col...,6,6_vehicle_car_road_traffic,vehicle - car - road - traffic - driver - driv...,0.618081,False,Automotive
1,Children With Autism Saw Their Learning Social...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False,Others
2,Forget ML AI Industry obsolescence focus Febru...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False,Others
3,Strategy Analytics Smartphones Sold Globally A...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False,Others
4,Olympus Support Endoscopic AI Diagnosis Educat...,-1,-1_ai_new_model_company,ai - new - model - company - technology - the ...,0.0,False,Others


In [60]:
doc_info.groupby('Industry').count().sort_values(by = 'Document', ascending = False)

Unnamed: 0_level_0,Document,Topic,Name,Top_n_words,Probability,Representative_document
Industry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Others,96240,96240,96240,96240,96240,96240
Healthcare&Biotech,7432,7432,7432,7432,7432,7432
Automotive,4758,4758,4758,4758,4758,4758
Finance,4179,4179,4179,4179,4179,4179
Technology,3191,3191,3191,3191,3191,3191
Military & Defense,3124,3124,3124,3124,3124,3124
Telecommunication,2474,2474,2474,2474,2474,2474
Publishing,2324,2324,2324,2324,2324,2324
Education,2300,2300,2300,2300,2300,2300
Media Creation,1070,1070,1070,1070,1070,1070


In [63]:
doc_info.to_csv('industry_info.csv')