Use BERTopic to do topic modelling on the titles and abstracts of papers that mention each model.
https://maartengr.github.io/BERTopic/

In [26]:
import re

import pandas as pd
from tqdm import tqdm

# Model names searched for in paper titles and abstracts, split into open-source
# and closed-source models.
# 'PanGU-Σ' and 'Luminous' could not be found in the closed-source index, and
# 'Galatica', 'YaLM' and 'PanGu-α' could not be found in the open-source list,
# so they are left out of the lists below.
openSourceList = ['T5', 'mT5', 'CPM-2','T0','GPT-NeoX-20B','CodeGen','Tk-Instruct','UL2','OPT','NLLB','BLOOM','GLM','Flan-T5','mT0','BLOOMZ','OPT-IML','Pythia','LLaMA','Vicuna','ChatGLM','CodeGeeX','Koala']
closeSourceList = ['GShard','GPT-3','LaMDA','HyperCLOVA','Codex','ERNIE 3','Jurassic-1','FLAN','MT-NLG','Yuan 1.0','Anthropic','WebGPT','Gopher','ERNIE 3.0 Titan','GLaM','InstructGPT','ChatGPT','AlphaCode','Chinchilla','PaLM','Cohere','AlexaTM','Sparrow','WeLM','U-PaLM','Flan-PaLM','Flan-U-PaLM','Alpaca','GPT-4' ,'Claude']

# Folder that holds the input CSV (an absolute, machine-specific path).
readfile = "/data/jx4237data/DataForChatGPTinnovationWaves/"

# Paper ids to exclude. The original notes describe them as a "fake" PaLM
# paper (1909.02134) and a "fake" OpenAI paper (1506.04006).
delete_list = ['1506.04006','1909.02134']

# Read every column as object dtype, then drop the excluded papers.
df = (
    pd.read_csv(readfile + 'LLM09enhanced.csv', dtype=object)
    .loc[lambda papers: ~papers['id'].isin(delete_list)]
)

In [27]:
# model2paper: model name -> set of ids of the papers whose title or abstract
# mentions that model.
# The text columns are the same for every model, so fetch them once.
abstracts = df['abstract']
titles = df['title']

model2paper = {}
for model in tqdm(openSourceList+closeSourceList):
    if model in model2paper:
        # The same name appears more than once across the two lists.
        print('error')
    # Escape the name so regex metacharacters in it (e.g. the '.' in
    # 'Yuan 1.0' or 'ERNIE 3.0 Titan') match literally, not as wildcards.
    pattern = r"\b%s\b" % re.escape(model)
    # Whole-word, case-insensitive search. A case-insensitive hit covers every
    # case-sensitive hit, so one search per column is enough. na=False treats
    # a missing title/abstract as "no mention" instead of propagating NaN
    # into the boolean mask.
    # NOTE(review): ignoring case means names such as 'OPT' or 'BLOOM' also
    # match the ordinary words "opt" / "bloom" - confirm this is intended.
    mentioned = (
        abstracts.str.contains(pattern, case=False, regex=True, na=False)
        | titles.str.contains(pattern, case=False, regex=True, na=False)
    )
    model2paper[model] = set(df.loc[mentioned, 'id'])

# paper2model: paper id -> list of the models it mentions, in the order the
# models appear in model2paper.
paper2model = {}
for model, paperSet in model2paper.items():
    for paper in paperSet:
        paper2model.setdefault(paper, []).append(model)


# Ids of papers that mention at least one open-source / closed-source model.
openPaperSet = set().union(*(model2paper[m] for m in openSourceList))
closePaperSet = set().union(*(model2paper[m] for m in closeSourceList))

100%|██████████| 52/52 [00:20<00:00,  2.56it/s]


In [28]:
# model2num: model name -> rank by number of matching papers (0 = most papers).
# The sort is stable, so models with equally many papers keep their
# model2paper order.
models_by_paper_count = sorted(
    model2paper.items(), key=lambda item: len(item[1]), reverse=True
)
model2num = {name: rank for rank, (name, _papers) in enumerate(models_by_paper_count)}
num_seq = len(model2num)

In [29]:
# num2model: inverse of model2num (rank number -> model name).
num2model = {rank: name for name, rank in model2num.items()}

In [30]:
# Pick one representative model per paper, plus that model's rank.
# The representative is the mentioned model with the LARGEST rank number, i.e.
# the one that sits lowest in the by-paper-count ordering of model2num.
# NOTE(review): confirm that preferring the least-mentioned model is intended.
paper2onemodel = {}
paper2onemodelrank = {}
for paper_id, mentioned_models in paper2model.items():
    representative = max(mentioned_models, key=model2num.__getitem__)
    paper2onemodel[paper_id] = representative
    paper2onemodelrank[paper_id] = model2num[representative]

In [31]:
# Partition the matched papers by the kind of model(s) they mention.
pureOpen = openPaperSet - closePaperSet    # open-source models only
pureClose = closePaperSet - openPaperSet   # closed-source models only
mixed = openPaperSet & closePaperSet       # both kinds
# NOTE: `all` shadows the Python builtin; the name is kept because a later
# cell filters the papers with it.
all = openPaperSet | closePaperSet

# paper2pubdate: paper id -> value of the 'update_date' column.
# Built in a single pass; materialising both columns as lists on every loop
# iteration would make this quadratic in the number of papers.
# NOTE(review): the name says "pubdate" but the column read is 'update_date'
# (not 'publish_date_v1') - confirm which date is wanted.
paper2pubdate = dict(zip(df['id'], df['update_date']))

In [32]:
# Build the corpus: title + abstract of the papers that mention a model.

# Parsed publication date and the combined text used as the document.
df['date'] = pd.to_datetime(df['publish_date_v1'])
df['text'] = df[['title', 'abstract']].agg(' '.join, axis=1)

# Keep only papers that matched at least one model. The explicit .copy() makes
# model_df an independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning.
model_df = df[df['id'].isin(all)].copy()

In [33]:
def label_rep_model(row):
    """Return the representative model name stored for the paper in ``row``.

    ``row`` must support ``row['id']``. The id is looked up in the global
    ``paper2onemodel`` mapping; an unknown id raises ``KeyError``.
    """
    paper_id = row['id']
    return paper2onemodel[paper_id]


def label_rep_model_rank(row):
    """Return the rank of the representative model for the paper in ``row``.

    ``row`` must support ``row['id']``. The id is looked up in the global
    ``paper2onemodelrank`` mapping; an unknown id raises ``KeyError``.
    """
    paper_id = row['id']
    return paper2onemodelrank[paper_id]

In [34]:
# Attach each paper's representative model and that model's rank as columns.
# The labelling functions already take a row, so they are passed directly.
model_df['rep_model'] = model_df.apply(label_rep_model, axis=1)
model_df['rep_model_rank'] = model_df.apply(label_rep_model_rank, axis=1)


In [35]:
# Inputs for the first fit: every matched paper is used (no rank filter here).
#   text      - combined title + abstract
#   date      - parsed publication date
#   y         - rank of the paper's representative model
#   cate_name - name of the paper's representative model
text, date, y, cate_name = (
    model_df[column] for column in ('text', 'date', 'rep_model_rank', 'rep_model')
)

In [36]:
from bertopic import BERTopic

In [37]:
from bertopic import BERTopic
from bertopic.backend import BaseEmbedder
from bertopic.cluster import BaseCluster
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.feature_extraction.text import CountVectorizer
# First fit: the topics are not discovered by clustering; the representative-model
# rank of each paper is passed as `y`.
# (Per the BERTopic "manual topic modeling" recipe the supplied labels become the
# topic assignments - see the BERTopic documentation.)

# Bag-of-words settings: English stop words removed, terms must occur in at least
# 2 documents, unigrams and bigrams are counted.
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
# Prepare our empty sub-models and reduce frequent words while we are at it.
# The Base* classes are passed in place of the embedding, dimensionality-reduction
# and clustering steps.
empty_embedding_model = BaseEmbedder()
empty_dimensionality_model = BaseDimensionalityReduction()
empty_cluster_model = BaseCluster()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Fit BERTopic without actually performing any clustering
topic_model= BERTopic(
        embedding_model=empty_embedding_model,
        umap_model=empty_dimensionality_model,
        hdbscan_model=empty_cluster_model,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        top_n_words=30,
)
# `text` is the title+abstract Series and `y` the representative-model ranks.
topics, probs = topic_model.fit_transform(text, y=y)

In [38]:
# Display the per-topic summary table of the fitted model.
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,901,0_chatgpt_ai_potential_generated,"[chatgpt, ai, potential, generated, students, ...",[Exploring User Perspectives on ChatGPT: Appli...
1,1,592,1_gpt_shot_learning_human,"[gpt, shot, learning, human, knowledge, prompt...",[Clinical Prompt Learning with Frozen Language...
2,2,486,2_gpt_chatgpt gpt_gpt gpt_llms,"[gpt, chatgpt gpt, gpt gpt, llms, reasoning, l...",[NavGPT: Explicit Reasoning in Vision-and-Lang...
3,3,222,3_t5_pre_pre trained_sequence,"[t5, pre, pre trained, sequence, bart, transfo...",[Investigating Pre-trained Language Models on ...
4,4,165,4_llama_quantization_tuning_lora,"[llama, quantization, tuning, lora, instructio...",[OmniQuant: Omnidirectionally Calibrated Quant...
5,5,98,5_codex_code_program_repair,"[codex, code, program, repair, openai codex, p...",[InferFix: End-to-End Program Repair with LLMs...
6,6,69,6_palm_gpt palm_med palm_med,"[palm, gpt palm, med palm, med, math, attribut...",[Towards Generalist Biomedical AI Medicine i...
7,7,58,7_opt_quantization_125m_perplexity,"[opt, quantization, 125m, perplexity, opt 175b...",[FineQuant: Unlocking Efficiency with Fine-Gra...
8,8,52,8_bloom_multilingual_176b_bloom 176b,"[bloom, multilingual, 176b, bloom 176b, bigsci...",[PolyLM: An Open Source Polyglot Large Languag...
9,9,48,9_instructgpt_negation_emergent abilities_negated,"[instructgpt, negation, emergent abilities, ne...",[Improving Patient Pre-screening for Clinical ...


In [39]:
# Use the rank -> model-name mapping as the custom topic labels.
topic_model.set_topic_labels(num2model)

In [40]:
# Bar charts for up to 30 topics with 30 words each, titled with the custom labels.
topic_model.visualize_barchart(custom_labels=True, top_n_topics = 30, n_words = 30 , height= 1000)

In [41]:
# Plot the topics with BERTopic's visualize_topics, using its default arguments.
topic_model.visualize_topics()

In [42]:
# Heatmap visualization of the topics, shown with the custom labels.
topic_model.visualize_heatmap(custom_labels =True)

In [43]:
# from sklearn.datasets import fetch_20newsgroups
# from sentence_transformers import SentenceTransformer
# from bertopic import BERTopic
# from umap import UMAP
# from hdbscan import HDBSCAN

# hdbscan_model = HDBSCAN(min_cluster_size=29, metric='euclidean', cluster_selection_method='eom', prediction_data=True)
# # Prepare embeddings
# sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
# embeddings = sentence_model.encode(text.tolist(), show_progress_bar=False)
# from sklearn.feature_extraction.text import CountVectorizer
# vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))
# # Train BERTopic
# topic_model = BERTopic(hdbscan_model=hdbscan_model,vectorizer_model=vectorizer_model,).fit(text.tolist(), embeddings)
# topics, probs = topic_model.fit_transform(text.tolist(), y=y)
# # Run the visualization with the original embeddings
# topic_model.visualize_documents(text.tolist(), embeddings=embeddings)

# # Reduce dimensionality of embeddings, this step is optional but much faster to perform iteratively:
# reduced_embeddings = UMAP(n_neighbors=10, n_components=2, min_dist=0.0, metric='cosine').fit_transform(embeddings)
# topic_model.visualize_documents(text.tolist(), reduced_embeddings=reduced_embeddings)

In [44]:
# Second fit uses only papers whose representative model has a rank below 10.
in_top10 = model_df['rep_model_rank'] < 10
model_df = model_df.loc[in_top10]

# Re-derive the fit inputs from the filtered frame:
# text (title + abstract), date, y (rank) and cate_name (model name).
text, date, y, cate_name = (
    model_df[column] for column in ('text', 'date', 'rep_model_rank', 'rep_model')
)

In [45]:
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.dimensionality import BaseDimensionalityReduction
from sklearn.linear_model import LogisticRegression
# Second fit: a supervised variant in which a LogisticRegression classifier is
# passed in place of the clustering model and the model ranks are the targets `y`.
# No embedding_model is passed here (unlike the previous fit), so BERTopic falls
# back to whatever embedding backend it uses by default.

# Same bag-of-words settings as before: English stop words removed, min_df=2,
# unigrams and bigrams.
vectorizer_model = CountVectorizer(stop_words="english", min_df=2, ngram_range=(1, 2))


# Skip over dimensionality reduction, replace cluster model with classifier,
# and reduce frequent words while we are at it.
empty_dimensionality_model = BaseDimensionalityReduction()
clf = LogisticRegression()
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Create a fully supervised BERTopic instance
topic_model= BERTopic(
        umap_model=empty_dimensionality_model,
        hdbscan_model=clf,
        ctfidf_model=ctfidf_model,
        vectorizer_model=vectorizer_model,
        top_n_words=20,
)
# `text` / `y` here are the rank < 10 subset prepared in the previous cell.
topics, probs = topic_model.fit_transform(text, y=y)


In [46]:
# Per-topic summary table of the supervised model.
# Stored as `topic_info` rather than `df` so the paper DataFrame loaded at the
# top of the notebook is not overwritten and the earlier cells that read `df`
# can still be re-run.
topic_info = topic_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,0,901,0_chatgpt_ai_llms_research,"[chatgpt, ai, llms, research, study, potential...",[How Close is ChatGPT to Human Experts? Compar...
1,1,592,1_gpt3_models_language_model,"[gpt3, models, language, model, language model...",[An Empirical Study of GPT-3 for Few-Shot Know...
2,2,486,2_gpt4_llms_models_large language,"[gpt4, llms, models, large language, large, la...",[Benchmarking Large Language Models on CMExam ...
3,3,222,3_t5_pretrained_model_models,"[t5, pretrained, model, models, pretraining, p...",[M6-Rec: Generative Pretrained Language Models...
4,4,165,4_llama_llms_model_models,"[llama, llms, model, models, large language, f...",[BayLing: Bridging Cross-lingual Alignment and...
5,5,98,5_codex_code_program_programming,"[codex, code, program, programming, repair, pr...",[InferFix: End-to-End Program Repair with LLMs...
6,6,69,6_palm_language_models_math,"[palm, language, models, math, language models...",[PAL: Program-aided Language Models Large la...
7,7,58,7_opt_quantization_perplexity_inference,"[opt, quantization, perplexity, inference, mod...",[GPTQ: Accurate Post-Training Quantization for...
8,8,52,8_multilingual_languages_multilingual language...,"[multilingual, languages, multilingual languag...",[Answering Unseen Questions With Smaller Langu...
9,9,6,9_assumptions_similarity_definitive_break,"[assumptions, similarity, definitive, break, m...",[CSTS: Conditional Semantic Textual Similarity...


In [47]:
# Use the rank -> model-name mapping as custom labels for the supervised model.
topic_model.set_topic_labels(num2model)

In [48]:
# Bar charts for up to 20 topics with 20 words each, titled with the custom labels.
topic_model.visualize_barchart(custom_labels=True, top_n_topics = 20, n_words = 20 , height= 1000)
# TODO (author's note): list more "fingerprint" words per model, from 10 up to 50.

In [49]:
# Plot the topics of the supervised model with visualize_topics (default arguments).
topic_model.visualize_topics()
# Open questions / ideas noted by the author:
# - highlight all topics; what do the axes d1, d2 mean (specification, fine-tuning)?
# - fingerprints
# - quantization
# - finances


In [50]:
# Heatmap visualization of the supervised model's topics, shown with the custom labels.
topic_model.visualize_heatmap(custom_labels =True)