# Topic modelling of [AI incidents](https://incidentdatabase.ai/apps/incidents/) using [BERTopic](https://maartengr.github.io/BERTopic/index.html)

<video height="640" controls muted
        src="https://user-images.githubusercontent.com/25746895/218420473-4b2bb539-9dbe-407a-9674-a8317c7fb3bf.mp4"
        type="video/mp4">
</video>



## Installing and importing the required packages
- pandas
- bertopic
- spacy

In [None]:
%pip install pandas


In [None]:
%pip install bertopic[spacy]

In [None]:
%pip install spacy

In [None]:
import pandas as pd
from bertopic import BERTopic

## Taking a look at the dataset

In [None]:
# Load the incident table from a CSV whose URL is pinned to a specific commit
# of the tm_ai_incidents repository, so re-runs read the same data snapshot.
df=pd.read_csv("https://raw.githubusercontent.com/ia-nechaev/tm_ai_incidents/93c38d02a1c76e58525edfdd02f15225e406cf2a/incidents.csv")

In [None]:
df.head()

In [None]:
import spacy

In [None]:
!python -m spacy download en_core_web_md

## Explicitly specify which spaCy model we want to use

In [None]:
# Load the medium English pipeline without its annotation components.
# Presumably only the vectors are needed, since the pipeline is handed to
# BERTopic as an embedding model.
nlp = spacy.load(
    'en_core_web_md',
    exclude=['tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer'],
)

In [None]:
topic_model = BERTopic(embedding_model=nlp)

In [None]:
# Use the full set of incident descriptions as the corpus (no subsampling is
# applied here).

docs=df['description']

In [None]:
# Fitting the model

topics, probs = topic_model.fit_transform(docs)

In [None]:
# Checking the results

topic_model.get_topic_info()

In [None]:
topic_model.get_topic(0)

In [None]:
topic_model.get_document_info(docs)

## Removing stop words (variant 1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Drop scikit-learn's built-in English stop words when counting terms for the
# topic representations.
vectorizer_model = CountVectorizer(stop_words="english")
# No embedding_model is passed here, so this model does not reuse the spaCy
# pipeline that was configured for the first model.
topic_model = BERTopic(vectorizer_model=vectorizer_model)

In [None]:
topics, probs = topic_model.fit_transform(docs)

In [None]:
topic_model.get_topic_info()

## Removing stop words (variant 2)

In [None]:
from bertopic.vectorizers import ClassTfidfTransformer

# reduce_frequent_words=True is BERTopic's option for dampening the weight of
# very frequent words in the c-TF-IDF topic representation.
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
# Neither vectorizer_model nor embedding_model is passed, so the stop-word
# vectorizer from variant 1 is not applied to this model.
topic_model = BERTopic(ctfidf_model=ctfidf_model)

In [None]:
topics, probs = topic_model.fit_transform(docs)

In [None]:
topic_model.get_topic_info()

## Visualization of topics

### Word cloud

In [None]:
# %pip install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
def create_wordcloud(model, topic):
    """Plot a word cloud of the top words of one topic.

    Args:
        model: Fitted topic model exposing ``get_topic(topic)``, which yields
            ``(word, weight)`` pairs for an existing topic.
        topic: Identifier of the topic to draw.

    Raises:
        ValueError: If ``model.get_topic(topic)`` returns a falsy value
            (BERTopic returns ``False`` for an unknown topic id).
    """
    word_weights = model.get_topic(topic)
    if not word_weights:
        # Without this guard an unknown topic surfaces as an opaque
        # "'bool' object is not iterable" TypeError.
        raise ValueError(f"Topic {topic!r} was not found in the model.")

    # Skip empty-string entries: they carry no text to draw.
    frequencies = {word: weight for word, weight in word_weights if word}

    cloud = WordCloud(background_color="white", max_words=1000)
    cloud.generate_from_frequencies(frequencies)
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()



In [None]:
# Show wordcloud
create_wordcloud(topic_model, topic=2)

In [None]:
# Mini Assignment: Let's make wordclouds for all topics at once

### Bar chart

In [None]:

topic_model.visualize_barchart(top_n_topics=8, n_words = 10)

In [None]:
topic_model.visualize_topics()

# Full BERTopic specification and subprocesses

In [None]:
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer

In [None]:
from transformers.pipelines import pipeline

In [None]:

# Step 1 - Embeddings: a sentence-transformers model.
# Alternative backend kept for reference:
#   pipeline("feature-extraction", model="bert-base-uncased")
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Dimensionality reduction of the embeddings with UMAP.
umap_model = UMAP(
    n_neighbors=7,
    n_components=3,
    min_dist=0.0,
    metric='cosine',
)

# Step 3 - Clustering of the reduced embeddings with HDBSCAN.
hdbscan_model = HDBSCAN(
    min_cluster_size=7,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

# Step 4 - Tokenization of the topics, dropping English stop words.
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Topic representation via class-based TF-IDF (default settings).
ctfidf_model = ClassTfidfTransformer()

# Step 6 - (Optional) Fine-tuning of the topic representations with a
# `bertopic.representation` model.
representation_model = KeyBERTInspired()


In [None]:
# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model, # Step 6 - (Optional) Fine-tune topic representations
  # NOTE(review): BERTopic documents min_topic_size as unused when a custom
  # hdbscan_model is supplied; cluster size is then governed by that model's
  # min_cluster_size. Confirm against the installed version.
  min_topic_size=7
)

In [None]:
topics, probs = topic_model.fit_transform(docs)

In [None]:
topic_model.visualize_barchart(top_n_topics=8)

In [None]:
topic_model.visualize_barchart(top_n_topics=8)

## Hierarchical Topic Modeling
 

In [None]:
hierarchical_topics = topic_model.hierarchical_topics(docs)

In [None]:
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

In [None]:
topic_model.visualize_heatmap(n_clusters=5, width=1000, height=1000)

In [None]:
# Each inner list is one group of topic ids to fuse into a single topic.
# NOTE(review): these ids are hard-coded for one particular fit; re-check
# them after re-running the model.
topics_to_merge = [
    [0, 4],
    [10, 14],
]
topic_model.merge_topics(docs, topics_to_merge)

In [None]:
df.info()

In [None]:
df['year']=pd.to_datetime(df['date']).dt.year

In [None]:
df['year']

In [None]:
len(topics)

In [None]:
topics_over_time = topic_model.topics_over_time(docs, df['year'])

In [None]:
topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=20)

In [None]:
# Visualize only several topics

## Categorical value analysis (in progress)

In [None]:
# Read the assignments back from the model rather than from `topics`: that
# list was returned by fit_transform before merge_topics ran, so it still
# holds the pre-merge topic ids, whereas the model keeps its current
# per-document assignments in `topics_`.
df['topic'] = topic_model.topics_

In [None]:
df.head()

In [None]:
# Shorten the three verbose "Alleged ..." column names in a single in-place
# rename.
df.rename(
    columns={
        'Alleged developer of AI system': 'developer',
        'Alleged harmed or nearly harmed parties': 'harmed',
        'Alleged deployer of AI system': 'deployer',
    },
    inplace=True,
)

In [None]:
df.info()

In [None]:
# Cast the topic id and the three party columns to pandas' categorical dtype.
for _col in ('topic', 'developer', 'harmed', 'deployer'):
    df[_col] = df[_col].astype('category')

In [None]:
from scipy.stats import chi2_contingency

In [None]:
df['topic']

In [None]:
# Contingency table of counts: one row per developer value, one column per
# topic id.
temp=pd.crosstab(df['developer'],df['topic'])

In [None]:
# Chi-squared test of independence on the developer-by-topic table.
# NOTE(review): with this many categories the table is likely sparse; inspect
# the expected frequencies in the result before relying on the p-value.
chi2_contingency(temp)