# Clustering themes approach

We'll try to cluster the themes and then fine-tune a separate QA model for each cluster.

## Loading dataset


In [None]:
import gdown

def download_test_data(round = 1):
    """Fetch the four CSV files of the requested test round from Google Drive.

    `round` selects which group of Drive file ids is used and must be 1 or 2.
    Each file is downloaded quietly through `gdown`.
    """
    assert round in (1, 2), "round can be 1 or 2"
    drive_ids_by_round = (
        (
            "15WPYOD3ZLShFq_NRtiBHbpz3RTvc8ZWR",
            "15yxIF27NvEa3l12yNy6F5h8lGCJ2n7rf",
            "1Ilpxyj_0T-1KzQMdVSEbSmc1ybxOv69G",
            "1nkEDQZJY6_cAEVw3JlaKCgz0C6mDSYiv",
        ),
        (
            "1-3fMldkBVsTAX3W5JewdAdlUG_agexG0",
            "1-59pQe8TH7UaORF1RSqzFWybMJShdf1U",
            "1-AbnJRRHQiTU5zyUdDC2gUwbIGkEF5l6",
            "1-Px6FFj043L7lbAEBOAMSy2bdoPiVNhy",
        ),
    )
    # Rounds are 1-based, the tuple is 0-based.
    for file_id in drive_ids_by_round[round - 1]:
        gdown.download(
            f"https://drive.google.com/u/1/uc?id={file_id}&export=download",
            quiet=True,
        )

In [None]:
# Fetch the round-2 files; the next cell reads 'input_paragraph.csv' via a relative path.
download_test_data(round=2)

In [None]:
import pandas as pd
# Load the round-2 paragraphs; the frame carries 'id', 'paragraph' and 'theme' columns.
paragraphs2 = pd.read_csv('input_paragraph.csv')
print(type(paragraphs2))
# Last expression of the cell: rendered as a preview of the first rows.
paragraphs2.head()

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,id,paragraph,theme
0,1,In The New Yorker music critic Jody Rosen desc...,Beyoncé
1,2,Beyoncé's second solo album B'Day was released...,Beyoncé
2,3,"In July 2002, Beyoncé continued her acting car...",Beyoncé
3,4,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,Beyoncé
4,5,Forbes magazine began reporting on Beyoncé's e...,Beyoncé


In [None]:
# Per-paragraph view of the data, used here only for the summary line.
# `para_list2` is rebuilt as one document per theme in a later cell.
uniq_theme_list2 = paragraphs2['theme'].unique().tolist()
para_list2 = paragraphs2['paragraph'].tolist()

print(f"We have {len(para_list2)} pargraphs in total & those belong to {len(uniq_theme_list2)} unique themes")

We have 13481 pargraphs in total & those belong to 361 unique themes


Concatenate the paragraphs of each theme into a single document, since we want to cluster themes rather than individual paragraphs.

In [None]:
# One space-joined document per theme, in the same order as `uniq_theme_list2`.
# This overwrites the per-paragraph `para_list2` built in the summary cell.
para_list2 = [
    ' '.join(paragraphs2.loc[paragraphs2['theme'] == theme, 'paragraph'])
    for theme in uniq_theme_list2
]

## Main Pipeline

### Setting up helper functions

In [None]:
def generate_embeddings(embedding_model, para_list):
  """
  Embed every document in para_list and return the result of `.numpy()`.

  Input: embedding_model : callable applied directly to the whole list; its
         return value must expose a `.numpy()` method.
         para_list : the documents to embed.
  """
  print("Generating the document embeddings...")
  return embedding_model(para_list).numpy()

def fit_bertopic_model(topic_model, para_list, para_embeddings):
  """
  Fit topic_model in place on the documents, reusing precomputed embeddings.

  Input: topic_model : object exposing `fit(documents=..., embeddings=...)`.
         para_list : the documents to fit on.
         para_embeddings : embeddings for those documents, passed through unchanged.
  Returns nothing; the fitted state lives on topic_model.
  """
  print("Fitting the model using the paras & their embeddings")
  # Pre-trained embeddings are handed over directly.
  topic_model.fit(documents=para_list, embeddings=para_embeddings)

def perform_cluster_prediction(topic_model, para_list, para_embeddings):
  """
  Predict the cluster of every document with an already fitted topic_model.

  Input: topic_model : object exposing `transform(documents, embeddings)` that
         returns a `(topics, scores)` pair.
         para_list : the documents to assign.
         para_embeddings : embeddings for those documents, passed through unchanged.
  Returns the `(topics, scores)` pair produced by `transform`.

  Any exception raised by `transform` propagates to the caller. The previous
  bare `except:` hid the real error and then failed on an unbound `topics`.
  """
  topics, scores = topic_model.transform(para_list, para_embeddings)
  return topics, scores

def get_max_cluster_id(theme_cluster_count_dict):
  """
  Input: theme_cluster_count_dict : A dictionary with cluster_id as keys and their # of occurences as the value
  Returns the cluster_id with the highest count. Ties go to the key seen first;
  -1 is returned when the dict is empty or no count is above zero.
  """
  winner, winning_count = -1, 0
  for cluster_id, occurrences in theme_cluster_count_dict.items():
    # Strict comparison keeps the earliest key on ties.
    if occurrences > winning_count:
      winner, winning_count = cluster_id, occurrences
  return winner

def get_cid_to_list_topics(mydic):
  """
  Input: mydic: Dict mapping theme --> cluster id
  Returns the inverse grouping: a dict keyed by cluster id whose value is the
  list of themes assigned to that cluster, in the order they appear in mydic.
  """
  themes_by_cluster = {}
  for theme, cid in mydic.items():
    themes_by_cluster.setdefault(cid, []).append(theme)
  return themes_by_cluster

def get_avg_themes_per_cluster(topic_model):
  """
  Input: topic_model : object whose `get_topic_info()` returns a DataFrame with
         'Topic' and 'Count' columns.
  Returns the mean of 'Count' over every row whose 'Topic' is not -1.
  """
  info = topic_model.get_topic_info()
  clustered_counts = info.loc[info['Topic'] != -1, 'Count']
  return clustered_counts.mean()

def generate_theme_to_cluster_mapping(paragraphs, topics):
  """
  Input: paragraphs : DataFrame with a 'theme' column.
         topics : sequence of cluster ids holding ONE entry per unique theme,
         ordered like `paragraphs['theme'].unique()` (i.e. predictions made on
         the per-theme concatenated documents, not on individual paragraphs).
  Returns a dict mapping every unique theme to its cluster id.

  The earlier version tallied `topics[idx]` once per row of the theme without
  ever advancing the index inside that loop, so the "most frequent" id was
  always `topics[idx]` itself. This version looks that value up directly and
  yields the same mapping.
  """
  theme_column = paragraphs['theme']
  theme_to_cluster_mapping = {} # This will hold the final theme to cluster mapping

  for idx, theme in enumerate(theme_column.unique()):
    if (theme_column == theme).any():
      theme_to_cluster_mapping[theme] = topics[idx]
    else:
      # No row compares equal to this value (e.g. a NaN theme): keep the -1
      # sentinel that an empty tally used to produce.
      theme_to_cluster_mapping[theme] = -1

  return theme_to_cluster_mapping

### Installing and importing some libraries

In [None]:
%%capture
!pip install bertopic

In [None]:
from bertopic import BERTopic
import tensorflow_hub
from umap import UMAP
import copy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import Birch

### Generate Paragraph Embeddings

In [None]:
# Generating Embeddings Once
embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
para_embeddings2 = generate_embeddings(embedding_model, para_list2) 

NameError: ignored

### Initialize BERTopic Model

In [None]:
# Components handed to BERTopic. Birch is plugged into the `hdbscan_model` slot,
# i.e. it replaces the default clustering step.
vectorizer_model = CountVectorizer(stop_words="english")
brc_model = Birch(branching_factor=50, n_clusters=None, threshold=0.4)
# UMAP is stochastic: without a fixed seed the reduced embeddings, and therefore
# the clusters and their ids, change from run to run.
umap_model = UMAP(n_neighbors=15, n_components=6, min_dist=0.0, metric='cosine', random_state=42)


topic_model = BERTopic(
                  low_memory = True,
                  umap_model = umap_model,
                  hdbscan_model = brc_model,
                  embedding_model = embedding_model,
                  vectorizer_model = vectorizer_model,
                  calculate_probabilities = True,
                  verbose = True,
                  # Target number of topics: 30% of the number of unique themes.
                  nr_topics = int(0.3*len(uniq_theme_list2))
                )

## Running the Model

In [None]:
# Fit on the per-theme documents, then read back the cluster of every theme.
fit_bertopic_model(topic_model, para_list2, para_embeddings2)
topics, scores = perform_cluster_prediction(topic_model, para_list2, para_embeddings2)
theme_to_cluster_mapping = generate_theme_to_cluster_mapping(paragraphs2, topics)
cid_to_themes_mapping = get_cid_to_list_topics(theme_to_cluster_mapping)
avg_themes_per_cluster = get_avg_themes_per_cluster(topic_model)

# Topic -1, when present, is the bucket of documents left without a cluster;
# it is reported separately and not counted as a cluster.
topic_info = topic_model.get_topic_info()
unclustered = topic_info[topic_info['Topic']==-1]['Count']
if unclustered.empty:
  unclustered = 0
  num_clusters = len(topic_info['Topic'])
else:
  # Extract the scalar explicitly: int() on a one-element Series is deprecated.
  unclustered = int(unclustered.iloc[0])
  num_clusters = len(topic_info['Topic'])-1
print('--------------------------------------------------------------------------------------------------------------------------------------------')
print("Main Dataset Results:")
print(f"#Themes unclustered = {unclustered}\nAvg. #Themes / cluster = {avg_themes_per_cluster}\n# clusters = {num_clusters}")

Fitting the model using the paras & their embeddings


2023-02-03 16:58:53,892 - BERTopic - Reduced dimensionality
2023-02-03 16:58:53,915 - BERTopic - Clustered reduced embeddings
Instructions for updating:
Use tf.identity instead.
2023-02-03 16:58:57,603 - BERTopic - Reduced number of topics from 37 to 37
2023-02-03 16:58:59,541 - BERTopic - Reduced dimensionality
2023-02-03 16:58:59,544 - BERTopic - Predicted clusters


--------------------------------------------------------------------------------------------------------------------------------------------
Main Dataset Results:
#Themes unclustered = 0
Avg. #Themes / cluster = 9.756756756756756
# clusters = 37


In [None]:
# topic_model.save('birch_dot4_thresh', save_embedding_model=False)

In [None]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,11,0_film_music_album_madonna
1,1,4,1_architecture_cubism_riba_architects
2,2,8,2_nasser_tito_torch_soviet
3,3,23,3_data_windows_dell_audio
4,4,26,4_city_new_york_area
5,5,9,5_law_court_constitution_supreme
6,6,15,6_uranium_light_copper_energy
7,7,14,7_philosophy_quran_hayek_whitehead
8,8,21,8_species_bacteria_birds_plants
9,9,5,9_education_universities_university_schools


### Visualizing the clustered paragraphs

In [None]:
# Keep a handle on the training cluster -> themes mapping: the name
# `cid_to_themes_mapping` is rebound later by the combined-model cell.
train_data_map = cid_to_themes_mapping

In [None]:
topic_model.visualize_barchart(top_n_topics=10)

In [None]:
topic_model.visualize_topics()

In [None]:
# Build the topic hierarchy from the per-theme documents and print it as a text tree.
hierarchical_topics = topic_model.hierarchical_topics(para_list2)
tree = topic_model.get_topic_tree(hierarchical_topics)
print(tree)

100%|██████████| 36/36 [00:00<00:00, 113.74it/s]

.
├─city_war_new_state_states
│    ├─greek_language_roman_empire_century
│    │    ├─israel_jews_jewish_apollo_israeli
│    │    │    ├─■──israel_israeli_jewish_jerusalem_arab ── Topic: 28
│    │    │    └─■──jews_jewish_apollo_ashkenazi_jesus ── Topic: 14
│    │    └─greek_language_empire_roman_ottoman
│    │         ├─greek_empire_ottoman_greece_roman
│    │         │    ├─■──iran_cyprus_armenian_turkish_greece ── Topic: 36
│    │         │    └─■──greek_roman_ottoman_empire_rome ── Topic: 11
│    │         └─■──language_languages_dutch_slavic_comics ── Topic: 12
│    └─city_new_war_state_states
│         ├─city_new_population_area_state
│         │    ├─race_african_black_racial_americans
│         │    │    ├─■──race_black_racial_african_indigenous ── Topic: 25
│         │    │    └─■──tennessee_florida_state_spanish_states ── Topic: 35
│         │    └─city_new_area_island_population
│         │         ├─city_new_area_island_population
│         │         │    ├─■──island_islands




# Predicting on new themes

In [None]:
# Only for loading a saved model from a particular run, else ignore.
# "cluster_model" is the name written by `topic_model_c.save(...)` near the end
# of this notebook, so on a fresh checkout that cell must have been run first.
# The prediction cells below use `old_model`.
old_model = BERTopic.load("cluster_model")

In [None]:
topic_info = old_model.get_topic_info()
topic_info

Unnamed: 0,Topic,Count,Name
0,0,38,0_data_apple_software_windows
1,1,14,1_chinese_china_dynasty_han
2,2,17,2_air_aircraft_forces_war
3,3,16,3_church_mary_pope_jesus
4,4,23,4_gaddafi_country_africa_children
5,5,19,5_theory_god_philosophy_enlightenment
6,6,9,6_london_southampton_beer_plymouth
7,7,18,7_language_languages_characters_dialects
8,8,24,8_used_antenna_light_copper
9,9,20,9_music_album_film_madonna


## Preparing Test Data

In [None]:
# download_test_data(round=1)

In [None]:
import pandas as pd
from pathlib import Path

# The round-1 download cell above is commented out, so on a fresh run this file
# is missing and read_csv fails with FileNotFoundError. Fetch it on demand.
# NOTE(review): assumes 'paragraphs.csv' is one of the round-1 files - confirm.
if not Path('paragraphs.csv').exists():
  download_test_data(round=1)

paragraphs1 = pd.read_csv('paragraphs.csv')
print(type(paragraphs1))
paragraphs1.head()

FileNotFoundError: ignored

In [None]:
# Per-paragraph view of the round-1 data, used here only for the summary line.
# `para_list1` is rebuilt as one document per theme in the next code cell.
uniq_theme_list1 = paragraphs1['theme'].unique().tolist()
para_list1 = paragraphs1['paragraph'].tolist()

print(f"We have {len(para_list1)} pargraphs in total & those belong to {len(uniq_theme_list1)} unique themes")

We have 1179 pargraphs in total & those belong to 30 unique themes


### Concatenate paragraphs

In [None]:
# One space-joined document per theme, in the same order as `uniq_theme_list1`.
para_list1 = [
    ' '.join(paragraphs1.loc[paragraphs1['theme'] == theme, 'paragraph'])
    for theme in uniq_theme_list1
]

## Prediction Setup

### Generate Embeddings

In [None]:
# embedding_model = tensorflow_hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
para_embeddings1 = generate_embeddings(embedding_model, para_list1) 

Generating the document embeddings...


### Make Predictions

In [None]:
# Assign each new per-theme document to a cluster of the loaded model, using the
# precomputed embeddings. Note that this rebinds the global `scores`.
preds,scores = old_model.transform(documents=para_list1,embeddings=para_embeddings1)

In [None]:
mapp = generate_theme_to_cluster_mapping(paragraphs1,preds)

In [None]:
cid_to_topic = get_cid_to_list_topics(mapp)

### Cluster Assignment of New Topics

In [None]:
new_map = cid_to_topic

In [None]:
new_map

{0: ['IPod', 'Wayback_Machine', 'Web_browser', 'DevRev'],
 1: ['2008_Sichuan_earthquake', 'Nanjing'],
 2: ['Canadian_Armed_Forces'],
 3: ['Cardinal_(Catholicism)', 'Heresy', 'Mary_(mother_of_Jesus)'],
 4: ['Human_Development_Index'],
 31: ['Warsaw_Pact'],
 5: ['Materialism'],
 6: ['Pub', 'Southampton'],
 7: ['Catalan_language', 'Dialect'],
 8: ['Paper'],
 9: ['Adult_contemporary_music', 'Hard_rock'],
 10: ['The_Times', 'United_States_dollar'],
 11: ['Immunology'],
 12: ['Imamah_(Shia_doctrine)'],
 39: ['Grape'],
 14: ['Everton_F.C.'],
 15: ['Great_Plains'],
 13: ['Biodiversity'],
 16: ['Federal_Bureau_of_Investigation'],
 34: ['Unknown']}

#### Displaying Cluster Assignments with the training map

In [None]:
# `new_map` ids come from `old_model`, while `train_data_map` was built from
# `topic_model`. An id predicted by the loaded model may therefore be missing
# from the training map, so look it up defensively instead of raising KeyError.
for cluster in new_map:
  print(f"New Topics: {new_map[cluster]}")
  print(f"Assigned to: {train_data_map.get(cluster, [])}")

New Topics: ['IPod', 'Web_browser', 'Antenna_(radio)']
Assigned to: ['The_Legend_of_Zelda:_Twilight_Princess', 'High-definition_television', 'Computer', 'MP3', 'Computer_security', 'Videoconferencing', 'Xbox_360', 'ASCII', 'Macintosh', 'Dell', 'Nintendo_Entertainment_System', 'Film_speed', 'Data_compression', 'Digimon', 'Gramophone_record', 'USB', 'PlayStation_3', 'LaserDisc', 'Software_testing', 'Compact_disc', 'Database', 'Windows_8', 'Super_Nintendo_Entertainment_System']
New Topics: ['2008_Sichuan_earthquake', 'Muammar_Gaddafi', '1973_oil_crisis', 'Islamism']
Assigned to: ['2008_Summer_Olympics_torch_relay', 'Russian_Soviet_Federative_Socialist_Republic', 'Josip_Broz_Tito', 'Myanmar', 'Dissolution_of_the_Soviet_Union', 'Gamal_Abdel_Nasser', 'Estonia', 'Tajikistan']
New Topics: ['Wayback_Machine', 'Comcast', 'Packet_switching']
Assigned to: ['BBC_Television', 'Internet_service_provider', 'BeiDou_Navigation_Satellite_System', 'Communications_in_Somalia', 'General_Electric', 'Copyrigh

### Merge Mappings

In [None]:
# For every training cluster that received at least one new theme, append the
# new themes to the training themes. Clusters without new themes are not included.
merge_map = {
    cid: train_themes + new_map.get(cid)
    for cid, train_themes in train_data_map.items()
    if new_map.get(cid)
}
# New-theme clusters whose id does not exist in the training mapping.
left_over = {cid: themes for cid, themes in new_map.items() if cid not in train_data_map}
merge_map

{0: ['Beyoncé',
  'Spectre_(2015_film)',
  'Kanye_West',
  'American_Idol',
  'Sony_Music_Entertainment',
  'Universal_Studios',
  'House_music',
  'Queen_(band)',
  'Madonna_(entertainer)',
  'Turner_Classic_Movies',
  'Steven_Spielberg',
  'Adult_contemporary_music',
  'Hard_rock',
  'Marvel_Comics',
  'Sky_(United_Kingdom)'],
 32: ['Frédéric_Chopin',
  'Classical_music',
  'A_cappella',
  'Mandolin',
  'Post-punk'],
 30: ['Sino-Tibetan_relations_during_the_Ming_dynasty',
  'East_India_Company',
  'British_Empire',
  'Samurai',
  'Modern_history',
  'Great_power',
  'Qing_dynasty',
  'Heian_period',
  'Han_dynasty',
  'Kievan_Rus%27',
  'Normans',
  'Yuan_dynasty'],
 3: ['The_Legend_of_Zelda:_Twilight_Princess',
  'High-definition_television',
  'Computer',
  'MP3',
  'Computer_security',
  'Videoconferencing',
  'Xbox_360',
  'ASCII',
  'Macintosh',
  'Dell',
  'Nintendo_Entertainment_System',
  'Film_speed',
  'Data_compression',
  'Digimon',
  'Gramophone_record',
  'USB',
  'Play

In [None]:
left_over

{}

## Comparing the Merge Map with a Combined Map to check that we aren't forcing themes into clusters

In [None]:
# Build the combined theme list and per-theme documents from the combined frame
# itself. `generate_theme_to_cluster_mapping` pairs topics with
# `paragraphs_c['theme'].unique()`, so the documents must follow that order.
# Plain list concatenation would fall out of step with it if a theme appeared
# in both rounds.
paragraphs_c = pd.concat((paragraphs1,paragraphs2))
uniq_theme_list_c = list(paragraphs_c['theme'].unique())
combined_paras = [
    ' '.join(paragraphs_c.loc[paragraphs_c['theme'] == theme, 'paragraph'])
    for theme in uniq_theme_list_c
]
para_embeddings_c = generate_embeddings(embedding_model, combined_paras)

Generating the document embeddings...


In [None]:
# Same configuration as the main model (it reuses the `umap_model`, `brc_model`
# and `vectorizer_model` objects created earlier), sized for the combined themes.
topic_model_c = BERTopic(
    low_memory=True,
    umap_model=umap_model,
    hdbscan_model=brc_model,
    embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,
    verbose=True,
    nr_topics=int(0.3 * len(uniq_theme_list_c)),
)

In [None]:
# Fit on the combined per-theme documents, then read back each theme's cluster.
# This rebinds `topics`, `scores`, `theme_to_cluster_mapping`,
# `cid_to_themes_mapping`, `avg_themes_per_cluster` and `topic_info`.
fit_bertopic_model(topic_model_c, combined_paras,para_embeddings_c)
topics, scores = perform_cluster_prediction(topic_model_c, combined_paras, para_embeddings_c)
theme_to_cluster_mapping = generate_theme_to_cluster_mapping(paragraphs_c, topics)
cid_to_themes_mapping = get_cid_to_list_topics(theme_to_cluster_mapping)
avg_themes_per_cluster = get_avg_themes_per_cluster(topic_model_c)

# Topic -1, when present, is the bucket of documents left without a cluster;
# it is reported separately and not counted as a cluster.
topic_info = topic_model_c.get_topic_info()
unclustered = topic_info[topic_info['Topic']==-1]['Count']
if unclustered.empty:
  unclustered = 0
  num_clusters = len(topic_info['Topic'])
else:
  # Extract the scalar explicitly: int() on a one-element Series is deprecated.
  unclustered = int(unclustered.iloc[0])
  num_clusters = len(topic_info['Topic'])-1
print('--------------------------------------------------------------------------------------------------------------------------------------------')
print("Merged Dataset Results:")
print(f"#Themes unclustered = {unclustered}\nAvg. #Themes / cluster = {avg_themes_per_cluster}\n# clusters = {num_clusters}")

Fitting the model using the paras & their embeddings


2023-02-03 16:59:35,146 - BERTopic - Reduced dimensionality
2023-02-03 16:59:35,166 - BERTopic - Clustered reduced embeddings
2023-02-03 16:59:39,935 - BERTopic - Reduced number of topics from 43 to 43
2023-02-03 16:59:42,349 - BERTopic - Reduced dimensionality
2023-02-03 16:59:42,352 - BERTopic - Predicted clusters


--------------------------------------------------------------------------------------------------------------------------------------------
Merged Dataset Results:
#Themes unclustered = 0
Avg. #Themes / cluster = 11.093023255813954
# clusters = 43


In [None]:
combined_map = cid_to_themes_mapping

In [None]:
combined_map

{0: ['IPod',
  'Wayback_Machine',
  'Web_browser',
  'Comcast',
  'Sky_(United_Kingdom)',
  'Packet_switching',
  'The_Legend_of_Zelda:_Twilight_Princess',
  'BBC_Television',
  'Internet_service_provider',
  'BeiDou_Navigation_Satellite_System',
  'Communications_in_Somalia',
  'High-definition_television',
  'Computer',
  'MP3',
  'Computer_security',
  'Videoconferencing',
  'Xbox_360',
  'ASCII',
  'Macintosh',
  'Dell',
  'Nintendo_Entertainment_System',
  'Copyright_infringement',
  'Film_speed',
  'Intellectual_property',
  'Data_compression',
  'Digimon',
  'Gramophone_record',
  'USB',
  'PlayStation_3',
  'LaserDisc',
  'Software_testing',
  'CBC_Television',
  'IBM',
  'Compact_disc',
  'Database',
  'Windows_8',
  'Super_Nintendo_Entertainment_System',
  'YouTube'],
 1: ['2008_Sichuan_earthquake',
  'Nanjing',
  'Heian_period',
  'Tibet',
  'Han_dynasty',
  'Yuan_dynasty',
  'Sino-Tibetan_relations_during_the_Ming_dynasty',
  '2008_Summer_Olympics_torch_relay',
  'Zhejiang'

In [None]:
# Persist the combined model without its embedding model. 'cluster_model' is the
# name that the `BERTopic.load("cluster_model")` cell earlier in the notebook reads.
topic_model_c.save('cluster_model', save_embedding_model=False)

In [None]:
topic_model_c.get_topic_info()

Unnamed: 0,Topic,Count,Name
0,0,38,0_data_apple_software_windows
1,1,14,1_chinese_china_dynasty_han
2,2,17,2_air_aircraft_forces_war
3,3,16,3_church_mary_pope_jesus
4,4,23,4_gaddafi_country_africa_children
5,5,19,5_theory_god_philosophy_enlightenment
6,6,9,6_london_southampton_beer_plymouth
7,7,18,7_language_languages_characters_dialects
8,8,24,8_used_antenna_light_copper
9,9,20,9_music_album_film_madonna
