<a href="https://colab.research.google.com/github/izk8/2022_siop_fri_seminar/blob/main/bert_example/22_04_SIOP_fri_sem_messy_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# load relivant packages
!pip install bertopic
!pip install -U sentence-transformers
!pip install ipython-autotime

In [None]:
# bring in relivant models
# print(pd.__version__)
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import pickle
from bertopic import BERTopic

# explain models and model choice
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [4]:
# read in job_dict from pkl file
job_dict = pd.read_pickle(r'/content/job_dict.pkl')
job_dict.keys()

dict_keys(['df', 'category_names', 'all_cat_indicies', 'labels_to_add', 'indices_to_add', 'y'])

In [5]:
job_dict["df"]

Unnamed: 0,job_title,category,job_description,cat_indicies
0,Member Service Specialist,Banking-or-loans,Role: To assist members and potential members ...,0
1,Floating Multi Service Banker,Banking-or-loans,JOB FUNCTION / SUMMARY: A Multi-Service Banker...,0
2,Sales Associate Blue Sky #711,Retail,"Job Summary At Blue Sky, we recognize the need...",1
3,Store Manager,Retail,JOB DESCRIPTION Position: Store Manager Report...,1
4,Planning Technician,Real-Estate,ALL APPLICATIONS MUST BE SUBMITTED ONLINE AT h...,2
...,...,...,...,...
9995,Nurse Practitioner - Dermatology (Mon-Fri) OUT...,Healthcare,Position Summary To perform an expanded clinic...,9
9996,Substance Abuse Treatment Case Manager,Healthcare,The Case Manager is responsible for complete c...,9
9997,Supply Technician,Healthcare,Job Title: Property Accountability & Materials...,9
9998,Temporary PRN Nurse Practitioner/Physician Ass...,Healthcare,The Mary S. Shook Student Health Service exist...,9


# Tutorial 1: Embedding Job Titles


In [6]:
# pull out the job titles
job_titles = job_dict["df"]["job_title"]
job_titles

0                               Member Service Specialist
1                           Floating Multi Service Banker
2                           Sales Associate Blue Sky #711
3                                           Store Manager
4                                     Planning Technician
                              ...                        
9995    Nurse Practitioner - Dermatology (Mon-Fri) OUT...
9996               Substance Abuse Treatment Case Manager
9997                                    Supply Technician
9998    Temporary PRN Nurse Practitioner/Physician Ass...
9999           Clinical Social Worker Associate (35 Hour)
Name: job_title, Length: 10000, dtype: object

In [7]:
# get embeddings 
# make embeddings of titles
title_emb = model.encode(job_titles, show_progress_bar = True, convert_to_tensor=True)
# change runtime to gpu... will be very fast... 

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [8]:
# what do these embeddings look like 
title_emb.shape

torch.Size([10000, 384])

In [9]:
title_emb

tensor([[-0.4357, -0.4224, -0.2424,  ..., -0.3637,  0.1671,  0.4540],
        [-0.3686, -0.6254, -0.7172,  ..., -0.4637,  0.3951, -0.2661],
        [-0.2146, -0.4029,  0.1744,  ..., -0.7837, -0.7946,  0.3658],
        ...,
        [-0.9767, -0.3604, -0.1257,  ..., -0.7109,  0.7241,  0.0203],
        [-0.5571, -0.1584,  0.0423,  ..., -0.0864, -0.2682,  0.5536],
        [ 0.0519, -0.0374,  0.4928,  ..., -0.3209, -0.0911,  0.0488]],
       device='cuda:0')

In [None]:
# embedding for Member Service Specialist
title_emb[0]

In [None]:
# now how do we interact with the embeddings?
# let's try a random string of text
input_job_title = "Jr. Java Script Developer"
input_embedding = model.encode(input_job_title, convert_to_tensor=True)
input_embedding

In [None]:
# find the most similar match
results = util.semantic_search(input_embedding, title_emb, top_k = 10)
results
# unfortunatly it is in a pretty bad format

In [13]:
#results
#results[0][0] # first one is first seach, second one is the first dictionary... 
results[0][0]['corpus_id'] #gets index

8126

In [None]:
print(job_titles[results[0][0]['corpus_id']]) 

In [14]:
for i in range(0, len(results[0])):
  print(job_titles[results[0][i]['corpus_id']]) 

Sr. Java Developer
Java Developer
Java Developer
Java Developer
Java-Developer
Junior Java Developer
Application Programmer/Developer (Java)
Sr. Java Developer/Application Systems Engineer 5
Sr. Java Software Engineer
Java Developer (Full-Stack)


In [15]:
# little function to streamline query 

def job_search(input_text, how_many_results = 2):
    # type in string and will return top matches
    input_embedding = model.encode(input_text, convert_to_tensor=True)
    results = util.semantic_search(input_embedding, title_emb, top_k = how_many_results)
    
    for i in range(0, len(results[0])):
        print(job_titles[results[0][i]['corpus_id']])
        #print(job_dict["df"]["job_description"][results[0][i]['corpus_id']])


In [16]:
job_search("data scientist", 10)

Data Scientist
Data Scientist
Data Scientist Intern
Data Analyst / Data Science
Research Scientist
Senior Data Scientist - Nationwide Opportunities
Data Science intern
Scientist
Data Scientist (All Levels) - Charlotte
Data Engineer


In [17]:
job_search("Sales Associate")

Sales Associate
Sales Associate


In [18]:
job_search("Fry a chicken", 10)

Cook
Cook
Cook
Cook
Cook
Cook
Cook
Cook
Cook
Cook


# Tutorial 2: Clustering Job Descriptions

In [None]:
job_des= job_dict["df"]["job_description"]
job_des

0       Role: To assist members and potential members ...
1       JOB FUNCTION / SUMMARY: A Multi-Service Banker...
2       Job Summary At Blue Sky, we recognize the need...
3       JOB DESCRIPTION Position: Store Manager Report...
4       ALL APPLICATIONS MUST BE SUBMITTED ONLINE AT h...
                              ...                        
9995    Position Summary To perform an expanded clinic...
9996    The Case Manager is responsible for complete c...
9997    Job Title: Property Accountability & Materials...
9998    The Mary S. Shook Student Health Service exist...
9999    Our mission at the State of Connecticut, Depar...
Name: job_description, Length: 10000, dtype: object

In [None]:
job_des[123]

In [None]:
topic_model = BERTopic(language="english", calculate_probabilities=True, verbose=True, embedding_model="all-MiniLM-L6-v2")

In [None]:
topics, probs = topic_model.fit_transform(job_des)

In [None]:
freq = topic_model.get_topic_info(); freq.head(1000)

Unnamed: 0,Topic,Count,Name
0,-1,3406,-1_and_the_of_to
1,0,405,0_food_restaurant_kitchen_and
2,1,344,1_store_sales_retail_customer
3,2,307,2_students_teacher_classroom_children
4,3,222,3_accounting_financial_tax_finance
...,...,...,...
117,116,11,116_marine_vessel_ocean_enforcement
118,117,11,117_hotel_housekeeping_guest_room
119,118,11,118_signature_healthcare_nursing_we
120,119,10,119_dynamics_systems_engineering_mission


In [None]:
representative_docs = topic_model.get_representative_docs(0)
representative_docs

['Description Prepares food, including hot entrees, to specifications in an accurate and timely manner to provide guests with a quality product. Provide general supervision to kitchen staff in order to ensure food quality, presentation and service standards and are met and surpassed. ESSENTIAL FUNCTIONS Deliver internal and external guest service to Horseshoe standards. Reads food orders from computer printouts, and prepares food items according to established menu plans. Observe and test foods to determine if they have been cooked sufficiently, using methods such as tasting, smelling or piercing them with utensils. Observes and enforces Health Department regulations regarding food handling, storage, proper steam table temperature, and cleanliness of work environment. Provide general supervision to kitchen staff ensuring work procedures, quality standards and menu specifications are adhered to. Maintain control of the kitchen to ensure staffing levels are appropriate based on business 

In [None]:
topic_model.get_topic(0)  # Select the most frequent topic

[('food', 0.02419104267419103),
 ('restaurant', 0.016909686696236617),
 ('kitchen', 0.010882073840595056),
 ('and', 0.00796544000117073),
 ('service', 0.007946685923120738),
 ('guests', 0.007921017620736333),
 ('to', 0.007753205501175767),
 ('all', 0.007657978637220115),
 ('guest', 0.007347896682127284),
 ('the', 0.00694640045149775)]

In [None]:
topic_model.visualize_topics()

In [None]:
probs

array([[0.00318718, 0.00524091, 0.00372959, ..., 0.00249554, 0.0051894 ,
        0.00295321],
       [0.00231494, 0.00362065, 0.0025478 , ..., 0.00172549, 0.00369467,
        0.00204687],
       [0.00853951, 0.25222627, 0.00352483, ..., 0.00228039, 0.00593912,
        0.00295591],
       ...,
       [0.00232895, 0.00411251, 0.00324661, ..., 0.00191937, 0.07227041,
        0.00257163],
       [0.00178935, 0.00221293, 0.00403804, ..., 0.00953526, 0.00270842,
        0.03532724],
       [0.00216373, 0.0024628 , 0.00776175, ..., 0.00815496, 0.00310709,
        0.00835107]])

In [None]:
topic_model.visualize_hierarchy(top_n_topics=1000)

In [None]:
topic_model.visualize_barchart(top_n_topics=20, n_words = 15)

In [None]:
new_topics, new_probs = topic_model.reduce_topics(job_des, topics, probs, nr_topics=27)

2022-04-19 18:05:45,843 - BERTopic - Reduced number of topics from 122 to 28


In [None]:
freq = topic_model.get_topic_info(); freq.head(1000)

Unnamed: 0,Topic,Count,Name
0,-1,4680,-1_and_the_to_of
1,0,602,0_food_and_to_the
2,1,423,1_care_nursing_of_and
3,2,417,2_store_and_to_the
4,3,364,3_students_and_the_of
5,4,284,4_and_data_to_the
6,5,258,5_property_and_to_the
7,6,222,6_accounting_and_financial_of
8,7,175,7_and_of_the_to
9,8,174,8_office_medical_and_to


In [None]:
representative_docs = topic_model.get_representative_docs(0)
representative_docs

In [None]:
topic_model.get_topic(0)

[('food', 0.030835972762991888),
 ('and', 0.029267417083478284),
 ('to', 0.02536480406338696),
 ('the', 0.022943141394783796),
 ('of', 0.019165383124380254),
 ('in', 0.017827519793519618),
 ('restaurant', 0.016543311458843645),
 ('all', 0.016265196730576942),
 ('for', 0.016159675923085343),
 ('with', 0.015432485284687141)]

In [None]:
topic_model.visualize_heatmap( width=1000, height=1000)

In [None]:
topic_model.visualize_barchart(top_n_topics=20, n_words = 15)

In [None]:
similar_topics, similarity = topic_model.find_topics("Code in python or R", top_n=5); similar_topics

[14, 8, 13, 20, 15]

In [None]:
representative_docs = topic_model.get_representative_docs(14)
representative_docs

['Does the thought of working with a company that is at the cutting-edge of developing next-generation SaaS technology excite you? Do you enjoy working in a fast-paced environment with a creative and passionate team where your input will have an impact? We a seeking a resourceful and creative full stack asp.net developer with at least 2 years of experience to join our Long Island development team. If you answered yes to these questions, you might be that person. Primary functions: Create cutting edge web and mobile applications Rapidly developing new features, without compromise, while maintaining and improving existing products Reviewing code of teammates to facilitate collaboration and optimum output Testing, maintenance, and troubleshooting of development projects Required skills and attributes: 2+ years of experience as a software engineer building web applications Proficient in C#, SQL, JavaScript, CSS3, and HTML5 Experience working with RESTful WebServices and Bootstrap is a plus

# Semi-supervised topic modeling with labels
Steer the dimensionality reduction of the embeddings into a space that closely follows any labels you might already have. In other words, we use a semi-supervised UMAP instance to reduce the dimensionality of embeddings before clustering the documents with HDBSCAN.


In [None]:
# what lables we will add 
# important to note the labels are inclusive, meaning all the ones listed have labels 
job_dict["labels_to_add"]

['Sales',
 'Restaurant-or-food-Service',
 'Computer-or-internet',
 'Upper-Management-or-consulting']

In [None]:
# specifically each one is turned into a number 
job_dict["y"]

In [None]:
# so simply train a new model with y present 
topic_model = BERTopic(verbose=True).fit(job_des, y=job_dict["y"])


Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2022-04-19 18:13:18,238 - BERTopic - Transformed documents to Embeddings
2022-04-19 18:13:45,180 - BERTopic - Reduced dimensionality with UMAP
2022-04-19 18:13:45,672 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
freq = topic_model.get_topic_info(); freq.head(1000)

Unnamed: 0,Topic,Count,Name
0,-1,3973,-1_and_the_of_to
1,0,263,0_food_restaurant_kitchen_cook
2,1,230,1_store_sales_retail_merchandise
3,2,211,2_accounting_financial_tax_accounts
4,3,179,3_property_leasing_residents_community
...,...,...,...
120,119,11,119_pricing_xactly_business_price
121,120,11,120_nursing_care_rn_assures
122,121,10,121_server_system_network_systems
123,122,10,122_chemistrybiochemistry_analytical_samples_lab


In [None]:
topic_model.visualize_barchart(top_n_topics=20, n_words = 15)

In [None]:
new_topics, new_probs = topic_model.reduce_topics(job_des, topics, probs, nr_topics=27)

2022-04-19 18:14:26,101 - BERTopic - Reduced number of topics from 125 to 28


In [None]:
freq = topic_model.get_topic_info(); freq.head(1000)

Unnamed: 0,Topic,Count,Name
0,-1,4751,-1_and_to_the_of
1,0,535,0_and_food_to_the
2,1,441,1_store_and_to_the
3,2,321,2_students_and_the_of
4,3,278,3_accounting_and_financial_of
5,4,252,4_the_you_of_to
6,5,220,5_nursing_care_nurse_of
7,6,213,6_and_the_to_hr
8,7,207,7_and_to_of_patient
9,8,204,8_care_and_to_home


In [None]:
topic_model.visualize_barchart(top_n_topics=20, n_words = 15)

# Semi-supervised topic modeling with labels
Listing topic representations. By defining the topics BERTopic is more likely to model the defined seeded topics. However, BERTopic is merely nudged towards creating those topics. In practice, if the seeded topics do not exist or might be divided into smaller topics, then they will not be modeled

In [None]:
# create a list of each of the catigories
def extractDigits(lst):
    return [[el] for el in lst]
                  
# Driver code
seed_list =job_dict["category_names"]
print(extractDigits(seed_list))

[['Banking-or-loans'], ['Retail'], ['Real-Estate'], ['Law-Enforcement-or-security'], ['Administrative'], ['Arts-or-entertainment-or-publishing'], ['Customer-Service'], ['Computer-or-internet'], ['Human-Resources'], ['Healthcare'], ['Sales'], ['Insurance'], ['Manufacturing-or-mechanical'], ['Construction-or-facilities'], ['Hospitality-or-travel'], ['Restaurant-or-food-Service'], ['Transportation-or-logistics'], ['Education-or-training'], ['Telecommunications'], ['Legal'], ['Accounting-or-finance'], ['Non-profit/volunteering'], ['Engineering-or-architecture'], ['Pharmaceutical/bio-tech'], ['Upper-Management-or-consulting'], ['Marketing-or-advertising-or-pr'], ['Government-or-military']]


In [None]:
topic_model = BERTopic(seed_topic_list=seed_list)
topics, probs = topic_model.fit_transform(job_des)


2022-04-19 18:36:56,796 - BERTopic - Transformed documents to Embeddings
2022-04-19 18:37:21,028 - BERTopic - Reduced dimensionality with UMAP
2022-04-19 18:37:21,545 - BERTopic - Clustered UMAP embeddings with HDBSCAN


In [None]:
freq = topic_model.get_topic_info(); freq.head(1000)

Unnamed: 0,Topic,Count,Name
0,-1,3775,-1_and_the_of_to
1,0,497,0_store_sales_customer_merchandise
2,1,407,1_food_restaurant_kitchen_guests
3,2,295,2_accounting_financial_tax_finance
4,3,192,3_network_support_hardware_technical
...,...,...,...
123,122,11,122_learning_development_training_pearson
124,123,10,123_correctional_be_inmates_your
125,124,10,124_equipment_tree_maintenance_ranch
126,125,10,125_mercy_care_barnesjewish_were


In [None]:
topic_model.visualize_barchart(top_n_topics=20, n_words = 15)