# Enable & Disable GPU (Optional) 
### "0" : Enable 
### "-1" : Disable

In [1]:
#Disable GPU "-1", Enable GPU "0"
#import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
#import tensorflow as tf
#print(tf.__version__)
#print(len(tf.config.list_physical_devices('GPU'))>0)

# Import Libraries

In [2]:
import pandas as pd
# Allows to view truncated descriptions. 
pd.set_option('display.min_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

import numpy as np

from nltk.corpus import stopwords
import nltk
import re
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from keras.preprocessing.text import Tokenizer
import gensim
from keras_preprocessing.sequence import pad_sequences

[nltk_data] Downloading package punkt to C:\Users\MASTER
[nltk_data]     ILYAS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\MASTER
[nltk_data]     ILYAS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Load & Analyse Datasets

In [3]:
import os
data_path = ["Job Titles"]

In [4]:
filepath = os.sep.join(data_path + ['jd_source_b.csv']) #Import Data from database.
documents = pd.read_csv(filepath, header=0) #Panda read imported excel data and delete first row to find column of interests.

#View First 21 Rows.
documents.head(21)

Unnamed: 0,url,job_title,company,job_desc,company_desc
0,https://www.jobstreet.com.sg/en/job/project-en...,Project Engineer / Coordinator (Kaki Bukit / U...,Ideals Recruitment Pte Ltd\n\n \t\t\t\t\t\t...,• Office hour\n• Immediate interview\n• Electr...,Ideals Recruitment\n is a new and dynamic recr...
1,https://www.jobstreet.com.sg/en/job/customer-s...,Customer Service Executive (East / UP$4000 / 3...,Ideals Recruitment Pte Ltd\n\n \t\t\t\t\t\t...,• Japan MNC Company\n• $3500 - $4000\n• Third ...,Ideals Recruitment\n is a new and dynamic recr...
2,https://www.jobstreet.com.sg/en/job/sap-senior...,SAP Senior FICO Consultant,Washington Frank International (A divison of F...,SAP Senior FICO Consultant\nMy client is an es...,
3,https://www.jobstreet.com.sg/en/job/field-appl...,Field Application Engineer,LITE-ON SINGAPORE PTE LTD,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...,Open your world to new opportunities and be pa...
4,https://www.jobstreet.com.sg/en/job/personal-b...,Personal Banker (Basic up to 3.2K/Bonus/No exp...,ScienTec Personnel \t\t\t\t\t\t\t \n\t\...,"New branch opening, expanding fast. Achieveabl...",ScienTec Personnel\n focusses on professional/...
5,https://www.jobstreet.com.sg/en/job/admin-exec...,Admin Executive (Logistics/Purchasing) (Up to ...,FSK Advisory Pte Ltd\n\n \t\t\t\t\t\t\t ...,Responsibilities:\nUpdating of data into the s...,FSK Advisory is a professional and dynamic exe...
6,https://www.jobstreet.com.sg/en/job/merchant-s...,merchant sales officer,integrated loyalty solutions pte ltd,Candidate must possess at least Higher seconda...,Integrated Loyalty Solutions Pte Ltd (www.ils-...
7,https://www.jobstreet.com.sg/en/job/beauty-adv...,"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",FSK Advisory Pte Ltd\n\n \t\t\t\t\t\t\t ...,Responsibilities:\nProvide professional beauty...,FSK Advisory is a professional and dynamic exe...
8,https://www.jobstreet.com.sg/en/job/bus-captai...,Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,MCi CAREER SERVICES PTE LTD\n\n \t\t\t\t\t\...,"Benefits Summary:\n• Bus driver, Full-time Per...",MCI Group of Companies provides a one stop tot...
9,https://www.jobstreet.com.sg/en/job/sap-cutove...,SAP Cutover Specialist,Washington Frank International (A divison of F...,SAP Cutover Specialist \nMy client is an estab...,


In [5]:
#Original Datasets

def show_info(data):
    print('DATASET SHAPE: ', data.shape, '\n')
    print('-'*50)
    print('FEATURE DATA TYPES:')
    print(data.info())
    print('\n', '-'*50)
    print('NUMBER OF UNIQUE VALUES PER FEATURE:', '\n')
    print(data.nunique())
    print('\n', '-'*50)
    print('NULL VALUES PER FEATURE')
    print(data.isnull().sum())

show_info(documents)

DATASET SHAPE:  (15001, 5) 

--------------------------------------------------
FEATURE DATA TYPES:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15001 entries, 0 to 15000
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           15001 non-null  object
 1   job_title     15001 non-null  object
 2   company       15001 non-null  object
 3   job_desc      15001 non-null  object
 4   company_desc  14771 non-null  object
dtypes: object(5)
memory usage: 586.1+ KB
None

 --------------------------------------------------
NUMBER OF UNIQUE VALUES PER FEATURE: 

url             15001
job_title        8970
company          2675
job_desc        10039
company_desc     3064
dtype: int64

 --------------------------------------------------
NULL VALUES PER FEATURE
url               0
job_title         0
company           0
job_desc          0
company_desc    230
dtype: int64


In [6]:
#Display summary statistics for dataframe's numeric columns.
#I chose to display extra features as it helps to detect outliers.

documents.describe([0.10,0.25,0.50,0.75,0.90,0.95,0.99]).T



Unnamed: 0,count,unique,top,freq
url,15001,15001,https://www.jobstreet.com.sg/en/job/project-en...,1
job_title,15001,8970,Accounts Executive,60
company,15001,2675,Capita Pte Ltd – Temp & Contract \t\t\t\t\t...,512
job_desc,15001,10039,Job Description:\nOperate car park enforcement...,22
company_desc,14771,3064,"Founded in 2007 in Singapore, Capita Pte Ltd i...",1081


#### Out of the the 5 columns, only company_desc has missing values. This is due to the fact that there are no values in it. 

#### I will want to drop this column as it will not be helpful in training the data.

# Drop & Standardized Columns

In [7]:
df = pd.DataFrame(documents)
cols = [2,1,3] # Will only select 'company, job_title & job_description'. 2,1,3 is the column arrangement.
df = df[df.columns[cols]]

df = df.rename(columns={'job_title': 'Job Title', 'company': 'Company', 'job_desc': 'Job Description'}) #This will format the header

df.to_csv('JD_Source_B_060722.csv', index=False) #Rename accordingly and save into a new csv for editing
df.head(50)

Unnamed: 0,Company,Job Title,Job Description
0,Ideals Recruitment Pte Ltd\n\n \t\t\t\t\t\t...,Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...
1,Ideals Recruitment Pte Ltd\n\n \t\t\t\t\t\t...,Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...
2,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...
3,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...
4,ScienTec Personnel \t\t\t\t\t\t\t \n\t\...,Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl..."
5,FSK Advisory Pte Ltd\n\n \t\t\t\t\t\t\t ...,Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...
6,integrated loyalty solutions pte ltd,merchant sales officer,Candidate must possess at least Higher seconda...
7,FSK Advisory Pte Ltd\n\n \t\t\t\t\t\t\t ...,"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",Responsibilities:\nProvide professional beauty...
8,MCi CAREER SERVICES PTE LTD\n\n \t\t\t\t\t\...,Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,"Benefits Summary:\n• Bus driver, Full-time Per..."
9,Washington Frank International (A divison of F...,SAP Cutover Specialist,SAP Cutover Specialist \nMy client is an estab...


In [8]:
# Modified Datasets

def show_info(data):
    print('DATASET SHAPE: ', data.shape, '\n')
    print('-'*50)
    print('FEATURE DATA TYPES:')
    print(data.info())
    print('\n', '-'*50)
    print('NUMBER OF UNIQUE VALUES PER FEATURE:', '\n')
    print(data.nunique())
    print('\n', '-'*50)
    print('NULL VALUES PER FEATURE')
    print(data.isnull().sum())

show_info(df)

DATASET SHAPE:  (15001, 3) 

--------------------------------------------------
FEATURE DATA TYPES:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15001 entries, 0 to 15000
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Company          15001 non-null  object
 1   Job Title        15001 non-null  object
 2   Job Description  15001 non-null  object
dtypes: object(3)
memory usage: 351.7+ KB
None

 --------------------------------------------------
NUMBER OF UNIQUE VALUES PER FEATURE: 

Company             2675
Job Title           8970
Job Description    10039
dtype: int64

 --------------------------------------------------
NULL VALUES PER FEATURE
Company            0
Job Title          0
Job Description    0
dtype: int64


### Creating a copy of original data from df to df_mod

In [9]:
df_mod = df.copy(deep = True) 

### Cleaning the empty spaces and removing symbols

In [10]:
df_mod['Company'] = df['Company'].str.replace("\n","").str.replace("\t","").str.replace("\s+"," ").str.strip()

df_mod.head(25)

  df_mod['Company'] = df['Company'].str.replace("\n","").str.replace("\t","").str.replace("\s+"," ").str.strip()


Unnamed: 0,Company,Job Title,Job Description
0,Ideals Recruitment Pte Ltd (Recruitment Firm),Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...
1,Ideals Recruitment Pte Ltd (Recruitment Firm),Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...
2,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...
3,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...
4,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl..."
5,FSK Advisory Pte Ltd (Recruitment Firm),Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...
6,integrated loyalty solutions pte ltd,merchant sales officer,Candidate must possess at least Higher seconda...
7,FSK Advisory Pte Ltd (Recruitment Firm),"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",Responsibilities:\nProvide professional beauty...
8,MCi CAREER SERVICES PTE LTD (Recruitment Firm),Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,"Benefits Summary:\n• Bus driver, Full-time Per..."
9,Washington Frank International (A divison of F...,SAP Cutover Specialist,SAP Cutover Specialist \nMy client is an estab...


### Create New Column to be edited

In [11]:
df_mod['Job Description_Edited'] = df_mod['Job Description']

df_mod.head(25)

Unnamed: 0,Company,Job Title,Job Description,Job Description_Edited
0,Ideals Recruitment Pte Ltd (Recruitment Firm),Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...,• Office hour\n• Immediate interview\n• Electr...
1,Ideals Recruitment Pte Ltd (Recruitment Firm),Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...,• Japan MNC Company\n• $3500 - $4000\n• Third ...
2,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...,SAP Senior FICO Consultant\nMy client is an es...
3,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...
4,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","New branch opening, expanding fast. Achieveabl..."
5,FSK Advisory Pte Ltd (Recruitment Firm),Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...,Responsibilities:\nUpdating of data into the s...
6,integrated loyalty solutions pte ltd,merchant sales officer,Candidate must possess at least Higher seconda...,Candidate must possess at least Higher seconda...
7,FSK Advisory Pte Ltd (Recruitment Firm),"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",Responsibilities:\nProvide professional beauty...,Responsibilities:\nProvide professional beauty...
8,MCi CAREER SERVICES PTE LTD (Recruitment Firm),Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,"Benefits Summary:\n• Bus driver, Full-time Per...","Benefits Summary:\n• Bus driver, Full-time Per..."
9,Washington Frank International (A divison of F...,SAP Cutover Specialist,SAP Cutover Specialist \nMy client is an estab...,SAP Cutover Specialist \nMy client is an estab...


In [12]:
import string

#removing emoji
def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"U0001F600-U0001F64F"  # emoticons
                           u"U0001F300-U0001F5FF"  # symbols & pictographs
                           u"U0001F680-U0001F6FF"  # transport & map symbols
                          u"U0001F1E0-U0001F1FF"  # flags (iOS)
                           u"U00002702-U000027B0"
                           u"U000024C2-U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)



#"-" , "." , "/" & "@" is not added in the replacement

df_mod['Job Description_Edited'] = df_mod['Job Description_Edited'].str.replace("\n","").str.replace("\t","")\
            .str.replace("\d+","")\
            .str.replace(r'[•!#%^&$*()~↠=＋☑✉►:;<>?✅★◆✔|–♠♦_⇒]',"").str.replace("\s+"," ")\
            .str.strip().str.lower()


#df_mod.to_csv('Testing2.csv', index=False)

df_mod.head(25)

  df_mod['Job Description_Edited'] = df_mod['Job Description_Edited'].str.replace("\n","").str.replace("\t","")\


Unnamed: 0,Company,Job Title,Job Description,Job Description_Edited
0,Ideals Recruitment Pte Ltd (Recruitment Firm),Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...,office hour immediate interview electrical / a...
1,Ideals Recruitment Pte Ltd (Recruitment Firm),Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...,japan mnc company - third party logistic immed...
2,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...,sap senior fico consultantmy client is an esta...
3,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...,key responsibilitiesfrequents overseas travel ...
4,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl..."
5,FSK Advisory Pte Ltd (Recruitment Firm),Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...,responsibilitiesupdating of data into the syst...
6,integrated loyalty solutions pte ltd,merchant sales officer,Candidate must possess at least Higher seconda...,candidate must possess at least higher seconda...
7,FSK Advisory Pte Ltd (Recruitment Firm),"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",Responsibilities:\nProvide professional beauty...,responsibilitiesprovide professional beauty ad...
8,MCi CAREER SERVICES PTE LTD (Recruitment Firm),Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,"Benefits Summary:\n• Bus driver, Full-time Per...","benefits summary bus driver, full-time permane..."
9,Washington Frank International (A divison of F...,SAP Cutover Specialist,SAP Cutover Specialist \nMy client is an estab...,sap cutover specialist my client is an establi...


### ***Tagging ID to based on Job Title***

In [13]:
df_mod["id"] = df_mod.index + 1

df_mod.head(25)

Unnamed: 0,Company,Job Title,Job Description,Job Description_Edited,id
0,Ideals Recruitment Pte Ltd (Recruitment Firm),Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...,office hour immediate interview electrical / a...,1
1,Ideals Recruitment Pte Ltd (Recruitment Firm),Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...,japan mnc company - third party logistic immed...,2
2,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...,sap senior fico consultantmy client is an esta...,3
3,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...,key responsibilitiesfrequents overseas travel ...,4
4,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl...",5
5,FSK Advisory Pte Ltd (Recruitment Firm),Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...,responsibilitiesupdating of data into the syst...,6
6,integrated loyalty solutions pte ltd,merchant sales officer,Candidate must possess at least Higher seconda...,candidate must possess at least higher seconda...,7
7,FSK Advisory Pte Ltd (Recruitment Firm),"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",Responsibilities:\nProvide professional beauty...,responsibilitiesprovide professional beauty ad...,8
8,MCi CAREER SERVICES PTE LTD (Recruitment Firm),Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,"Benefits Summary:\n• Bus driver, Full-time Per...","benefits summary bus driver, full-time permane...",9
9,Washington Frank International (A divison of F...,SAP Cutover Specialist,SAP Cutover Specialist \nMy client is an estab...,sap cutover specialist my client is an establi...,10


### ***Check for any additional inputs that need to be removed***

In [14]:
cols = [4,0,1,2,3]
df_mod=df_mod[df_mod.columns[cols]]

df_mod.head(25)

#save into testing if you want to check... else not neccessary 
#df4.to_csv('Testing5.csv', index=False)

Unnamed: 0,id,Company,Job Title,Job Description,Job Description_Edited
0,1,Ideals Recruitment Pte Ltd (Recruitment Firm),Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...,office hour immediate interview electrical / a...
1,2,Ideals Recruitment Pte Ltd (Recruitment Firm),Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...,japan mnc company - third party logistic immed...
2,3,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...,sap senior fico consultantmy client is an esta...
3,4,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...,key responsibilitiesfrequents overseas travel ...
4,5,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl..."
5,6,FSK Advisory Pte Ltd (Recruitment Firm),Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...,responsibilitiesupdating of data into the syst...
6,7,integrated loyalty solutions pte ltd,merchant sales officer,Candidate must possess at least Higher seconda...,candidate must possess at least higher seconda...
7,8,FSK Advisory Pte Ltd (Recruitment Firm),"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",Responsibilities:\nProvide professional beauty...,responsibilitiesprovide professional beauty ad...
8,9,MCi CAREER SERVICES PTE LTD (Recruitment Firm),Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,"Benefits Summary:\n• Bus driver, Full-time Per...","benefits summary bus driver, full-time permane..."
9,10,Washington Frank International (A divison of F...,SAP Cutover Specialist,SAP Cutover Specialist \nMy client is an estab...,sap cutover specialist my client is an establi...


### ***Tokenized each rows into sentences***

In [15]:
df_mod['Tokenized_Description'] = df_mod.apply(lambda row: nltk.sent_tokenize(row['Job Description_Edited']), axis=1)

df_mod.head(25)

Unnamed: 0,id,Company,Job Title,Job Description,Job Description_Edited,Tokenized_Description
0,1,Ideals Recruitment Pte Ltd (Recruitment Firm),Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...,office hour immediate interview electrical / a...,[office hour immediate interview electrical / ...
1,2,Ideals Recruitment Pte Ltd (Recruitment Firm),Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...,japan mnc company - third party logistic immed...,[japan mnc company - third party logistic imme...
2,3,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...,sap senior fico consultantmy client is an esta...,[sap senior fico consultantmy client is an est...
3,4,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...,key responsibilitiesfrequents overseas travel ...,[key responsibilitiesfrequents overseas travel...
4,5,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl...","[new branch opening, expanding fast., achievea..."
5,6,FSK Advisory Pte Ltd (Recruitment Firm),Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...,responsibilitiesupdating of data into the syst...,[responsibilitiesupdating of data into the sys...
6,7,integrated loyalty solutions pte ltd,merchant sales officer,Candidate must possess at least Higher seconda...,candidate must possess at least higher seconda...,[candidate must possess at least higher second...
7,8,FSK Advisory Pte Ltd (Recruitment Firm),"Beauty Advisor (Up to $2000, Retail Hrs, Islan...",Responsibilities:\nProvide professional beauty...,responsibilitiesprovide professional beauty ad...,[responsibilitiesprovide professional beauty a...
8,9,MCi CAREER SERVICES PTE LTD (Recruitment Firm),Bus Captain ($1950 / Gross $3500 / 6Days / 44H...,"Benefits Summary:\n• Bus driver, Full-time Per...","benefits summary bus driver, full-time permane...","[benefits summary bus driver, full-time perman..."
9,10,Washington Frank International (A divison of F...,SAP Cutover Specialist,SAP Cutover Specialist \nMy client is an estab...,sap cutover specialist my client is an establi...,[sap cutover specialist my client is an establ...


### ***Split each tokenized sentences into separate rows***

In [16]:
df_mod = df_mod.assign(text=df_mod.Tokenized_Description.str.split('[.!;]')).explode('Tokenized_Description').loc[lambda x : x.text!='']

df_mod.head(25)

Unnamed: 0,id,Company,Job Title,Job Description,Job Description_Edited,Tokenized_Description,text
0,1,Ideals Recruitment Pte Ltd (Recruitment Firm),Project Engineer / Coordinator (Kaki Bukit / U...,• Office hour\n• Immediate interview\n• Electr...,office hour immediate interview electrical / a...,office hour immediate interview electrical / a...,
1,2,Ideals Recruitment Pte Ltd (Recruitment Firm),Customer Service Executive (East / UP$4000 / 3...,• Japan MNC Company\n• $3500 - $4000\n• Third ...,japan mnc company - third party logistic immed...,japan mnc company - third party logistic immed...,
2,3,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...,sap senior fico consultantmy client is an esta...,sap senior fico consultantmy client is an esta...,
2,3,Washington Frank International (A divison of F...,SAP Senior FICO Consultant,SAP Senior FICO Consultant\nMy client is an es...,sap senior fico consultantmy client is an esta...,"requirementsbachelor degree, master degree or ...",
3,4,LITE-ON SINGAPORE PTE LTD,Field Application Engineer,Key Responsibilities:\n\n\n\n\n\t\nFrequents o...,key responsibilitiesfrequents overseas travel ...,key responsibilitiesfrequents overseas travel ...,
4,5,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl...","new branch opening, expanding fast.",
4,5,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl...","achieveable target, quartery bonus and special...",
4,5,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl...",personal banker basic up to .k/bonus/no experi...,
4,5,ScienTec Personnel (Recruitment Firm),Personal Banker (Basic up to 3.2K/Bonus/No exp...,"New branch opening, expanding fast. Achieveabl...","new branch opening, expanding fast. achieveabl...",if you are excited by the above opportunity an...,
5,6,FSK Advisory Pte Ltd (Recruitment Firm),Admin Executive (Logistics/Purchasing) (Up to ...,Responsibilities:\nUpdating of data into the s...,responsibilitiesupdating of data into the syst...,responsibilitiesupdating of data into the syst...,


### ***Drop unwanted column***

In [17]:
df_mod.drop(['Job Description','Job Description_Edited','text','Company'],axis=1, inplace=True)

In [18]:
df_mod.head(25)

Unnamed: 0,id,Job Title,Tokenized_Description
0,1,Project Engineer / Coordinator (Kaki Bukit / U...,office hour immediate interview electrical / a...
1,2,Customer Service Executive (East / UP$4000 / 3...,japan mnc company - third party logistic immed...
2,3,SAP Senior FICO Consultant,sap senior fico consultantmy client is an esta...
2,3,SAP Senior FICO Consultant,"requirementsbachelor degree, master degree or ..."
3,4,Field Application Engineer,key responsibilitiesfrequents overseas travel ...
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,"new branch opening, expanding fast."
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,"achieveable target, quartery bonus and special..."
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,personal banker basic up to .k/bonus/no experi...
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,if you are excited by the above opportunity an...
5,6,Admin Executive (Logistics/Purchasing) (Up to ...,responsibilitiesupdating of data into the syst...


In [19]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')


df_mod['Tokenized_Description_Test'] = df_mod['Tokenized_Description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
pd.DataFrame(df_mod,columns=['Tokenized_Description_Test']).head(25)


[nltk_data] Downloading package stopwords to C:\Users\MASTER
[nltk_data]     ILYAS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Tokenized_Description_Test
0,office hour immediate interview electrical / a...
1,japan mnc company - third party logistic immed...
2,sap senior fico consultantmy client establishe...
2,"requirementsbachelor degree, master degree phd..."
3,key responsibilitiesfrequents overseas travel ...
4,"new branch opening, expanding fast."
4,"achieveable target, quartery bonus special inc..."
4,personal banker basic .k/bonus/no experience r...
4,excited opportunity challenges enjoy making th...
5,responsibilitiesupdating data systemhandling p...


In [20]:
df_mod.head(25)

Unnamed: 0,id,Job Title,Tokenized_Description,Tokenized_Description_Test
0,1,Project Engineer / Coordinator (Kaki Bukit / U...,office hour immediate interview electrical / a...,office hour immediate interview electrical / a...
1,2,Customer Service Executive (East / UP$4000 / 3...,japan mnc company - third party logistic immed...,japan mnc company - third party logistic immed...
2,3,SAP Senior FICO Consultant,sap senior fico consultantmy client is an esta...,sap senior fico consultantmy client establishe...
2,3,SAP Senior FICO Consultant,"requirementsbachelor degree, master degree or ...","requirementsbachelor degree, master degree phd..."
3,4,Field Application Engineer,key responsibilitiesfrequents overseas travel ...,key responsibilitiesfrequents overseas travel ...
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,"new branch opening, expanding fast.","new branch opening, expanding fast."
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,"achieveable target, quartery bonus and special...","achieveable target, quartery bonus special inc..."
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,personal banker basic up to .k/bonus/no experi...,personal banker basic .k/bonus/no experience r...
4,5,Personal Banker (Basic up to 3.2K/Bonus/No exp...,if you are excited by the above opportunity an...,excited opportunity challenges enjoy making th...
5,6,Admin Executive (Logistics/Purchasing) (Up to ...,responsibilitiesupdating of data into the syst...,responsibilitiesupdating data systemhandling p...


In [21]:
stop_words_l=stopwords.words('english')
df_mod['Tokenized_Description_Test_Cleaned']=df_mod['Tokenized_Description_Test'].apply(lambda x: " ".join(re.sub(r'[^a-zA-Z]',' ',w).lower() for w in x.split()))

In [22]:
df_mod=pd.DataFrame(df_mod,columns=['Tokenized_Description_Test_Cleaned'])

In [23]:
df_mod.head(25)

Unnamed: 0,Tokenized_Description_Test_Cleaned
0,office hour immediate interview electrical a...
1,japan mnc company third party logistic immed...
2,sap senior fico consultantmy client establishe...
2,requirementsbachelor degree master degree phd...
3,key responsibilitiesfrequents overseas travel ...
4,new branch opening expanding fast
4,achieveable target quartery bonus special inc...
4,personal banker basic k bonus no experience r...
4,excited opportunity challenges enjoy making th...
5,responsibilitiesupdating data systemhandling p...


### Corpus Dataframe

In [24]:
df_mod['Tokenized_Description_Test_Cleaned'].head(25)

0     office hour immediate interview electrical   a...
1     japan mnc company   third party logistic immed...
2     sap senior fico consultantmy client establishe...
2     requirementsbachelor degree  master degree phd...
3     key responsibilitiesfrequents overseas travel ...
4                   new branch opening  expanding fast 
4     achieveable target  quartery bonus special inc...
4     personal banker basic  k bonus no experience r...
4     excited opportunity challenges enjoy making th...
5     responsibilitiesupdating data systemhandling p...
5     requirementsdiploma business administration re...
5     regret inform shortlisted candidates notified ...
6     candidate must possess least higher secondary ...
6     ils deploy pos terminals accepting cards payme...
6     also required manage merchant relationship  pr...
6     monthly spend commission paid ils earnings car...
6     sign commission reaching     signups month pai...
7     responsibilitiesprovide professional beaut

### ***Drop Duplicates***

In [25]:
df_mod = df_mod.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)

In [26]:
tokenized_jd = df_mod.loc[0, 'Tokenized_Description_Test_Cleaned']
print(tokenized_jd)

office hour immediate interview electrical   aa project aws   variable bonus   months excellent welfare benefits   career progression job scopeto manage  co ordinate deploy site foreman  subcontractor supplier effectively ensure project completed according specifications  quality standards  cost time frameattend meetings negotiation clients  subcontractors suppliersresolve site problem investigating problems  develop solution  make recommendation managementensure compliance legal requirements concerning safety  supervision quality controlensure customer service satisfaction maintain good client relationships requirementnitec diploma electrical relevant qualification years relevant experiencesknowledge acmv advantage candidates encouraged apply position via apply button following information resume work experiences job responsibilitiescurrent expected salaryreason leavingdate availabilityeducation background


#### BERT model

In [27]:
from sentence_transformers import SentenceTransformer, util
import torch

embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [28]:
corpus =(df_mod['Tokenized_Description_Test_Cleaned'].values.tolist())

In [29]:
#corpus = df_mod.drop_duplicates(subset=None, keep='first', inplace=False, ignore_index=False)

In [30]:
corpus

['office hour immediate interview electrical   aa project aws   variable bonus   months excellent welfare benefits   career progression job scopeto manage  co ordinate deploy site foreman  subcontractor supplier effectively ensure project completed according specifications  quality standards  cost time frameattend meetings negotiation clients  subcontractors suppliersresolve site problem investigating problems  develop solution  make recommendation managementensure compliance legal requirements concerning safety  supervision quality controlensure customer service satisfaction maintain good client relationships requirementnitec diploma electrical relevant qualification years relevant experiencesknowledge acmv advantage candidates encouraged apply position via apply button following information resume work experiences job responsibilitiescurrent expected salaryreason leavingdate availabilityeducation background',
 'japan mnc company   third party logistic immediate candidate advantage aw

In [31]:
# Single list of sentences
'''
sentences = ['job purpose the physiotherapist develops and provides the rehabilitative care and services based on the care model to patients of the nursing homes.',
'responsible for all financial matters of shamir singapore including accounting and bookkeeping monthly reporting to shamir hq meeting the statutory local requirements taxes vat etc.',
'jprovide administrative support issue delivery orders invoices quotations and other paperwork keep track of company vehicle maintenance record e.g.',
'ability to work with other departments and management to determine organizational goals pinpoint trends correlations and patterns in complicated data sets.',
'performing basic office tasks such as filing data entry answering phones processing the mail etc.',
'job descriptions to manage project site by supervising the man on site and coordinating the site daily activities with the main contractors.'
'standardize processes and implement best practices.',
'evaluate and prescribe assistive devices to improve function in activities of daily living as required by patients.',
'administration maintain database/records of all patients to provide evidence-based treatment with accurate and appropriate documentation.',
'assess plan implement monitor and evaluate physiotherapy and related service delivery to residents.'
]
'''

#Compute embeddings
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['Software Engineer']

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(100, len(corpus))
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)

# We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop 5 most similar sentences in corpus:")

    for score, idx in zip(top_results[0], top_results[1]):
        print(corpus[idx], "(Score: {:.4f})".format(score))
        

'''
#Find the pairs with the highest cosine similarity scores
pairs = []
for i in range(len(cosine_scores)-1):
    for j in range(i+1, len(cosine_scores)):
        pairs.append({'index': [i, j], 'score': cosine_scores[i][j]})

#Sort scores in decreasing order
pairs = sorted(pairs, key=lambda x: x['score'], reverse=True)

print(len(pairs))
6

for pair in pairs[0:10]:
    i, j = pair['index']
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair['score']))

'''





Query: Software Engineer

Top 5 most similar sentences in corpus:
software engineer charge developing application control software automated machines developed company  (Score: 0.7824)
junior software engineer  working directly cto senior programmers aid creation said products  (Score: 0.7718)
senior software development engineer  drive technical innovation across digital organization  (Score: 0.7507)
provides technical assistance process development engineering personnel  (Score: 0.7361)
job responsibilities develops supports software manufacturing system  (Score: 0.7361)
degree computer engineer   computer science   information systems relatedmin  (Score: 0.7315)
develop process integration engineer  (Score: 0.7080)
senior software engineer responsible designing developing sfdc applications apac  (Score: 0.7076)
currently looking senior engineer industrial process improvement  (Score: 0.6953)
degree engineering science  (Score: 0.6948)
software engineer work closely clients acros

'\n#Find the pairs with the highest cosine similarity scores\npairs = []\nfor i in range(len(cosine_scores)-1):\n    for j in range(i+1, len(cosine_scores)):\n        pairs.append({\'index\': [i, j], \'score\': cosine_scores[i][j]})\n\n#Sort scores in decreasing order\npairs = sorted(pairs, key=lambda x: x[\'score\'], reverse=True)\n\nprint(len(pairs))\n6\n\nfor pair in pairs[0:10]:\n    i, j = pair[\'index\']\n    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences[i], sentences[j], pair[\'score\']))\n\n'