# Import Libraries, Dataset, and Installation of Model

In [1]:
pip install -U sentence-transformers

Note: you may need to restart the kernel to use updated packages.


DEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063
DEPRECATION: textract 1.6.5 has a non-standard dependency specifier extract-msg<=0.29.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of textract or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063


In [11]:
import pandas as pd
import torch
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
from emoji import demojize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_excel('Chips_Database.xlsx')
stop_words = set(stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()

In [19]:
print(df.dtypes)

Title                         object
Company                       object
Description                   object
Salary                        object
Deadline              datetime64[ns]
Locations                     object
Mode                          object
Type                          object
Gender                        object
Experience                    object
Field_Of_Interests            object
dtype: object


# List of Functions

In [5]:
def text_clean(text_data, stop_words, lemmatizer):
    text_data = unicodedata.normalize('NFKD', text_data).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text_data = text_data.lower()
    text_data = demojize(text_data)
    pattern_punct = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')
    text_data = pattern_punct.sub(r'\1', text_data)
    text_data = re.sub(' {2,}',' ', text_data)
    text_data = re.sub(r"[^a-zA-Z?!]+", ' ', text_data)
    text_data = str(text_data)
    tokenizer = ToktokTokenizer()
    text_data = tokenizer.tokenize(text_data)
    text_data = [item for item in text_data if item not in stop_words]
    text_data = [lemmatizer.lemmatize(word = w, pos = 'v') for w in text_data]
    text_data = ' '.join (text_data)
    return text_data

In [20]:
from sentence_transformers import SentenceTransformer, util
def cosine_similarity(sentences):
    #Job title is 5 points, fields_of_interests is 3 points, description is 2 points, experience is useless since no interpretation
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    
    embedding_1 = model.encode(sentences[0], convert_to_tensor = True)
    
    similarities = []
    
    for sentence in sentences[1:]:
        embedding_2 = model.encode(sentence, convert_to_tensor=True)
        
        similarity = util.pytorch_cos_sim(embedding_1, embedding_2)
        similarities.append(similarity.item())
    
    return similarities

In [7]:
def filter_dataframe(df, company, deadline, locations, mode, type_, gender):
    location_set = set(locations)
    df['Deadline'] = pd.to_datetime(df['Deadline'], format='%d-%m-%Y')
    
    filtered_df = df[
        (df['Company'] == company) &
        (df['Locations'].apply(lambda x: x in location_set)) &  # Check if location is in the list
        (df['Mode'] == mode) &
        (df['Type'] == type_) &
        (df['Gender'] == gender) &
        (df['Deadline'] >= pd.to_datetime(deadline, format='%d-%m-%Y'))
    ]
    return filtered_df

# Comparison with Chips Database Data

In [14]:
title = input("Enter Job Title: ")
company = input("Enter Company Name:")
description = input("Enter Description:")
salary = input("Enter Salary:") + " a month"
#deadline = input("Enter Deadline:")
deadline = "01-01-2020"
locations = []
while True:
    location = input("Enter a location (or 'stop' to quit): ")
    if location.lower() == 'stop':
        break
    locations.append(location)
mode = input("Enter Job Mode:")
type_ = input("Enter Job Type:")
gender = input("Enter Gender:")
experience = input("Enter Experience:") + " years"
fields_of_interests = []
while True:
    fields_of_interest = input("Enter a field of interest (or 'stop' to quit): ")
    if fields_of_interest.lower() == 'stop':
        break
    fields_of_interests.append(fields_of_interest)
filtered_df = filter_dataframe(df, company, deadline, locations, mode, type_, gender)

filtered_df

# ENTER IN THIS FORMAT
# Enter Job Title: Data Analyst
# Enter Company Name:Herbion Private Limited
# Enter Description:Needs to know SQL, Excel, Tableau
# Enter Salary:40,000
# Enter a location (or 'stop' to quit): Karachi
# Enter a location (or 'stop' to quit): stop
# Enter Job Mode:Physical
# Enter Job Type:Full-time
# Enter Gender:All
# Enter Experience:2
# Enter a field of interest (or 'stop' to quit): Python
# Enter a field of interest (or 'stop' to quit): R
# Enter a field of interest (or 'stop' to quit): Node JS
# Enter a field of interest (or 'stop' to quit): stop

Enter Job Title: Data Analyst
Enter Company Name:Herbion Private Limited
Enter Description:Needs to know SQL, Excel, Tableau
Enter Salary:40,000
Enter a location (or 'stop' to quit): Karachi
Enter a location (or 'stop' to quit): stop
Enter Job Mode:Physical
Enter Job Type:Full-time
Enter Gender:All
Enter Experience:2
Enter a field of interest (or 'stop' to quit): Python
Enter a field of interest (or 'stop' to quit): R
Enter a field of interest (or 'stop' to quit): Node JS
Enter a field of interest (or 'stop' to quit): stop


Unnamed: 0,Title,Company,Description,Salary,Deadline,Locations,Mode,Type,Gender,Experience,Field_Of_Interests
0,Data Scientist,Herbion Private Limited,Job Summary: Seeking a skilled and motivated D...,"Rs 60,000 - Rs 80,000 a month",2023-06-30,Karachi,Physical,Full-time,All,Unknown,"Data Science, Supply Chain, Predictive Modelin..."


In [21]:
csv_titles = filtered_df['Title'].tolist()
titles = [title] + csv_titles
#print(titles)

score = cosine_similarity(titles)
total_score = [x * 0.5 for x in score] # Giving Title as weight 0.5

csv_description = filtered_df['Description'].tolist()
descriptions = [description] + csv_description

for i in range(len(score)):
    clean_descriptions = [text_clean(description, stop_words, lemmatizer) for description in descriptions]
    score = cosine_similarity(clean_descriptions)
#     score = cosine_similarity(descriptions)
    total_score[i] += score[i] * 0.2 # Giving Description as weight 0.2


csv_foi = filtered_df['Field_Of_Interests'].tolist()
foi = ', '.join(fields_of_interests)
fois = [foi] + csv_foi

for i in range(len(score)):
    score = cosine_similarity(fois)
    total_score[i] += score[i] * 0.3 # Giving Field Of Interests as weight 0.3
print(total_score)

[0.5058190226554871]


# Comparison Between Two Chips Inputs

In [22]:
titles = ["Web Developer - Node JS", "Web Developer - Node JS"]
descriptions = ["Should know the basics of Node JS, knowledge of APIS is a good plus", "Dedicated professional who can design amazing webpages using Node JS"]
fois = ["Node JS, Javascript, Backend", "Node JS, Javascript, Web Developer"]

In [25]:
if input("Press 'y' to restart: ") != 'y':
    for j in range(2):
        titles = []
        titles.append(input("Add a title for chip: "))

        descriptions = []
        descriptions.append(input("Add description for chip: "))

        fois = []
        fois.append(input("Add fois for chip (Python, R, SQL): "))

score = cosine_similarity(titles)
total_score = [x * 0.5 for x in score]  # Giving Title as weight 0.5
print("\nTitle Percentage: ", total_score[0], "%")

clean_descriptions = [text_clean(description, stop_words, lemmatizer) for description in descriptions]
score = cosine_similarity(clean_descriptions)
total_score[0] += score[0] * 0.2  # Giving Description as weight 0.2
print("\nDescription Percentage: ", score[0] * 0.2, "%")

score = cosine_similarity(fois)
total_score[0] += score[0] * 0.3  # Giving Field Of Interest as weight 0.3
print("\nField Of Interest Percentage: ", score[0] * 0.3, "%")

print("\nAccuracy: ", total_score[0], "%")

# Add a title for chip: Backend Developer
# Add description for chip: Should know the basics of Node JS, knowledge of APIS is a good plus
# Add fois for chip (Python, R, SQL): Node JS, Javascript, Backend
# Add a title for chip: Web Developer - Node JS
# Add description for chip: Dedicated professional who can design amazing webpages using Node JS
# Add fois for chip (Python, R, SQL): Node JS, Java, Web Developer

Press 'y' to restart: y

Title Percentage:  0.5000000596046448 %

Description Percentage:  0.09553024768829346 %

Field Of Interest Percentage:  0.2424574255943298 %

Accuracy:  0.837987732887268 %
