In [25]:
# Import statements
import math
from collections import Counter
from functools import reduce

import numpy as np
import pandas as pd


In [26]:
# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess

In [27]:
# stopwords
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words  = stopwords.words("english")
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
# Read the data file
df = pd.read_csv("train_data.csv")

In [29]:
# I'm considering only 8000 rows from the entire dataset - to implement Naive Bayes
# Because - my notebook is crashing for huge data

df = df[:8000]
df

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
0,0,GERRES15609,Author and/or Review architecture/design and o...,Responsibility
1,1,PHERES15784,Should be able to develop custom dynamic shape...,Responsibility
2,2,GERREQ10457,Experience in working crosslly with a larger ...,Requirement
3,3,GERSKL27235,"Previous business experience, including but no...",Skill
4,4,HONSSK18415,Delivering fast and right the first time.,SoftSkill
...,...,...,...,...
7995,7995,HONSKL1751,Sr Director Customer Experience.,Skill
7996,7996,UAESSK42241,Excellent analytical skills .,SoftSkill
7997,7997,GERREQ10410,Strong experience in integration testing.,Requirement
7998,7998,COGRES47805,"Optimize the sales organization and processes,...",Responsibility


In [30]:
# Divide the data into train, dev and test
# We will shuffle the whole dataset first (df.sample(frac=1, random_state=42)),
# and then split our data set into the following parts
# Train - 80%, Dev - 10%, Test - 10%

# Shuffle once with a fixed seed, then cut at the 80 % and 90 % marks.
# Positional slicing gives the same three frames as np.split() did, without
# going through DataFrame.swapaxes, which np.split() relies on and which is
# deprecated in recent pandas releases.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
dev_end = int(.9 * len(df))

train = shuffled_df.iloc[:train_end]
dev = shuffled_df.iloc[train_end:dev_end]
test = shuffled_df.iloc[dev_end:]

In [31]:
# shape of train data
train.shape

(6400, 4)

In [32]:
# shape of dev data
dev.shape

(800, 4)

In [33]:
# shape of test data
test.shape

(800, 4)

In [34]:
train

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
2215,2215,HONREQ8748,"Schedule flexibility and resilience, includin...",Requirement
2582,2582,GERREQ4428,Planning out projects and being involved in p...,Requirement
1662,1662,UAEREQ42145,Be an entrepreneurial problem solver who is p...,Requirement
3027,3027,GERRES40667,Review & assist with the Description of Opera...,Responsibility
4343,4343,UAESSK26982,Strong team player and collaboration with oth...,SoftSkill
...,...,...,...,...
5399,5399,GERSSK8333,"Ability to communicate effectively (oral, wri...",SoftSkill
1387,1387,HONSSK52070,Effective communicator with a wide degree of ...,SoftSkill
7507,7507,UAESKL6837,Proven ability to lead projects and ensure obj...,Skill
7242,7242,UAERES26567,Continuously improve service operational metr...,Responsibility


In [35]:
# Data preprocessing - Preprocessing each and every sentence separately

def preprocess(sentence_list):
    """Tokenise every sentence on its own.

    Each item is converted with str(), cleaned with three regular
    expressions, tokenised by gensim's ``simple_preprocess`` (called with
    ``deacc=True``) and finally filtered against the module-level
    ``stop_words``.

    Args:
        sentence_list: iterable of raw sentences (any object accepted by str()).

    Returns:
        A list with one list of tokens per input sentence, in input order.
    """
    # Build the lookup once per call: set membership is O(1), whereas the
    # module-level ``stop_words`` list would be scanned for every token.
    stop_word_set = set(stop_words)
    row_sentence_list = []

    for raw_sentence in sentence_list:
        sentence = re.sub(r"\W", " ", str(raw_sentence))  # non-word characters -> space
        sentence = re.sub(r"\d", " ", sentence)  # digits -> space
        # drop isolated single letters (a lone letter surrounded by whitespace)
        sentence = re.sub(r"\s+[a-z]\s+", " ", sentence, flags=re.I)
        # trim leading/trailing whitespace (the old substitutions only swapped
        # one whitespace character for a space, which changed nothing)
        sentence = sentence.strip()
        tokens = gensim.utils.simple_preprocess(sentence, deacc=True)
        row_sentence_list.append([token for token in tokens if token not in stop_word_set])

    return row_sentence_list
    


In [36]:
# Data preprocessing - all rows together, to obtain the vocabulary list

def preprocess_train_data(sentence_list):
    """Return the flat list of tokens found in all training sentences.

    The sentences are tokenised row by row with ``preprocess`` and the
    per-row token lists are concatenated. The previous implementation
    tokenised ``str(sentence_list)``, i.e. the *repr* of the Python list, so
    escape sequences produced by the repr (for example ``\\xa0`` or ``\\n``)
    could leak into the vocabulary as spurious tokens that never occur in the
    per-row documents.

    Args:
        sentence_list: iterable of raw sentences; a single string is treated
            as one sentence.

    Returns:
        list of str: every kept token, in order of appearance.
    """
    if isinstance(sentence_list, str):
        # Iterating a bare string would yield characters, not sentences.
        sentence_list = [sentence_list]

    return [token for row_tokens in preprocess(sentence_list) for token in row_tokens]

In [37]:
# Fetching the list of words present from all the rows given in train data

sentence_list = train["New_Sentence"].tolist()
vocab = preprocess_train_data(sentence_list)

# Token -> number of occurrences over the whole training split
vocab_dict = Counter(vocab)

# Rare words: tokens that occur fewer than five times in the training split
rare_words = {token: count for token, count in vocab_dict.items() if count < 5}

# Working vocabulary: every token that is not rare, with its count
processed_vocab_dict = {
  token: count for token, count in vocab_dict.items() if token not in rare_words
}

In [38]:
# Printing the vocabulary dictionary
print(f"Global Vocabulary : \nCount = {len(vocab_dict)}\n")
print(dict(vocab_dict))

Global Vocabulary : 
Count = 6912



In [39]:
# Print all the rare words
print(f"Rare Words in the Train data : \nCount = {len(rare_words)}\n")
print(rare_words)

Rare Words in the Train data : 
Count = 5218



In [40]:
# Printing the processed vocabulary dictionary
print(f"Processed Vocabulary: \nCount = {len(processed_vocab_dict)}\n")
print(processed_vocab_dict)

Processed Vocabulary: 
Count = 1694

{'schedule': 39, 'flexibility': 12, 'including': 169, 'ability': 505, 'adapt': 15, 'changing': 15, 'conditions': 5, 'planning': 103, 'projects': 166, 'involved': 11, 'project': 322, 'management': 516, 'decisions': 29, 'problem': 92, 'bring': 7, 'cross': 63, 'functional': 135, 'teams': 145, 'together': 5, 'new': 124, 'ideas': 15, 'review': 48, 'assist': 31, 'description': 9, 'operation': 11, 'test': 131, 'plans': 50, 'user': 42, 'strong': 296, 'team': 302, 'player': 13, 'collaboration': 24, 'functions': 46, 'apply': 27, 'defined': 22, 'agile': 69, 'product': 202, 'program': 126, 'practices': 53, 'ensure': 125, 'specific': 30, 'component': 17, 'single': 9, 'non': 30, 'complex': 105, 'ensuring': 28, 'delivery': 88, 'time': 62, 'budget': 11, 'scope': 29, 'build': 59, 'advance': 7, 'data': 262, 'analytics': 52, 'technology': 164, 'next': 10, 'years': 899, 'alignment': 12, 'strategic': 68, 'business': 517, 'initiatives': 50, 'demonstrate': 24, 'execution'

In [41]:
# Function to remove the rare words from each row document
def removeRareWords(row):
  """Count the tokens of one row, leaving out every token found in ``rare_words``.

  ``rare_words`` is the module-level dict of rare tokens. The result is a
  Counter mapping each kept token to its number of occurrences in ``row``.
  """
  kept_counts = Counter()
  for token in row:
    if token in rare_words:
      continue
    kept_counts[token] += 1
  return kept_counts


In [42]:
sentence_list = train["New_Sentence"].tolist()
labels = Counter(train["Type"].tolist()) # Fetch all the unique labels

labels
    

Counter({'Requirement': 1490,
         'Responsibility': 1682,
         'SoftSkill': 954,
         'Experience': 1028,
         'Skill': 761,
         'Education': 485})

In [43]:
# Dividing the training set into subsets based on each unique label

experience_df = train[train['Type'] == "Experience"]
education_df = train[train['Type'] == "Education"]
requirement_df = train[train['Type'] == "Requirement"]
skill_df = train[train['Type'] == "Skill"]
softskill_df = train[train['Type'] == "SoftSkill"]
responsibility_df = train[train['Type'] == "Responsibility"]


responsibility_df

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
3027,3027,GERRES40667,Review & assist with the Description of Opera...,Responsibility
2680,2680,INDRES20388,Should apply defined Agile Product & Program M...,Responsibility
1765,1765,PHERES51459,Build and advance data & analytics technology ...,Responsibility
1123,1123,UAERES26624,Define and execute our intellectual property ...,Responsibility
4054,4054,GERRES52232,You will provide leadership in the adoption of...,Responsibility
...,...,...,...,...
3280,3280,HONRES29259,"Apply engineering knowledge, tools, and princ...",Responsibility
4831,4831,COGRES1180,"As a Senior Full Stack Developer, you will be ...",Responsibility
2106,2106,COGRES36526,Be a coach to Security Architects & Security ...,Responsibility
2679,2679,INDRES39785,You will be involved in every aspect of the th...,Responsibility


In [78]:
experience_df

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
2944,2944,UAEEXP9691,Five (5) or more years directly-applicable ex...,Experience
3181,3181,UAEEXP48504,"Bachelors degree in Engineering, Business or ...",Experience
1618,1618,INDEXP31825,Minimum of 2 years of experience in the design...,Experience
4794,4794,HONEXP36558,7+ years of planning/manufacturing experience.,Experience
3362,3362,GEREXP6238,3+ years of scheduling experience.,Experience
...,...,...,...,...
7887,7887,UAEEXP9727,Qualification and Skills: Minimum 4 years of e...,Experience
2573,2573,INDEXP10274,"2+ years in building REST/SOAP APIs, MongoDB,...",Experience
2768,2768,UAEEXP51060,At least 5 years of technical leadership expe...,Experience
4688,4688,INDEXP2501,1+ years of experience working with networked ...,Experience


In [44]:
# Creating the list of row documents w.r.t to each label

# Create a list of processed row documents
def process_rows(row_words):
  """Turn each tokenised row into its rare-word-free Counter (see removeRareWords)."""
  return [removeRareWords(tokens) for tokens in row_words]

# Process the rows of each Type/Class label
# STEPS : 
# 1 - preprocess each row - where, unwanted  character, digits, spaces etc are reomoved. And the
#     function returns the list of splitted words for each row
# 2 - In the second step process, remove the rare words from that list of words for each row

# Experience
exp_sent_list = experience_df["New_Sentence"].tolist()
exp_rows = process_rows(preprocess(exp_sent_list))

# Education
edu_sent_list = education_df["New_Sentence"].tolist()
edu_rows = process_rows(preprocess(edu_sent_list))

# Requirement
req_sent_list = requirement_df["New_Sentence"].tolist()
req_rows = process_rows(preprocess(req_sent_list))

# Skill
skill_sent_list = skill_df["New_Sentence"].tolist()
skill_rows = process_rows(preprocess(skill_sent_list))

# Soft Skill
softskill_sent_list = softskill_df["New_Sentence"].tolist()
softskill_rows = process_rows(preprocess(softskill_sent_list))

# Reponsibility
resp_sent_list = responsibility_df["New_Sentence"].tolist()
resp_rows = process_rows(preprocess(resp_sent_list))

In [45]:

# Printing the list of processed row documents for each Type, where it displays the row words 
print("List of row documents for Experience Type: ")
print(len(exp_rows))
print(exp_rows)


print("\nList of row documents for Education Type: ")
print(len(edu_rows))
print(edu_rows)

print("\nList of row documents for Skill Type: ")
print(len(skill_rows))
print(skill_rows)

print("\nList of row documents for Requirement Type: ")
print(len(req_rows))
print(req_rows)

print("\nList of row documents for Soft Skill Type: ")
print(len(softskill_rows))
print(softskill_rows)

print("\nList of row documents for Responsibility Type: ")
print(len(resp_rows))
print(resp_rows)

List of row documents for Experience Type: 
1028
[Counter({'five': 1, 'years': 1, 'directly': 1, 'applicable': 1, 'experience': 1, 'strategic': 1, 'sourcing': 1, 'procurement': 1, 'supply': 1, 'chain': 1, 'similar': 1, 'function': 1}), Counter({'bachelors': 1, 'degree': 1, 'engineering': 1, 'business': 1, 'relevant': 1, 'discipline': 1, 'plus': 1, 'years': 1, 'related': 1, 'experience': 1}), Counter({'minimum': 1, 'years': 1, 'experience': 1, 'design': 1, 'implementation': 1, 'maintenance': 1, 'honeywell': 1, 'ot': 1, 'enterprise': 1, 'applications': 1, 'connected': 1, 'services': 1}), Counter({'years': 1, 'planning': 1, 'manufacturing': 1, 'experience': 1}), Counter({'years': 1, 'scheduling': 1, 'experience': 1}), Counter({'minimum': 1, 'years': 1, 'experience': 1, 'continuous': 1, 'integration': 1, 'deployment': 1}), Counter({'experience': 2, 'minimum': 1, 'years': 1, 'network': 1, 'security': 1, 'architecture': 1, 'design': 1, 'operations': 1}), Counter({'experience': 1, 'years': 1}

In [46]:
# Calculate the probability of occurence for each word present in processed vocab list
# input - processed_vocab_dict, row_document_list

total_rows = list_of_all_row_docs = exp_rows + edu_rows + skill_rows + req_rows + softskill_rows + resp_rows

def calcualteProbabilityOfOccurence(processed_vocab_dict, row_document_list):
  """Fraction of documents that contain each vocabulary word.

  Args:
    processed_vocab_dict: mapping whose keys are the vocabulary words.
    row_document_list: list of per-document mappings (token -> count); only
      key presence is used, not the counts.

  Returns:
    dict mapping every vocabulary word to
    (documents containing the word) / (number of documents).
    With an empty ``row_document_list`` every word maps to 0.0 instead of
    raising ZeroDivisionError.
  """
  total_doc_count = len(row_document_list)

  if total_doc_count == 0:
    return {word: 0.0 for word in processed_vocab_dict}

  # One pass over the documents gives the document frequency of every token,
  # instead of rescanning all documents once per vocabulary word.
  doc_freq = Counter()
  for row_words in row_document_list:
    doc_freq.update(row_words.keys())

  return {word: doc_freq[word] / total_doc_count for word in processed_vocab_dict}


In [47]:
print("Probabily of the occrence of each word in the processed vocabulary: \n")
prob_occurence = calcualteProbabilityOfOccurence(processed_vocab_dict, total_rows)


prob_occurence_df = pd.DataFrame.from_dict(prob_occurence, orient ='index') 
prob_occurence_df.columns = ["Probability Occurence"]
prob_occurence_df

Probabily of the occrence of each word in the processed vocabulary: 



Unnamed: 0,Probability Occurence
schedule,0.005938
flexibility,0.001875
including,0.026094
ability,0.077500
adapt,0.002344
...,...
vault,0.000469
legacy,0.000781
desk,0.000781
secret,0.000625


In [48]:
# Calculate conditional probabilities for each word in the vocabulary w.r.t. each class label

experience_cond_prob = calcualteProbabilityOfOccurence(processed_vocab_dict, exp_rows)

education_cond_prob = calcualteProbabilityOfOccurence(processed_vocab_dict, edu_rows)

requirement_cond_prob = calcualteProbabilityOfOccurence(processed_vocab_dict, req_rows)

skill_cond_prob = calcualteProbabilityOfOccurence(processed_vocab_dict, skill_rows)

softskill_cond_prob = calcualteProbabilityOfOccurence(processed_vocab_dict, softskill_rows)

responsibility_cond_prob = calcualteProbabilityOfOccurence(processed_vocab_dict, resp_rows)



In [49]:
# Conditional Probabilties w.r.t class type - Experience

display_df = pd.DataFrame.from_dict(experience_cond_prob, orient ='index') 
display_df.columns = ["P({word}|Experience)"]
display_df

Unnamed: 0,P({word}|Experience)
schedule,0.000973
flexibility,0.000000
including,0.025292
ability,0.007782
adapt,0.000000
...,...
vault,0.000973
legacy,0.000973
desk,0.002918
secret,0.000000


In [50]:
# Conditional Probabilties w.r.t class type - Education

display_df = pd.DataFrame.from_dict(education_cond_prob, orient ='index') 
display_df.columns = ["P({word}|Education)"]
display_df

Unnamed: 0,P({word}|Education)
schedule,0.000000
flexibility,0.000000
including,0.004124
ability,0.004124
adapt,0.000000
...,...
vault,0.000000
legacy,0.000000
desk,0.000000
secret,0.000000


In [51]:
# Conditional Probabilties w.r.t class type - Requirement

display_df = pd.DataFrame.from_dict(requirement_cond_prob, orient ='index') 
display_df.columns = ["P({word}|Requirement)"]
display_df

Unnamed: 0,P({word}|Requirement)
schedule,0.009396
flexibility,0.002685
including,0.022819
ability,0.163758
adapt,0.004027
...,...
vault,0.000671
legacy,0.001342
desk,0.000671
secret,0.002685


In [52]:
# Conditional Probabilties w.r.t class type - Skill

display_df = pd.DataFrame.from_dict(skill_cond_prob, orient ='index') 
display_df.columns = ["P({word}|Skill)"]
display_df

Unnamed: 0,P({word}|Skill)
schedule,0.002628
flexibility,0.000000
including,0.028909
ability,0.034166
adapt,0.001314
...,...
vault,0.001314
legacy,0.000000
desk,0.001314
secret,0.000000


In [53]:
# Conditional Probabilties w.r.t class type - Soft Skills

display_df = pd.DataFrame.from_dict(softskill_cond_prob, orient ='index') 
display_df.columns = ["P({word}|Soft Skill)"]
display_df

Unnamed: 0,P({word}|Soft Skill)
schedule,0.005241
flexibility,0.007338
including,0.010482
ability,0.208595
adapt,0.006289
...,...
vault,0.000000
legacy,0.000000
desk,0.000000
secret,0.000000


In [54]:
# Conditional Probabilties w.r.t class type - Responsibility

display_df = pd.DataFrame.from_dict(responsibility_cond_prob, orient ='index') 
display_df.columns = ["P({word}|Responsibility)"]
display_df

Unnamed: 0,P({word}|Responsibility)
schedule,0.009512
flexibility,0.000595
including,0.043401
ability,0.010107
adapt,0.001189
...,...
vault,0.000000
legacy,0.001189
desk,0.000000
secret,0.000000


In [55]:
# Defining the Naive Bayes function - which computes the probability of each class using Naive Bayes theorem
# And, the class with the highest probability becomes the predicted label for the given test sentence

# calculate the probability of the classes

total_rows = list_of_all_row_docs = exp_rows + edu_rows + skill_rows + req_rows + softskill_rows + resp_rows

prob_experience_class = len(experience_df)/len(total_rows)

prob_education_class = len(education_df)/len(total_rows)

prob_requirement_class = len(requirement_df)/len(total_rows)

prob_skill_class = len(skill_df)/len(total_rows)

prob_softskill_class = len(softskill_df)/len(total_rows)

prob_resposibility_class = len(responsibility_df)/len(total_rows)


def naiveBayes(exp_values, edu_values, req_values, skill_values, softskill_values, resp_values):
  """Return the most probable class label for one sentence.

  Each ``*_values`` argument is the list of per-word conditional
  probabilities of the sentence's words for that class. The class priors are
  read from the module-level ``prob_*_class`` variables.

  Scores are computed in log space, log(prior) + sum(log(p)), rather than as
  a raw product. This ranks the classes the same way but does not underflow
  to 0.0 on long sentences, and it accepts an empty word list (the score is
  then just the log prior) where ``reduce`` without an initial value raised
  TypeError.

  Returns:
    str: one of "Experience", "Education", "Requirement", "Skill",
    "SoftSkill", "Responsibility".
  """
  def log_score(cond_probs, class_prior):
    # A zero prior or a zero word probability makes the whole product zero,
    # which is -inf in log space.
    if class_prior <= 0:
      return float("-inf")
    total = math.log(class_prior)
    for prob in cond_probs:
      if prob <= 0:
        return float("-inf")
      total += math.log(prob)
    return total

  # Insertion order is kept identical to the original dict so that ties
  # (e.g. every class scoring zero) resolve to the same label as before.
  log_scores = {
    "Experience": log_score(exp_values, prob_experience_class),
    "Education": log_score(edu_values, prob_education_class),
    "Requirement": log_score(req_values, prob_requirement_class),
    "Skill": log_score(skill_values, prob_skill_class),
    "SoftSkill": log_score(softskill_values, prob_softskill_class),
    "Responsibility": log_score(resp_values, prob_resposibility_class),
  }

  # max() returns the first key among equal scores.
  return max(log_scores, key=log_scores.get)

In [56]:
# In this step, considering the dev data to predict the class type for each given sentence in dev data set
# displaying dev data

dev

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
6890,6890,UAESSK18283,Manage multiple priorities and work independen...,SoftSkill
7711,7711,PHEEDU46062,Bachelors Degree in Engineering or Chemistry.,Education
5000,5000,PHEEXP49816,Experience: 6-8 Years.,Experience
5853,5853,INDREQ20702,Travel Requirements: No Travel.,Requirement
1279,1279,PHERES28909,Debug and maintain test application and sourc...,Responsibility
...,...,...,...,...
3236,3236,PHEEDU34168,Critical SkillsProvides a critical role in ini...,Education
6172,6172,UAEEDU9612,Candidates with masters degree are preferable.,Education
1540,1540,PHERES1560,Coordinate user experience and design activiti...,Responsibility
1217,1217,INDRES2060,"Collaborate with Sales, Marketing, Technology...",Responsibility


In [57]:
# Preprocess dev data
# Splitting each document's sentence into lits of words
print(f"No. of dev data documnets: {len(dev)}\n")
processed_dev_data = preprocess(dev["New_Sentence"].tolist())


No. of dev data documnets: 800



In [58]:
def predictClassType(data, experience_cond_prob, education_cond_prob, requirement_cond_prob, skill_cond_prob, softskill_cond_prob, responsibility_cond_prob):
  """Predict a class label for every tokenised sentence in ``data``.

  For each word the conditional probability is looked up in each class
  table; a word missing from a table contributes the neutral factor 1, i.e.
  it is ignored. The six probability lists are handed to naiveBayes(), and
  the labels it returns are collected in input order.
  """
  # Same order as the positional parameters of naiveBayes().
  class_tables = (
    experience_cond_prob,
    education_cond_prob,
    requirement_cond_prob,
    skill_cond_prob,
    softskill_cond_prob,
    responsibility_cond_prob,
  )

  predicted_label_list = []
  for row in data:
    words = list(row)
    per_class_values = [
      [table.get(word, 1) for word in words]
      for table in class_tables
    ]
    predicted_label_list.append(naiveBayes(*per_class_values))

  return predicted_label_list

In [59]:
# Function call to predict the class type of each document in the dev data
predicted_label_list = predictClassType(processed_dev_data, experience_cond_prob, education_cond_prob, requirement_cond_prob, skill_cond_prob, softskill_cond_prob, responsibility_cond_prob)

In [60]:
# Display the predicted labels for each sentence in dev data
print("Predicted Class Type for each document in dev data: \n")
display_df = pd.DataFrame(list(zip(dev["New_Sentence"].tolist(), predicted_label_list)), columns = ["Sentence", "Predicted Class Type"])
display_df

Predicted Class Type for each document in dev data: 



Unnamed: 0,Sentence,Predicted Class Type
0,Manage multiple priorities and work independen...,SoftSkill
1,Bachelors Degree in Engineering or Chemistry.,Education
2,Experience: 6-8 Years.,Experience
3,Travel Requirements: No Travel.,Requirement
4,Debug and maintain test application and sourc...,Responsibility
...,...,...
795,Critical SkillsProvides a critical role in ini...,Responsibility
796,Candidates with masters degree are preferable.,Requirement
797,Coordinate user experience and design activiti...,Responsibility
798,"Collaborate with Sales, Marketing, Technology...",Responsibility


In [61]:
# Defining the function to calculate the accuracy 
# actual_label_list = dev["Type"].tolist()

def calculateAccuracy(predicted_label_list, actual_label_list, total_count):
  """Print the accuracy, in percent, over the first ``total_count`` items.

  Args:
    predicted_label_list: predicted labels, indexable by position.
    actual_label_list: true labels, aligned with ``predicted_label_list``.
    total_count: number of leading positions to compare.

  Prints a line of the form ``Acccuracy :  64.62 %`` and returns None.
  When ``total_count`` is 0 the accuracy is reported as 0 % instead of
  raising ZeroDivisionError.
  """
  if total_count == 0:
    accuracy = 0.0
  else:
    correct_predicted_count = sum(
      1
      for i in range(total_count)
      if predicted_label_list[i] == actual_label_list[i]
    )
    accuracy = (correct_predicted_count/total_count) * 100

  print(f"Acccuracy : {accuracy: .2f} %")

calculateAccuracy(predicted_label_list, dev["Type"].tolist(), len(dev))


Acccuracy :  59.62 %


In [62]:
# Implement Laplace smoothing

def laplaceSmoothing(processed_vocab_dict, row_document_list, smoothing_param):
  """Smoothed fraction of documents that contain each vocabulary word.

  For every word the value is
      (docs containing word + alpha) / (number of docs + k * alpha)
  where alpha is ``smoothing_param`` and k is the vocabulary size.

  Args:
    processed_vocab_dict: mapping whose keys are the vocabulary words.
    row_document_list: list of per-document mappings (token -> count); only
      key presence is used.
    smoothing_param: the additive smoothing constant alpha.

  Returns:
    dict mapping every vocabulary word to its smoothed probability. If the
    denominator is 0 (no documents and alpha == 0) every word maps to 0.0
    instead of raising ZeroDivisionError.
  """
  total_doc_count = len(row_document_list)

  # k is the number of features, i.e. the number of words in the vocabulary
  k_value = len(processed_vocab_dict)

  denominator = total_doc_count + (k_value * smoothing_param)
  if denominator == 0:
    return {word: 0.0 for word in processed_vocab_dict}

  # One pass over the documents gives every token's document frequency,
  # instead of rescanning all documents once per vocabulary word.
  doc_freq = Counter()
  for row_words in row_document_list:
    doc_freq.update(row_words.keys())

  return {
    word: (doc_freq[word] + smoothing_param) / denominator
    for word in processed_vocab_dict
  }


def laplaceSmoothingForMissingWord(word, k_value, n_value, smoothing_param):
  """Smoothed probability assigned to a word with zero observed occurrences.

  Computes alpha / (n + k * alpha), with alpha = ``smoothing_param``,
  n = ``n_value`` and k = ``k_value``. The ``word`` argument is accepted but
  does not influence the result.
  """
  denominator = n_value + (k_value * smoothing_param)
  return smoothing_param / denominator



In [63]:
# Calculate conditional probabilties for each each word in the vocabulary w.r.t each Class label

def calculateConditionalProbabilitiy(smoothing_param):
  """Build the Laplace-smoothed word tables for all six classes.

  Calls laplaceSmoothing() on the module-level vocabulary and on each
  class's row documents, and returns the six resulting dicts as a tuple in
  the order: Experience, Education, Requirement, Skill, SoftSkill,
  Responsibility.
  """
  rows_per_class = (exp_rows, edu_rows, req_rows, skill_rows, softskill_rows, resp_rows)

  return tuple(
    laplaceSmoothing(processed_vocab_dict, class_rows, smoothing_param)
    for class_rows in rows_per_class
  )


exp_cond_prob_laplace, edu_cond_prob_laplace, req_cond_prob_laplace, skill_cond_prob_laplace, softskill_cond_prob_laplace, responsibility_cond_prob_laplace = calculateConditionalProbabilitiy(1)


In [64]:
# Conditional Probabilties w.r.t class type - Experience
# If we observe the conditional probabilites of each word w.r.t to Experience class type - 
# none of the values are 0 - after performing laplace smoothing

print("Coniditonal probabilities w.r.t Experience class type - after performing Laplace Smoothing: \n")
display_df = pd.DataFrame.from_dict(exp_cond_prob_laplace, orient ='index') 
display_df.columns = ["P({word}|Experience)"]
display_df

Coniditonal probabilities w.r.t Experience class type - after performing Laplace Smoothing: 



Unnamed: 0,P({word}|Experience)
schedule,0.000735
flexibility,0.000367
including,0.009919
ability,0.003306
adapt,0.000367
...,...
vault,0.000735
legacy,0.000735
desk,0.001470
secret,0.000367


In [65]:
def predictClassTypeUsingSmoothing(data, experience_cond_prob, education_cond_prob, requirement_cond_prob, skill_cond_prob, softskill_cond_prob, responsibility_cond_prob, smoothing_param):
  """Predict a class label for every tokenised sentence, using smoothed tables.

  For each word the conditional probability is looked up in each class
  table. A word missing from a table gets that class's fallback value from
  laplaceSmoothingForMissingWord(), which depends only on the vocabulary
  size, the class's number of training rows and ``smoothing_param``.

  Args:
    data: iterable of token lists (one per sentence).
    *_cond_prob: per-class dicts mapping word -> smoothed probability.
    smoothing_param: the smoothing constant alpha used for the fallbacks.

  Returns:
    list of predicted labels, one per row of ``data``, in input order.
  """
  k_value = len(processed_vocab_dict)

  # (table, number of training rows) per class, in the positional order
  # expected by naiveBayes().
  class_tables = (
    (experience_cond_prob, len(experience_df)),
    (education_cond_prob, len(education_df)),
    (requirement_cond_prob, len(requirement_df)),
    (skill_cond_prob, len(skill_df)),
    (softskill_cond_prob, len(softskill_df)),
    (responsibility_cond_prob, len(responsibility_df)),
  )

  # The fallback does not depend on the word, so compute it once per class
  # here. As a .get() default it was re-evaluated for every word of every
  # row, including words that were present in the table.
  tables_with_fallback = [
    (table, laplaceSmoothingForMissingWord(None, k_value, class_size, smoothing_param))
    for table, class_size in class_tables
  ]

  predicted_label_list = []
  for row in data:
    words = list(row)
    per_class_values = [
      [table.get(word, fallback) for word in words]
      for table, fallback in tables_with_fallback
    ]
    predicted_label_list.append(naiveBayes(*per_class_values))

  return predicted_label_list
  


In [66]:
# Function call to predict the class type of each document in the dev data

predicted_label_list = predictClassTypeUsingSmoothing(processed_dev_data, exp_cond_prob_laplace, edu_cond_prob_laplace, req_cond_prob_laplace, skill_cond_prob_laplace, softskill_cond_prob_laplace, responsibility_cond_prob_laplace, 1)

In [67]:
# Display the predicted labels for each sentence in dev data
print("Predicted Class Type for each document in dev data, after Laplace Smoothing: \n")
display_df = pd.DataFrame(list(zip(dev["New_Sentence"].tolist(), predicted_label_list)), columns = ["Sentence", "Predicted Class Type"])
display_df


Predicted Class Type for each document in dev data, after Laplace Smoothing: 



Unnamed: 0,Sentence,Predicted Class Type
0,Manage multiple priorities and work independen...,SoftSkill
1,Bachelors Degree in Engineering or Chemistry.,Education
2,Experience: 6-8 Years.,Experience
3,Travel Requirements: No Travel.,Requirement
4,Debug and maintain test application and sourc...,Responsibility
...,...,...
795,Critical SkillsProvides a critical role in ini...,Responsibility
796,Candidates with masters degree are preferable.,Requirement
797,Coordinate user experience and design activiti...,Responsibility
798,"Collaborate with Sales, Marketing, Technology...",Responsibility


In [68]:
# Calculate the accuracy after performing Laplace Smoothing - smoothing parameter = 1

print("After performing Laplace Smoothing: \n\nSmoothing parameter (alpha) = 1\n")
calculateAccuracy(predicted_label_list, dev["Type"].tolist(), len(dev))

After performing Laplace Smoothing: 

Smoothing parameter (alpha) = 1

Acccuracy :  64.62 %


In [69]:
def experimentSmoothing(smoothing_param):
  """Score one smoothing value on the dev split and print the accuracy.

  Rebuilds the smoothed class tables for ``smoothing_param``, predicts a
  label for every row of the module-level ``processed_dev_data`` and passes
  the predictions with the true ``dev["Type"]`` labels to calculateAccuracy().
  """
  # Six smoothed P(word | class) tables, already in the order the predictor expects
  cond_prob_tables = calculateConditionalProbabilitiy(smoothing_param)

  # Predicted class type of each document in the dev data
  dev_predictions = predictClassTypeUsingSmoothing(
    processed_dev_data, *cond_prob_tables, smoothing_param
  )

  # Report the setting, then the accuracy
  print(f"After performing Laplace Smoothing: \n\nSmoothing parameter (alpha) = {smoothing_param}\n")
  calculateAccuracy(dev_predictions, dev["Type"].tolist(), len(dev))


In [70]:
# Experimenting with smoothing_param = 1

experimentSmoothing(1)

After performing Laplace Smoothing: 

Smoothing parameter (alpha) = 1

Acccuracy :  64.62 %


In [71]:
# Experimenting with smoothing_param = 100 

experimentSmoothing(100)

After performing Laplace Smoothing: 

Smoothing parameter (alpha) = 100

Acccuracy :  57.25 %


In [72]:
# Experimenting with smoothing_param = 200

experimentSmoothing(200)

After performing Laplace Smoothing: 

Smoothing parameter (alpha) = 200

Acccuracy :  55.12 %


In [73]:
# Use the test dataset - by the experiments performed above - choose the best smoothing param value 


In [74]:
# Displaying the test data

test

Unnamed: 0.1,Unnamed: 0,Sentence_id,New_Sentence,Type
7817,7817,COGSKL19114,"Strong computer skills, especially Microsoft ...",Skill
2993,2993,HONREQ6132,Experience in healthcare domain or Facets imp...,Requirement
5512,5512,HONREQ20609,Experience in the software development of sen...,Requirement
6313,6313,PHEREQ1496,Experience in designing and implementing REST...,Requirement
1191,1191,COGRES50407,Continuously monitor the working environment a...,Responsibility
...,...,...,...,...
5226,5226,UAESSK828,Demonstrated analytical ability and problem-so...,SoftSkill
5390,5390,COGEXP1832,5+ years standard costing experience.,Experience
860,860,PHEREQ41860,Demonstrated skills developing and delivering ...,Requirement
7603,7603,COGSSK27119,Excellent English verbal and written communic...,SoftSkill


In [75]:
# Preprocess test data
# Splitting each document's sentence into list of words

# Report the size of the test split (this line previously printed len(dev)).
print(f"No. of test data documnets: {len(test)}\n")
processed_test_data = preprocess(test["New_Sentence"].tolist())

No. of test data documnets: 800



In [76]:
# Function to calculate the final accuracy

def calculateFinalAccuracy(smoothing_param):
  """Score one smoothing value on the held-out test split and print the accuracy.

  Rebuilds the smoothed class tables for ``smoothing_param``, predicts a
  label for every row of the module-level ``processed_test_data`` and passes
  the predictions with the true ``test["Type"]`` labels to calculateAccuracy().
  """
  # Six smoothed P(word | class) tables, already in the order the predictor expects
  cond_prob_tables = calculateConditionalProbabilitiy(smoothing_param)

  # Predicted class type of each document in the test data
  test_predictions = predictClassTypeUsingSmoothing(
    processed_test_data, *cond_prob_tables, smoothing_param
  )

  # Report the setting, then the accuracy
  print(f"Smoothing parameter (alpha) = {smoothing_param}\n")
  calculateAccuracy(test_predictions, test["Type"].tolist(), len(test))


In [77]:
# Function call to calculate final accuracy

print("Final accuracy achieved: \n")
calculateFinalAccuracy(1)

Final accuracy achieved: 

Smoothing parameter (alpha) = 1

Acccuracy :  63.75 %
