In [0]:
# Mount Google Drive into the Colab filesystem; everything below reads its data
# from paths underneath the '/content/drive' mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
# Packages to install
!pip install PyPDF2
!pip install textract
!pip install nltk

In [0]:
# Imports

# General
import copy
import os
from os import listdir
from os.path import isfile, join
import numpy as np
import random
from collections import *

# Tokenization
import PyPDF2 
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
nltk.download('stopwords')

# Classification
import sklearn
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.neural_network import MLPClassifier as mlp

In [0]:
# List the resume folder so its current contents appear in the cell output.
# NOTE(review): presumably this also nudges Colab to refresh its view of the
# shared drive before the folder is read below - confirm.
!ls "/content/drive/Shared drives/585_final_project/HackHer413_Resumes/"

In [0]:
# Folder holding the resumes, and the names of the regular files directly
# inside it (sub-directories are filtered out).
mypath = "/content/drive/Shared drives/585_final_project/HackHer413_Resumes/"
onlyfiles = list(filter(lambda entry: isfile(join(mypath, entry)), listdir(mypath)))


In [0]:
# Extract the raw text of every resume into `texts` as (file name, text) pairs.
# PyPDF2 is tried first; when it yields nothing (e.g. a scanned PDF) the file is
# run through textract's tesseract OCR instead.
import re  # hoisted out of the per-file loop; kept so `re` stays bound for later cells

texts = []
for f in onlyfiles:
  filename = join(mypath, f)
  print(filename)
  text = ""
  # `with` closes the handle even if PyPDF2 raises part-way through a file
  with open(filename, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfFileReader(pdf_file, strict=False)
    for page_number in range(pdf_reader.numPages):
      text += pdf_reader.getPage(page_number).extractText()
  # Same character substitutions as before, applied once to the whole document
  # instead of re-scanning the accumulated text after every page.
  text = (text.replace('\r', '!')
              .replace('\n', '')
              .replace('\t', '^')
              .replace('\v', '*')
              .lower())
  if text == "":
    # The fallback previously referenced an undefined name (`fileurl`); the
    # resulting NameError was swallowed by a bare `except`, so OCR never ran.
    try:
      ocr_output = textract.process(filename, method='tesseract', language='eng')
      # The tokenizer downstream needs str, so decode if textract returned bytes
      if isinstance(ocr_output, bytes):
        ocr_output = ocr_output.decode('utf-8', errors='ignore')
      text = ocr_output.lower()
    except Exception as ocr_error:
      # Best effort: keep the resume with empty text, but say why OCR failed
      print('OCR failed for', filename, '-', ocr_error)
      text = ""
  texts.append((f, text))

In [0]:
# Number of (file name, text) pairs collected by the extraction loop
print('Size of raw dataset:', len(texts))

In [0]:
def anonymize(keywords):
  '''
  anonymize:

  DESCRIPTION:
  Takes a tokenized resume and drops every token that comes before the first
  section-header word (education, experience, skills, technical, research,
  projects, objective, activities or interests, in Title/lower/UPPER case).
  The header token itself is kept. If no header word occurs, everything is
  dropped.

  PARAMS:
  keywords - tokenized data from a single resume

  RETURN:
  a new sequence holding the tokens from the first header word onwards
  (empty when no header word is present); the input is not modified
  '''
  # 'school' is deliberately not in this set: the original check never consulted
  # its `school` list, so cutting at "school" would change which text survives.
  section_headers = {
      'Education', 'education', 'EDUCATION',
      'Experience', 'experience', 'EXPERIENCE',
      'Skills', 'skills', 'SKILLS',
      'Technical', 'technical', 'TECHNICAL',
      'Research', 'research', 'RESEARCH',
      'Projects', 'projects', 'PROJECTS',
      'Objective', 'objective', 'OBJECTIVE',
      'Activities', 'activities', 'ACTIVITIES',
      'Interests', 'interests', 'INTERESTS',
  }
  # One slice at the first header instead of re-slicing once per skipped token
  for position, token in enumerate(keywords):
    if token in section_headers:
      return keywords[position:]
  return keywords[len(keywords):]

In [0]:
# Helper function for categorize
def make_false(flag_array, target):
  '''
  Switch off slot 0 of every flag list in flag_array except `target`.

  `target` is recognised by identity (`is`), not by equality. The returned list
  is a shallow copy of flag_array, so the inner flag lists are the caller's own
  objects and are modified in place.
  '''
  shallow = copy.copy(flag_array)
  for entry in shallow:
    if entry is target:
      continue
    entry[0] = False
  return shallow

In [0]:
def categorize(keywords):
  '''
  categorize:

  DESCRIPTION:
  Sorts anonymized data into general resume categories, retaining token order.
  The first time a section-header word is met (and the cooldown below has run
  out) its section becomes the current one; every following token is appended
  to the current section. Later repeats of a header word do not re-open a
  section: a repeat of the current section's own header is skipped, any other
  header word is stored as an ordinary token.

  PARAMS:
  keywords - anonymized list of resume tokens, in document order

  RETURN:
  a dictionary mapping 'education', 'experience', 'skills', 'research',
  'projects', 'objective', 'activities' and 'interests' (in that key order)
  to the list of tokens collected for that section
  '''
  # (result key, accepted header spellings). The tuple order is the key order
  # of the returned dict, which later cells rely on when concatenating.
  section_headers = (
      ('education', ('Education', 'education', 'EDUCATION',
                     'School', 'school', 'SCHOOL')),
      ('experience', ('Experience', 'experience', 'EXPERIENCE')),
      ('skills', ('Skills', 'skills', 'SKILLS',
                  'Technical', 'technical', 'TECHNICAL')),
      ('research', ('Research', 'research', 'RESEARCH')),
      ('projects', ('Projects', 'projects', 'PROJECTS')),
      ('objective', ('Objective', 'objective', 'OBJECTIVE')),
      ('activities', ('Activities', 'activities', 'ACTIVITIES')),
      ('interests', ('Interests', 'interests', 'INTERESTS')),
  )
  header_to_section = {}
  for section_name, spellings in section_headers:
    for spelling in spellings:
      header_to_section[spelling] = section_name
  categories = {section_name: [] for section_name, spellings in section_headers}

  # After a section opens, at least `count_val` tokens must be stored before
  # another header word may open the next section. This keeps phrases right
  # after a header from accidentally starting a new section.
  # ** in future be sure to check for 'research intern' or 'research assistant' **
  count_val = 3
  counter = 0
  current = None     # section currently receiving tokens
  opened = set()     # sections whose header has already been used once

  for word in keywords:
    matched = header_to_section.get(word)
    if matched is not None and matched not in opened and counter <= 0:
      current = matched
      opened.add(matched)
      counter = count_val
    # Store the token unless it is a header spelling of the current section
    if current is not None and matched != current:
      categories[current].append(word)
      counter -= 1
  return categories

In [0]:
# Tokenize every extracted resume, drop stop words / punctuation, anonymize it
# and sort what is left into resume sections.
tokenized_keywords = []    # (file name, anonymized tokens)
tokenized_categories = []  # (file name, {section: tokens})
success_files = []         # file names that produced usable tokens
c = 0                      # resumes left empty by anonymize()

# Loop-invariant filters, built once instead of once per resume.
#we'll create a new list which contains punctuation we wish to clean
punctuations = ['(',')',';',':','[',']',',']
#stop words are words like "The", "I", "and", etc. that don't hold much value as keywords
stop_words = stopwords.words('english')
# A set makes the per-token membership test O(1) instead of a linear list scan
excluded_tokens = set(stop_words) | set(punctuations)

for t in texts:
  #The word_tokenize() function will break our text phrases into individual words
  tokens = word_tokenize(t[1])
  keywords = [word for word in tokens if word not in excluded_tokens]
  if keywords != []:
    k = anonymize(keywords)
    if k == []:
      c += 1
    else:
      # categorize only the resumes that are kept
      cats = categorize(k)
      tokenized_keywords.append((t[0], k))
      tokenized_categories.append((t[0],cats))
      success_files.append(t[0])

print('Size of cleaned dataset:',len(tokenized_keywords))
print(tokenized_categories[0:2])


In [0]:
# Small test to see how we're doing: display the first (file name, categories) pair.
# It should not show any names or other identifying information, apart from the
# file name itself, which may contain a name.
tokenized_categories[0]

In [0]:
# Loads a GloVe embeddings text file (this notebook uses a 50-dimensional one)
def loadGloveModel(gloveFile):
    """Read a GloVe-style text file into a dict of word -> numpy vector.

    Each non-empty line is expected to hold a word followed by its float
    components, separated by whitespace. Blank lines are skipped.

    Args:
        gloveFile: path of the embeddings text file.

    Returns:
        dict mapping each word (first field of a line) to a numpy array built
        from the remaining fields.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a component cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # `with` closes the handle; the explicit encoding avoids depending on the
    # platform default when words contain non-ASCII characters.
    with open(gloveFile, 'r', encoding='utf-8') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # a blank line would otherwise raise IndexError on splitLine[0]
                continue
            word = splitLine[0]
            embedding = np.array([float(val) for val in splitLine[1:]])
            model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

In [0]:
# Load the GloVe file from the shared drive into `glove_model`
# (a dict of word -> numpy vector, as returned by loadGloveModel)
glove_file = "/content/drive/Shared drives/585_final_project/glove_words.txt"
glove_model = loadGloveModel(glove_file)


In [0]:
# Sanity check to make sure the model was loaded correctly: prints whether the
# very common word 'the' is one of the keys of glove_model
print('the' in glove_model)

In [0]:
# Assign a GloVe vector to every known word of every resume. Result layout:
#   model_embeddings[file name] = (times the file name was seen, resume_embeddings)
#   resume_embeddings[category] = (1, category_embeddings)
#   category_embeddings[word]   = (occurrences in that category, GloVe vector)
# Words missing from glove_model are skipped.
#
# To build the random-embedding baseline, replace glove_model[word] below with
# generate_random_embedding() (run the cell that defines it first).
model_embeddings = {}
for file_name, categorized in tokenized_categories:
  resume_embeddings = {}
  for cat, cat_words in categorized.items():
    category_embeddings = {}
    for word in cat_words:
      if word in glove_model:
        occurrences = category_embeddings[word][0] if word in category_embeddings else 0
        category_embeddings[word] = (occurrences + 1, glove_model[word])
    # Keys of a dict are unique, so a category can never already be present
    # here; its count is always 1 (the former "already present" branch was
    # unreachable).
    resume_embeddings[cat] = (1, category_embeddings)
  # A repeated file name keeps the newest embeddings and bumps the counter
  times_seen = model_embeddings[file_name][0] if file_name in model_embeddings else 0
  model_embeddings[file_name] = (times_seen + 1, resume_embeddings)


In [0]:
# Random baseline: a fresh 50-dimensional vector on every call, so two
# occurrences of the same word will generally get different embeddings
def generate_random_embedding():
    """Return a numpy array of 50 floats, each drawn with random.uniform(-1, 1)."""
    return np.array([random.uniform(-1, 1) for _ in range(50)])

In [0]:
# Sanity check to make sure the embeddings were created as expected: show the
# entry of the first resume in model_embeddings. Looking it up by position
# avoids hard-coding one person's file name in the notebook and avoids a
# KeyError when that particular file is not part of the dataset.
model_embeddings[next(iter(model_embeddings))]

In [0]:
# A word that occurs several times in a category should weigh more than one
# that occurs once: on a deep copy of model_embeddings, replace every
# (occurrences, vector) pair with the single array occurrences * vector.
single_embeddings = copy.deepcopy(model_embeddings)
for resume in single_embeddings:
  categories_of_resume = single_embeddings[resume][1]
  for activity in categories_of_resume:
    word_table = categories_of_resume[activity][1]
    for word_embedding in word_table:
      occurrences, vector = word_table[word_embedding]
      word_table[word_embedding] = occurrences * vector

In [0]:
# Sanity check to make sure the weighted embeddings were created as expected:
# show the entry of the first resume in single_embeddings. Looking it up by
# position avoids hard-coding one person's file name in the notebook and avoids
# a KeyError when that particular file is not part of the dataset.
single_embeddings[next(iter(single_embeddings))]

In [0]:
# Collapse each category to one 50-d vector: add up its (count-weighted) word
# vectors and divide by the number of distinct words (an empty category stays
# all zeros). The category vectors are then concatenated, in category order,
# into one flat list per resume: resume_embeddings = [[file name, values], ...]
resume_embeddings = []
for resume in single_embeddings:
  category_means = []
  for activity in single_embeddings[resume][1]:
    word_vectors = single_embeddings[resume][1][activity][1]
    # built-in sum adds left to right starting from the zero vector
    activity_sum = sum(word_vectors.values(), np.zeros(50))
    if len(word_vectors) != 0:
      activity_sum = activity_sum / len(word_vectors)
    category_means.append(activity_sum)
  flat_vector = np.concatenate([np.array([])] + category_means)
  resume_embeddings.append([resume, flat_vector.tolist()])




In [0]:
# Sanity test: print up to the first five resume embeddings. Slicing, unlike
# indexing with range(5), does not fail when fewer than five resumes exist.
for embedding_row in resume_embeddings[:5]:
  print(embedding_row)

In [0]:
# yx_label: list of (class id, file name) tuples, one per file found in each
# class folder on the shared drive.
# To support a new class, add its (folder name, class id) pair to directory_names.
yx_label = []
directory_names = [('AI', 0),('fullstack_SWE', 1), ('Hardware', 2), ('Informatics', 3), ('Other', 4), ('inexperienced_SWE', 5),('Web_Developer', 6)]
for folder, class_id in directory_names:
  temp = "/content/drive/Shared drives/585_final_project/"+folder+"/"
  for name in listdir(temp):
    # keep regular files only, skipping any sub-directory
    if isfile(join(temp, name)):
      yx_label.append((class_id, name))
print(yx_label[0])


In [0]:
# Looks up the labelled entry for a file name
def find(file_name, list_of_files):
  '''
  Return the first tuple in list_of_files whose element [1] equals file_name.
  When no entry matches, return (4, file_name); 4 is the id paired with the
  'Other' folder in directory_names.
  '''
  matches = (entry for entry in list_of_files if entry[1] == file_name)
  return next(matches, (4, file_name))

In [0]:
# Attach the class label to the embedding data: every entry becomes
# [class id, file name, embedding values], built from a deep copy so that
# resume_embeddings itself is left untouched
labeled_resume_embeddings = []
for file_name, vector in copy.deepcopy(resume_embeddings):
  class_id = find(file_name, yx_label)[0]
  labeled_resume_embeddings.append([class_id, file_name, vector])
print(labeled_resume_embeddings[0])

In [0]:
# Format training and testing data: shuffle the (label, features) pairs, then
# use the first 4/5 for training and the rest for testing.
GLOVE_SPLIT_SEED = 0  # fixed seed so the split is the same for the same input order

y_data = []
x_data = []
combined_data = [(resume[0], resume[2]) for resume in labeled_resume_embeddings]
# A private Random instance makes the shuffle repeatable without reseeding the
# global `random` state used elsewhere in the notebook.
# NOTE(review): the split only repeats exactly if labeled_resume_embeddings
# comes out in the same order on every run (it follows the folder listing).
random.Random(GLOVE_SPLIT_SEED).shuffle(combined_data)
for label, features in combined_data:
  y_data.append(label)
  x_data.append(features)

# 80/20 cut, computed once with integer arithmetic
split_at = (len(x_data) * 4) // 5
x_train = np.array(x_data[:split_at])
x_test = np.array(x_data[split_at:])
y_train = np.array(y_data[:split_at])
y_test = np.array(y_data[split_at:])

# Print shapes of sets as a sanity check
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [0]:
# Classifiers: grid-search three models on the GloVe features. For each search
# the candidate settings, mean validation and training accuracy, and the best
# setting with its score are printed.
gradient_boost_classifier = gbc()
random_forest_classifier = rfc()
multilayer_perceptron_classifier = mlp()

# Gradient boosting
grid_params = {"n_estimators":[5, 7, 10, 15]}
search = GridSearchCV(gradient_boost_classifier, grid_params , scoring="accuracy",
                    return_train_score=True, refit=True, cv=5)
search.fit(x_train, y_train)
results = search.cv_results_
print(results.get('params'))
print(results.get('mean_test_score'))
print(results.get('mean_train_score'))
print(search.best_params_)
print(search.best_score_)

# Random forest: searched over its own grid. grid_params_2 was defined for this
# purpose but the gradient-boosting grid was being passed instead.
grid_params_2 = {"n_estimators":[5, 7, 10, 15]}
search_2 = GridSearchCV(random_forest_classifier, grid_params_2 , scoring="accuracy",
                    return_train_score=True, refit=True, cv=5)
search_2.fit(x_train, y_train)
results_2 = search_2.cv_results_
print(results_2.get('params'))
print(results_2.get('mean_test_score'))
print(results_2.get('mean_train_score'))
print(search_2.best_params_)
print(search_2.best_score_)

# Multilayer perceptron (3-fold here, unlike the 5-fold searches above)
grid_params = {"activation":['tanh', 'relu'], "alpha":[5, 6, 7, 8, 9, 10], "learning_rate":["constant", "adaptive"]}
mlp_classifier = GridSearchCV(multilayer_perceptron_classifier, grid_params , scoring="accuracy",
                    return_train_score=True, refit=True, cv=3)
mlp_classifier.fit(x_train, y_train)
results = mlp_classifier.cv_results_
print(results.get('params'))
print(results.get('mean_test_score'))
print(results.get('mean_train_score'))
print(mlp_classifier.best_params_)
print(mlp_classifier.best_score_)

In [0]:
# Score each fitted grid search on the held-out test split
for report_label, fitted_search in (
    ("Gradient Boost accuracy: ", search),
    ("Random Forest accuracy: ", search_2),
    ("Multilayer Perceptron Model accuracy: ", mlp_classifier),
):
  print(report_label, fitted_search.score(x_test, y_test))

In [0]:
# ELMo time!
!pip install allennlp
import allennlp
from allennlp.commands.elmo import ElmoEmbedder

Elmo = ElmoEmbedder()
tokens = ["Roasted", "ants", "are", "a", "popular", "snack", "in", "Columbia"]
vectors = Elmo.embed_sentence(tokens)

In [0]:
# Sanity check: display the embedding returned for the sample sentence
vectors

In [0]:
# Same labelling idea as for GloVe: on a deep copy of tokenized_categories,
# turn every (file name, categories) pair into
# (class id, file name, categories)
content = [
    (find(file_name, yx_label)[0], file_name, categorized)
    for file_name, categorized in copy.deepcopy(tokenized_categories)
]

In [0]:
# Sanity check: print the first (class id, file name, categories) entry
print(content[0])

In [0]:
# Embed every resume category with the ELMo embedder (nothing is trained here).
# For a non-empty category the embedder output is averaged over its first axis
# and then over the next one; an empty category contributes a (1, 1024) block
# of zeros. elmo_embeddings holds (file name, [one array per category]).
elmo_embeddings = []
labeled_content = copy.deepcopy(content)
count = 0
for resume in labeled_content:
  per_category = resume[2]
  elmo_embedding = []
  for category in per_category:
    section_tokens = per_category[category]
    if section_tokens == []:
      elmo_embedding.append(np.zeros((1, 1024)))
      continue
    embedded = Elmo.embed_sentence(section_tokens)
    reduced_once = np.mean(embedded, axis=0)
    elmo_embedding.append(np.mean(reduced_once, axis=0))
  elmo_embeddings.append((resume[1], elmo_embedding))
  # progress indicator: index of the resume just finished / total
  print("{}/{}".format(count, len(labeled_content)))
  count += 1





In [0]:
# Flatten the per-category ELMo arrays into one feature vector per resume.
# Each category arrives either as an already averaged vector of 1024 values or
# as a (1, 1024) block of zeros (empty category). Both are viewed as rows of
# 1024 values and averaged row-wise, which keeps the 1024 distinct components.
#
# The earlier version looped over the *elements* of the 1024-vector and added
# each scalar to a (1, 1024) accumulator, so every category collapsed to its
# scalar mean repeated 1024 times.
c_elmo = copy.deepcopy(elmo_embeddings)
e_embeddings = []
for resume in c_elmo:
  category_vectors = [
      np.asarray(category).reshape(-1, 1024).mean(axis=0)
      for category in resume[1]
  ]
  # Leading empty array keeps the float64 result and tolerates zero categories
  concat_array = np.concatenate([np.array([])] + category_vectors)
  e_embeddings.append((resume[0], concat_array))

In [0]:
# Sanity check: print the first (file name, concatenated ELMo vector) pair
print(e_embeddings[0])

In [0]:
# Format data for classifiers: label every ELMo feature vector, shuffle, then
# use the first 4/5 for training and the rest for testing.
# Note: this overwrites x_train / x_test / y_train / y_test from the GloVe run.
ELMO_SPLIT_SEED = 0  # fixed seed so the split is the same for the same input order

c_elmoo = copy.deepcopy(e_embeddings)
y_trains = [[find(resume[0], yx_label)[0], resume[1]] for resume in c_elmoo]

y_data = []
x_data = []
combined_data = [(yx[0], yx[1]) for yx in y_trains]
# A private Random instance makes the shuffle repeatable without reseeding the
# global `random` state.
# NOTE(review): the split only repeats exactly if e_embeddings comes out in the
# same order on every run (it follows the folder listing).
random.Random(ELMO_SPLIT_SEED).shuffle(combined_data)
for label, features in combined_data:
  y_data.append(label)
  x_data.append(features)

# 80/20 cut, computed once with integer arithmetic
split_at = (len(x_data) * 4) // 5
x_train = x_data[:split_at]
x_test = x_data[split_at:]
y_train = y_data[:split_at]
y_test = y_data[split_at:]
print(len(x_train))
print(len(x_test))
print(len(y_train))
print(len(y_test))
print(y_test)

In [0]:
# Classifiers: grid-search gradient boosting and random forest on the ELMo
# features. For each search the candidate settings, mean validation and
# training accuracy, and the best setting with its score are printed.
gradient_boost_classifier = gbc()
random_forest_classifier = rfc()

# Gradient boosting
grid_params = {"n_estimators":[10, 20, 30, 35]}
search = GridSearchCV(gradient_boost_classifier, grid_params , scoring="accuracy",
                    return_train_score=True, refit=True, cv=5)
search.fit(x_train, y_train)
results = search.cv_results_
print(results.get('params'))
print(results.get('mean_test_score'))
print(results.get('mean_train_score'))
print(search.best_params_)
print(search.best_score_)

# Random forest: searched over grid_params_2 (1..10 trees). The gradient
# boosting grid was being passed here, so this grid was never actually used.
grid_params_2 = {"n_estimators":[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}
search_2 = GridSearchCV(random_forest_classifier, grid_params_2 , scoring="accuracy",
                    return_train_score=True, refit=True, cv=5)
search_2.fit(x_train, y_train)
results_2 = search_2.cv_results_
print(results_2.get('params'))
print(results_2.get('mean_test_score'))
print(results_2.get('mean_train_score'))
print(search_2.best_params_)
print(search_2.best_score_)


In [0]:
# Score both fitted grid searches on the held-out ELMo test split
for report_label, fitted_search in (
    ("Gradient Boost accuracy: ", search),
    ("Random Forest accuracy: ", search_2),
):
  print(report_label, fitted_search.score(x_test, y_test))

In [0]:
# Grid-search a multilayer perceptron on the ELMo features (tanh activation,
# constant learning rate, a range of alpha values) with 5-fold validation
multilayer_perceptron_classifier = mlp()
grid_params = {
    "activation": ['tanh'],
    "alpha": [0.001, 0.01, 0.1, 1, 3, 5, 6, 7, 8],
    "learning_rate": ["constant"],
}
mlp_classifier = GridSearchCV(
    multilayer_perceptron_classifier,
    grid_params,
    scoring="accuracy",
    return_train_score=True,
    refit=True,
    cv=5,
)
mlp_classifier.fit(x_train, y_train)
results = mlp_classifier.cv_results_
# candidate settings, then mean validation and mean training accuracy
for result_key in ('params', 'mean_test_score', 'mean_train_score'):
  print(results.get(result_key))
print(mlp_classifier.best_params_)
print(mlp_classifier.best_score_)

In [0]:
# Score the fitted multilayer perceptron grid search on the held-out ELMo test split
print("Multilayer Perceptron Model accuracy: ",mlp_classifier.score(x_test, y_test))