In [142]:
# Imports
import copy
import os
from os import listdir
from os.path import isfile, join
import PyPDF2 
import textract
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
import sklearn
from sklearn.ensemble import GradientBoostingClassifier as gbc
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.naive_bayes import GaussianNB as nbc
from sklearn.feature_extraction.text import CountVectorizer 
from collections import *
nltk.download('punkt')
nltk.download('stopwords')
import numpy as np
import pickle
import random

[nltk_data] Downloading package punkt to /Users/hlerner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hlerner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [143]:
# This line refreshes the known data in the resume folder
!ls "HackHer413_Resumes"

<names of resume files>


In [144]:

mypath = "HackHer413_Resumes"
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

In [145]:
texts = []
for filename in onlyfiles:
  f = filename
  filename = mypath+"/"+filename
  print(filename)
  #open allows you to read the file
  pdfFileObj = open(filename,'rb')
  #The pdfReader variable is a readable object that will be parsed
  pdfReader = PyPDF2.PdfFileReader(pdfFileObj, strict=False)
  #discerning the number of pages will allow us to parse through all #the pages
  num_pages = pdfReader.numPages
  count = 0
  text = ""
  #The while loop will read each page
  while count < num_pages:
      pageObj = pdfReader.getPage(count)
      count +=1
      text += pageObj.extractText()
      text = text.replace('\r','!')
      text = text.replace('\n','')
      text = text.replace('\t','^')
      text = text.replace('\v','*')
      text = text.lower()
  # split into words by white space
  # split into words by white space

  # remove punctuation from each word
  import re
  words = re.split(r'\W+', text)
  #This if statement exists to check if the above library returned #words. It's done because PyPDF2 cannot read scanned files.
  if text != "":
     text = text
  #If the above returns as False, we run the OCR library textract to #convert scanned/image based PDF files into text
  else:
    try:
      text = textract.process(fileurl, method='tesseract', language='eng')
    except:
      text = ""
  # Now we have a text variable which contains all the text derived #from our PDF file. Type print(text) to see what it contains. It #likely contains a lot of spaces, possibly junk such as '\n' etc.
  # Now, we will clean our text variable, and return it as a list of keywords.
  texts.append((f, text))

<names of resume files>


<names of resume files>


In [146]:
print('Size of raw dataset:', len(texts))

Size of raw dataset: 621


In [147]:
'''
anonymize:

DESCRIPTION:
takes in tokenized resume and removes identifying information. Approaches task 
by removing all text before a few 'action' words. This process
also conveniently cleans the data of a few garbage tokens.

PARAMS:
keywords - tokenized data from a single resume

RETURN:
a copy of keywords with id info scrubbed

'''
def anonymize(keywords):
  lk = len(keywords)
  keewords = copy.copy(keywords)
  education = ['Education', 'education', 'EDUCATION']
  school = ['School', 'school', 'SCHOOL']
  experience = ['Experience', 'experience', 'EXPERIENCE']
  skills = ['Skills', 'skills', 'SKILLS']
  technical = ['Technical', 'technical', 'TECHNICAL']
  research = ['Research', 'research', 'RESEARCH']
  projects = ['Projects', 'projects', 'PROJECTS']
  objective = ['Objective', 'objective', 'OBJECTIVE']
  activities = ['Activities', 'activities', 'ACTIVITIES']
  interests = ['Interests', 'interests', 'INTERESTS']
  for word in range(lk):
    if (keywords[word] in education or keywords[word] in experience or
        keywords[word] in skills or keywords[word] in technical or
       keywords[word] in research or keywords[word] in projects or
       keywords[word] in objective or keywords[word] in activities or
       keywords[word] in interests):
      break
    else:
      keewords = keewords[1:]
  return keewords

In [148]:
def make_false(flag_array, target):
  flag_arr = copy.copy(flag_array)
  for flag in flag_arr:
    if flag is not target:
      flag[0] = False
  return flag_arr

In [149]:
'''
categorize:

DESCRIPTION:
Sorts anonymized data into general resume categories retaining order

PARAMS:
keywords - anonymized list of resume data in order-ish

RETURN:
a dictionary of the categorized resume
'''
def categorize(keywords):
  education = ['Education', 'education', 'EDUCATION', 'School', 'school', 'SCHOOL']
  
  # Flag to determine both if we run into the word and are in the 
  # section (flag[0]), as well as if we have seen it before (flag[1])
  # Given nature of reumes, first time we encounter these words is 
  # overwhelmingly the section header
  edu = [False, False]
  experience = ['Experience', 'experience', 'EXPERIENCE']
  exp = [False, False]
  skills = ['Skills', 'skills', 'SKILLS', 'Technical', 'technical', 'TECHNICAL']
  tech = [False, False]
  research = ['Research', 'research', 'RESEARCH']
  res = [False, False]
  projects = ['Projects', 'projects', 'PROJECTS']
  pro = [False, False]
  objective = ['Objective', 'objective', 'OBJECTIVE']
  obj = [False, False]
  activities = ['Activities', 'activities', 'ACTIVITIES']
  act = [False, False]
  interests = ['Interests', 'interests', 'INTERESTS']
  inter = [False, False]
  flags = [edu, exp, tech, res, pro, obj, act, inter]
  categories_without_skills_and_tech = ['Education', 'education', 'EDUCATION',
                                        'School', 'school', 'SCHOOL'
                                       'Experience', 'experience', 'EXPERIENCE',
                                       'Research', 'research', 'RESEARCH',
                                        'Projects', 'projects', 'PROJECTS',
                                        'Objective', 'objective', 'OBJECTIVE',
                                        'Activities', 'activities', 'ACTIVITIES',
                                        'Interests', 'interests', 'INTERESTS']
  all_cats = ['Education', 'education', 'EDUCATION',
              'School', 'school', 'SCHOOL'
              'Experience', 'experience', 'EXPERIENCE',
              'Skills', 'skills', 'SKILLS',
              'Technical', 'technical', 'TECHNICAL',
              'Research', 'research', 'RESEARCH',
              'Projects', 'projects', 'PROJECTS',
              'Objective', 'objective', 'OBJECTIVE',
              'Activities', 'activities', 'ACTIVITIES',
              'Interests', 'interests', 'INTERESTS']
  
  categories = {'education':[], 'experience':[], 'skills':[], 'research':[],
                'projects':[], 'objective':[], 'activities':[], 'interests':[]}
  words = copy.copy(keywords)
  '''
  this counter + counter_val are to prevent accidentally going into the next
  section
  ** in future be sure to check for 'research intern' or 'research assistant' **
  '''
  counter = 0
  count_val = 3
  for word in words:
    if (word in education and edu[1] == False and counter <= 0):
      edu[0] = True
      edu[1] = True
      counter = count_val
      flags = make_false(flags, edu)
    elif (word in experience and exp[1] == False and counter <= 0):
      exp[0] = True
      exp[1] = True
      counter = count_val
      flags = make_false(flags, exp)
    elif (word in skills and tech[1] == False and counter <= 0):
      tech[0] = True
      tech[1] = True
      counter = count_val
      flags = make_false(flags, tech)
    elif (word in research and res[1] == False and counter <= 0):
      res[0] = True
      res[1] = True
      counter = count_val
      flags = make_false(flags, res)
    elif (word in projects and pro[1] == False and counter <= 0):
      pro[0] = True
      pro[1] = True
      counter = count_val
      flags = make_false(flags, pro)
    elif (word in objective and obj[1] == False and counter <= 0):
      obj[0] = True
      obj[1] = True
      counter = count_val
      flags = make_false(flags, obj)
    elif (word in activities and act[1] == False and counter <= 0):
      act[0] = True
      act[1] = True
      counter = count_val
      flags = make_false(flags, act)
    elif (word in interests and inter[1] == False and counter <= 0):
      inter[0] = True
      inter[1] = True
      counter = count_val
      flags = make_false(flags, inter)
    
    if (edu[0] and word not in education):
      categories['education'].append(word)
      counter -=1
    if (exp[0] and word not in experience):
      categories['experience'].append(word)
      counter -=1
    if (tech[0] and word not in skills):
      categories['skills'].append(word)
      counter -=1
    if (res[0] and word not in research):
      categories['research'].append(word)
      counter -=1
    if (pro[0] and word not in projects):
      categories['projects'].append(word)
      counter -=1
    if (obj[0] and word not in objective):
      categories['objective'].append(word)
      counter -=1
    if (act[0] and word not in activities):
      categories['activities'].append(word)
      counter -=1
    if (inter[0] and word not in interests):
      categories['interests'].append(word)
      counter -=1
  return categories

In [150]:
tokenized_keywords = []
tokenized_categories = []
success_files = []
c = 0
for t in texts:
    #The word_tokenize() function will break our text phrases into #individual words
    tokens = word_tokenize(t[1])
    #we'll create a new list which contains punctuation we wish to clean
    punctuations = ['(',')',';',':','[',']',',']
    #We initialize the stopwords variable which is a list of words like #"The", "I", "and", etc. that don't hold much value as keywords
    stop_words = stopwords.words('english')
    #We create a list comprehension which only returns a list of words #that are NOT IN stop_words and NOT IN punctuations.
    keywords = [word for word in tokens if not word in stop_words and not word in punctuations]
    if keywords != []:
        k = anonymize(keywords)
        if k == []:
            c += 1
        else:
            tokenized_keywords.append((t[0], k))
            success_files.append(t[0])
random.shuffle(tokenized_keywords)
test_data = tokenized_keywords[int((len(tokenized_keywords)/5)*4):]
tokenized_keywords = tokenized_keywords[:int((len(tokenized_keywords)/5)*4)]
print('Size of cleaned dataset:',len(tokenized_keywords))
print(tokenized_keywords[0:2])

Size of cleaned dataset: 239
[('<name of resume file>
', ['education', 'university', 'massachusetts', 'amherst', 'college', 'humanities', 'fine', 'arts', 'fall', '2018', '-', 'present', 'major', 'computational', 'linguistics', 'relevant', 'courses', 'introduction', 'computational', 'linguistics', 'introduction', 'syntax', 'reasoning', 'uncertainty', 'spring', '2019', 'introduction', 'semantics', 'spring', '2019', 'syntax', 'dialects', 'english', 'spring', '2019', 'cornell', 'university', 'college', 'engineering', 'ithaca', 'ny', 'fall', '2015', 'spring', '2017', 'major', 'computer', 'science', 'linguistics', 'relevant', 'courses', 'introduction', 'computing', 'using', 'matlab', 'object-oriented', 'programming', 'data', 'structures', 'discrete', 'structures', 'multivariable', 'calculus', 'differential', 'equations', 'linear', 'algebra', 'introduction', 'linguistics', 'introduction', 'phonetics', 'phonology', 'boston', 'university', 'boston', 'summer', '2017', '6-week', 'historical', 'li

In [151]:
# yx_label contains list of tuples
# add new directory name in directory_names (if add new directory in the future)
# [(label,filename)]
yx_label = []
directory_names = [('AI', 0),('fullstack_SWE', 1), ('Hardware', 2), ('Informatics', 3), ('Other', 4), ('inexperienced_SWE', 5),('Web_Developer', 6)]
for dir_name in directory_names:
  temp = dir_name[0]
  filename_list = [f for f in listdir(temp) if isfile(join(temp, f))]
  for name in filename_list:
    yx_label.append((dir_name[1],name))
print(yx_label[0])

(0, '284_Hoang_Ho.pdf')


In [152]:
def find(file_name, list_of_files):
  for f in list_of_files:
    if (f[1] == file_name):
      return f
  return (4, file_name)

In [175]:
big_list = []
for resume in tokenized_keywords:
    for word in resume[1]:
        big_list.append(word)
big_list = set(big_list)
c = 0
print(len(big_list))
vocab_size = len(big_list)

12633


In [176]:
def assign_ids(big_list_of_words):
    ids = []
    count = 0
    for word in big_list_of_words:
        val = (count, word)
        ids.append(val)
        count += 1
    return ids

In [177]:
word_ids = assign_ids(big_list)


In [178]:
def find_id(word):
    for w in word_ids:
        if (w[1] == word):
            return w[0]
    return -1

In [179]:
# build one-hot vector with appropriate vocab size for padding
def vectorize(resume):
    vector = np.zeros(vocab_size)
    for word in resume:
        index = find_id(word)
        if (index != -1):
            vector[index] = 1
    return vector

In [180]:
# build x data data
labeled_tokenized_keywords = []
for resume in tokenized_keywords:
    category = find(resume[0], yx_label)
    labeled_tokenized_keywords.append((category[0], resume[1]))
corpus = []
for resume in labeled_tokenized_keywords:
    c = resume[1]
    corpus.append(c)
x_train = []
for resume in corpus:
    x_train.append(vectorize(resume))
x_train = np.array(x_train)
    

labeled_tokenized_keywords_test = []
for resume in test_data:
    category = find(resume[0], yx_label)
    labeled_tokenized_keywords_test.append((category[0], resume[1]))
corpus = []
for resume in labeled_tokenized_keywords_test:
    c = resume[1]
    corpus.append(c)
x_test = []
for resume in corpus:
    x_test.append(vectorize(resume))
x_test = np.array(x_test)

In [181]:
# Sanity check
print(x_train[0][0:500])
print(x_test[0][0:500])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

In [182]:
# build y data
y_train = []
for resume in labeled_tokenized_keywords:
    c = resume[0]
    y_train.append(c)
y_train = np.array(y_train)
print(y_train)

y_test = []
for resume in labeled_tokenized_keywords_test:
    c = resume[0]
    y_test.append(c)
y_test = np.array(y_test)
print(y_test)

[0 2 4 4 2 5 5 4 5 4 4 1 3 4 3 0 4 1 4 3 1 2 4 2 1 4 5 2 3 4 4 6 5 3 3 5 5
 6 5 1 5 4 3 4 5 5 2 3 4 1 4 4 5 4 4 3 4 3 5 1 5 1 4 3 4 4 4 4 5 2 4 4 0 3
 0 6 4 4 4 6 4 3 6 1 4 1 5 2 4 6 1 1 4 3 6 0 4 4 1 1 4 0 4 4 3 5 4 1 3 5 4
 0 4 5 2 3 3 4 2 4 4 4 1 4 4 4 3 4 3 4 5 5 2 3 4 5 4 4 4 1 4 5 2 4 1 4 4 4
 0 4 2 4 4 1 6 5 4 4 5 4 3 4 1 4 2 4 1 6 0 2 6 4 4 4 4 1 6 0 6 0 0 5 3 4 4
 4 2 1 4 1 4 4 3 3 1 4 1 3 4 2 1 3 3 5 4 4 1 4 4 4 4 1 5 1 3 0 1 3 3 4 1 1
 4 3 4 2 1 4 4 4 4 4 4 4 5 4 1 4 1]
[4 4 3 4 4 4 4 1 1 4 3 4 1 3 4 1 3 4 3 5 4 0 2 1 5 1 3 4 0 1 0 4 4 4 4 4 1
 4 4 3 4 1 4 1 5 5 0 4 3 1 5 4 5 1 5 4 5 5 0 4]


In [183]:
# Sanity check
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(239, 12633)
(239,)
(60, 12633)
(60,)


In [184]:
# Classifier
gnb = nbc()
classifier = gnb.fit(x_train, y_train)
score = classifier.score(x_test, y_test)
print(score)

0.4166666666666667
