In [9]:
import pandas as pd
import matplotlib.pyplot as plt
import re

In [2]:
data = pd.read_csv('processed_resumes_work_ADDED_JOB_TITLES.csv')

In [6]:
# Remove jobs without title
data = data[~data.role.isnull()]

In [10]:
# Manual transformations of similar jobs
manual_update_data = pd.read_csv('manual_job_title_transformations.csv', encoding='latin-1')
manual_update_job_dict = manual_update_data[~manual_update_data.manual_change.isnull()]\
    [['cleaned_job_title','manual_change']]\
    .set_index('cleaned_job_title')['manual_change']\
    .to_dict()
    
    
# Words that will be manually changed
manual_change_dict = {'developers':'developer',
                 'develppers':'developer',
                 'coordinators':'coordinator',
                 'application':'applications',
                 'analysts':'analyst',
                 'analyast':'analyst',
                 'ananlyst':'analyst',
                 'analysr':'analyst',
                 'information technology':'it',
                 'managers':'manager',
                 'srprogrammer':'sr programmer',
                 'srcomputer':'sr computer',
                 'srjava':'sr java',
                 'srsoftware':'sr software',
                 'srqa':'sr qa',
                 'srapplication':'sr application',
                 'srsystem':'sr systems',
                 'salesforcecom':'salesforce',
                 'admin':'administrator',
                 'developer':'engineer',
                 'solution':'solutions',
                 'net':'.net',
                 'dotnet':'.net',
                 'fullstack':'full stack',
                 'db':'database',
                 'system':'systems',
                 'principle':'principal',
                 'ui':'ux',
                 'user experience':'ux',
                 'ui/ux':'ux',
                 'ux/ui':'ux',
                 'wi-fi':'wifi',
                 'wi fi':'wifi',
                 'user interface':'ux',
                 'dba':'database administrator',
                 'enigneer':'engineer',
                 'enginerr':'engineer',
                 'engingeer':'engineer',
                 'engneer':'engineer',
                 'engr':'engineer',
                 'hr':'human resources',
                 'manger':'manager',
                 'qa':'quality assurance',
                 'frontend':'front end',
                 'backend':'back end',
                 'bi':'business intelligence',
                 'vp':'vice president',
                 'sr':'senior',
                 'jr':'junior',
                 'i':'1',
                 'ii':'2',
                 'iii':'3',
                 'iv':'4',
                 'v':'5',
                 'vi':'6',
                 'vii':'7',
                 'l1':'1',
                 'l2':'2',
                 'l3':'3',
                 'l4':'4',
                 'l5':'5',
                 'l6':'6',
                 'l7':'7',
                 'one':'1',
                 'two':'2',
                 'three':'3',
                 'four':'4',
                 'five':'5',
                 'six':'6',
                 'seven':'7',
                 'a':'1',
                 'b':'2',
                 'c':'3',
                 ' d':'4',
                 '-d':'4',
                 'e':'5',
                 'f':'6',
                 'g':'7',
                }

# Words that will be removed from job title
trash_words_list = ['jc', 'mts', 'level']

# Job prefix qualifiers that will be moved to experience list
pre_qualifiers = ['lead']

# Job postscript qualifiers that will be moved to experience list
post_qualifiers = []

# Job qualifiers that will be moved to experience list
any_location_qualifiers = [
                    'vice president',
                    'president',
                    'principal', 
                    'senior', 
                    'junior',
                    'entry', 
                    'mid', 
                    'intern',
                    '1', '2', '3', '4', '5', '6', '7',
                ]



# This function moves qualifiers from job title list to experience list
def parse_experience(list_of_jobs):
    clean_job_list = []
    experience_list = []
    for job in list_of_jobs:
        single_job_experience = []
        for word in pre_qualifiers:
            regex = re.compile('^'+word+'\\b', re.I)
            if re.search(regex, job):
                single_job_experience.append(word)
                job = re.sub(regex, '', job)
#         for word in post_qualifiers:
#             regex = re.compile('\s*\\b'+word+'(\W*?)$', re.I)
#             result = re.search(regex, job)
#             if re.search(regex, job):
#                 single_job_experience.append(word)
#                 job = re.sub(regex, '', job)
        for word in any_location_qualifiers:
            regex = re.compile('\\b'+word+'\\b', re.I)
            result = re.search(regex, job)
            if re.search(regex, job):
                single_job_experience.append(word)
                job = re.sub(regex, '', job)
        clean_job_list.append(job)
        experience_list.append(single_job_experience)
    return clean_job_list, experience_list

def trash_words(list_of_jobs):
    cleaned_list_of_jobs = []
    for job in list_of_jobs:
        for word in trash_words_list:
            regex = re.compile('\\b'+word+'[0-9]*(\s|\\b)', re.I)
            job = re.sub(regex, '', job)
        cleaned_list_of_jobs.append(job)
    return cleaned_list_of_jobs
        
def remove_special_characters(char_list, list_of_jobs):
    for char in char_list:
        list_of_jobs = [job.replace(char,'') for job in list_of_jobs]
    return list_of_jobs

def manual_update_words(list_of_jobs):
    cleaned_list_of_jobs = []
    for job in list_of_jobs:
        for key in manual_change_dict:
            regex = re.compile("\\b"+key+"\\b")
            job = re.sub(regex, manual_change_dict[key], job)
        cleaned_list_of_jobs.append(job)
    return cleaned_list_of_jobs

def manual_update_job_titles(list_of_jobs):
    cleaned_list_of_jobs = []
    for job in list_of_jobs:
        if job in manual_update_job_dict:
            job = manual_update_job_dict[job]
        cleaned_list_of_jobs.append(job)
    return cleaned_list_of_jobs

def remove_words_in_parenthesis(list_of_jobs):
    cleaned_list_of_jobs = []
    for job in list_of_jobs:
        job = re.sub(re.compile("\((.*?)\)"), '', job)
        job = re.sub(re.compile("\[(.*?)\]"), '', job)
        cleaned_list_of_jobs.append(job)
    return cleaned_list_of_jobs

def remove_words_after_special_char(char_list, list_of_jobs):
    for char in char_list:
        list_of_jobs = [job.split(char, -1)[0] for job in list_of_jobs]
    return list_of_jobs

def clean_job(list_of_jobs):
    
    # Lowercase and strip whitespaces
    clean_job_list = [job.strip().lower() for job in list_of_jobs]

    # Remove special characters . and : and ;
    clean_job_list = remove_special_characters( ['.',':',';','#'] , clean_job_list)
    
    # Remove specific words
    clean_job_list = trash_words(clean_job_list)

    # Manually update words using list
    clean_job_list = manual_update_words(clean_job_list)

    # First round pull out qualifiers
    clean_job_list, experience_list1 = parse_experience(clean_job_list)
    
    # Delete all content between parenthesis (...) or [...]
    clean_job_list = remove_words_in_parenthesis(clean_job_list)

    # Remove all text after - and / and (
    clean_job_list = remove_words_after_special_char([' -', '- ', ' - ','/','(',')'], clean_job_list)

    # Replace - with ' '
    clean_job_list = [job.replace('-',' ') for job in clean_job_list]

    # For strings with comma, reverse the order and remove comma
    clean_job_list = [job.split(',', 1)[1].strip() + ' ' + job.split(',', 1)[0].strip() 
                      if len(job.split(',', 1))>1 else job
                      for job in clean_job_list]
    
    # If there is more than 1 comma, remove the text for the 2nd
    clean_job_list = remove_words_after_special_char([','], clean_job_list)
    
    # Manually update job titles using list
    clean_job_list = manual_update_job_titles(clean_job_list)

    # Second round pull out qualifiers
    clean_job_list, experience_list2 = parse_experience(clean_job_list)
    
    # Clean up extra whitespaces
    clean_job_list = [job.replace('  ',' ').strip() for job in clean_job_list]
    
    # Remove any numbers
    clean_job_list = [x for x in clean_job_list if not isinstance(x, int)]

    # Merge the 2 rounds of qualifier grabbing
    experience_list = list(map(list.__add__, experience_list1, experience_list2))        

    return list_of_jobs, clean_job_list, experience_list

  interactivity=interactivity, compiler=compiler, result=result)


In [31]:
list_of_jobs, clean_job_list, experience_list = clean_job(data.role)
clean_job_list = manual_update_job_titles(clean_job_list)

In [17]:
sorted_experience_list = []
for item in experience_list:
    sorted_experience_list.append(sorted(item))

In [32]:
data['converted_job_title_new'] = clean_job_list
data['converted_experience_level_new'] = sorted_experience_list
data['converted_experience_level_new'] = data['converted_experience_level_new'].astype(str)
data['converted_experience_level_new'] = data.converted_experience_level_new.apply(lambda x: x.replace('[','').replace(']','').replace("'",''))

In [33]:
data.head()

Unnamed: 0.1,Unnamed: 0,city,resume_id,container,role,company,location,dates,descript,converted_job_title,converted_experience_level,converted_job_title_new,converted_experience_level_new
0,0,atlanta,0004d469fc497102,work-experience-items,senior informix database administrator,Breckinridge Insurance,"Kennesaw, GA",July 2017 to Present,.Informix DBA for Breckinridge Insurance appli...,administrator database,['senior'],informix database administrator,senior
1,1,atlanta,0004d469fc497102,work-experience-items,senior informix dba database administrator,INTERCALL Inc,,January 2007 to June 2017,.Informix Database Administor for InterCall's ...,administrator database dba,['senior'],informix database administrator database admin...,senior
2,2,atlanta,0004d469fc497102,work-experience-items,oracle informix dba database administrator,Accenture/Bellsouth Telecommuncations Inc,,March 2004 to December 2007,.Oracle Database for OPEDS production support....,administrator database oracle,[],oracle informix database administrator databas...,
3,3,atlanta,0004d469fc497102,work-experience-items,peoplesoft hrms oracle dba database administrator,ACENTRON/Michelin Inc,,March 2003 to February 2004,.Responsible for Migrating objects and Project...,administrator database oracle,[],peoplesoft hrms oracle database administrator ...,
4,4,atlanta,0004d469fc497102,work-experience-items,informix oracle database administrator,BellSouth Telecommunications INC,,November 1998 to January 2003,.Worked on various projects for BellSouth. Wor...,administrator database oracle,[],informix oracle database administrator,


In [141]:
data.groupby(['converted_job_title_new']).descript.count().to_csv('resume_titles_count.csv')

In [160]:
relevant_jobs = pd.read_csv('data_pivot.csv')
relevant_jobs['total'] = relevant_jobs['2010']+\
                         relevant_jobs['2011']+\
                         relevant_jobs['2012']+\
                         relevant_jobs['2013']+\
                         relevant_jobs['2014']+\
                         relevant_jobs['2015']+\
                         relevant_jobs['2016']+\
                         relevant_jobs['2017']+\
                         relevant_jobs['2018']
relevant_jobs = relevant_jobs.sort_values(by='total',ascending=False).head(50).cleaned_job_title

In [85]:
filtered_data = data[data.converted_job_title_new.isin(relevant_jobs)]
filtered_data = filtered_data[~filtered_data.descript.isnull()]

In [67]:
# filtered_data.groupby('converted_job_title_new').resume_id.count().sort_values()

In [86]:
labels = filtered_data.converted_job_title_new
text = filtered_data.descript

In [104]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [106]:
def replace_contractions(text):
    """Replace contractions in string of text"""
    return contractions.fix(text)

In [107]:
tokens = []
for x in text:
#     print(x)
#     print()
    x = replace_contractions(x)
    x = nltk.word_tokenize(x)
    tokens.append(x)

In [108]:
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

In [109]:
def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words

In [110]:
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words

In [111]:
def replace_numbers(words):
    """Replace all interger occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words

In [112]:
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

In [113]:
def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

In [114]:
def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas

In [119]:
def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
#     words = replace_numbers(words)
    words = remove_stopwords(words)
    words = lemmatize_verbs(words)
    return words

In [120]:
norm_tokens = []
for x in tokens:
#     print(x)
#     print()
    norm_tokens.append(normalize(x))

In [124]:
# norm_tokens[50]

In [131]:
norm_text = []
for i in norm_tokens:
    norm_text.append(' '.join(i))

In [132]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(norm_text, 
                                                    labels,
                                                    test_size=0.10,
                                                    random_state=0)

In [133]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)
print('Vocabulary len:', len(vect.get_feature_names()))
print('Longest word:', max(vect.vocabulary_, key=len))

X_train_vectorized = vect.transform(X_train)

Vocabulary len: 202176
Longest word: ___________________________________________________________________________________________________________


In [134]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [135]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

# Find the 10 smallest and 10 largest coefficients
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['jqgrids' 'ough' 'ought' 'oui' 'ouip' 'ouk' 'oum' 'ound' 'ounti' 'ounting']

Largest Coefs: 
['use' 'web' 'data' 'develop' 'application' 'server' 'service' 'design'
 'sql' 'aspnet']


In [136]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

Accuracy: 45.91%


In [150]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=5, ngram_range=(1, 3)).fit(X_train)
print('Vocabulary len:', len(vect.get_feature_names()))
print('Longest word:', max(vect.vocabulary_, key=len))

X_train_vectorized = vect.transform(X_train)

Vocabulary len: 662203
Longest word: modalpopupextender maskededitextender maskededitvalidator


In [151]:
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [152]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['loan history' 'plan develop global' 'plan develop implement'
 'plan develop launch' 'plan develop maintain' 'plan develop manage'
 'plan develop new' 'plan develop execute' 'plan develop organize'
 'plan develop project']

Largest Coefs: 
['use' 'aspnet' 'web' 'net' 'server' 'sql' 'application' 'sql server'
 'data' 'develop']


In [153]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

Accuracy: 46.09%


In [156]:
list_tokens = pd.DataFrame(list(zip(norm_tokens, labels)))
list_tokens.columns = ['tokens','job']

In [145]:
# filtered_data.groupby('converted_job_title_new').resume_id.count()

In [161]:
list_tokens = list_tokens[list_tokens.job.isin(relevant_jobs)]


In [165]:
# list_tokens.groupby('job').count()

In [167]:
labels = list_tokens.job
text = list_tokens.tokens

In [171]:
norm_text = []
for i in text:
    norm_text.append(' '.join(i))

In [172]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(norm_text, 
                                                    labels,
                                                    test_size=0.10,
                                                    random_state=0)

In [173]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=5, ngram_range=(1, 3)).fit(X_train)
print('Vocabulary len:', len(vect.get_feature_names()))
print('Longest word:', max(vect.vocabulary_, key=len))

X_train_vectorized = vect.transform(X_train)

Vocabulary len: 574461
Longest word: modalpopupextender maskededitextender maskededitvalidator


In [174]:
model = MultinomialNB(alpha=0.1)
model.fit(X_train_vectorized, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [175]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['load extract develop' 'personnel well' 'personnel view'
 'personnel various' 'personnel utilize' 'personnel update'
 'personnel understand' 'personnel troubleshoot problem'
 'personnel troubleshoot' 'personnel train new']

Largest Coefs: 
['use' 'aspnet' 'web' 'net' 'server' 'sql' 'application' 'data'
 'sql server' 'develop']


In [176]:
from sklearn.metrics import accuracy_score

y_pred = model.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

Accuracy: 54.09%


In [184]:
predictions = pd.DataFrame(list(zip(y_test, y_pred)))
predictions.columns=['actual','prediction']
predictions['count']=1

In [187]:
predictions.groupby(['actual','prediction']).count().reset_index()

Unnamed: 0,actual,prediction,count
0,.net software engineer,.net software engineer,159
1,.net software engineer,quality assurance engineer,1
2,.net software engineer,software engineer,88
3,.net software engineer,systems administrator,1
4,.net software engineer,ux engineer,1
5,accountant,accountant,223
6,accountant,business analyst,3
7,accountant,consultant,1
8,accountant,data analyst,1
9,accountant,financial analyst,29
