In [4]:
import csv
import datetime
import json
import math
import os
import pickle
import re

from collections import Counter

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

from nltk import word_tokenize
from nltk.util import ngrams

# Raise pandas' `display.max_colwidth` option to 500 for this session.
pd.set_option('display.max_colwidth', 500)

In [5]:
# Root folder for every data file this notebook reads and writes.
# Defaults to the original hard-coded location; set the CAPSTONE_DATA_DIR
# environment variable to run against another folder without editing the cell.
# os.path.join(..., '') guarantees the trailing separator that the
# `directory + '<file name>'` concatenations in later cells rely on.
directory = os.path.join(
    os.environ.get('CAPSTONE_DATA_DIR', '/mnt/disks/mnt_dir/data/'), '')

In [6]:
# Education records, one row per education entry on a resume
resume_edu = pd.read_csv(directory + '02_resumes_education.csv')

# Relevant job titles; the file's single column is renamed to `cleaned_job_title`
relevant_job_titles = pd.read_csv(directory + '03_relevant_job_titles.csv')
relevant_job_titles.columns = ['cleaned_job_title']

# Relevant resume ids (produced by 03_create_ngram_model); single column renamed
relevant_resume_ids = pd.read_csv(directory + '03_relevant_resume_ids.csv')
relevant_resume_ids.columns = ['resume_id']

# Keep only the education rows that belong to a relevant resume
resume_edu = resume_edu[
    resume_edu['resume_id'].isin(relevant_resume_ids['resume_id'])
]

# `count()` skips missing values, so this reports rows that carry a resume id
print("Number of records:", resume_edu['resume_id'].count())
print("Number of unique resume ids:", resume_edu['resume_id'].nunique())

Number of records: 606127
Number of unique resume ids: 351206


In [None]:
# Select the raw education-title column; it is cleaned and tokenized in the following cell
edu_titles = resume_edu['edu_title']

In [None]:
def _load_type_dict(csv_path):
    """Read a manually maintained keyword CSV into a ``{keyword: type}`` dict.

    The file must contain ``keyword`` and ``type`` columns. Blank cells are
    returned by pandas as NaN and are handled by the callers below.
    """
    table = pd.read_csv(csv_path, encoding='latin-1')
    return table[['keyword', 'type']].set_index('keyword')['type'].to_dict()


# Normalise every title: lower-case it and strip everything except letters,
# digits and whitespace. Missing titles (NaN is a float, not a str) become an
# empty string instead of crashing on `.lower()`.
edu_titles = [
    re.sub(r'[^A-Za-z0-9\s]+', '', row.lower()) if isinstance(row, str) else ''
    for row in edu_titles
]
edu_tokens = [word_tokenize(row) for row in edu_titles]

# Treat every token of 4 or fewer characters as a potential degree acronym
acronym_list = list({token for row in edu_tokens for token in row if len(token) < 5})

# Manually created dictionaries mapping single words / phrases to a degree type
degree_type_word_dict = _load_type_dict('functions/configuration_files/degree_type_word_dict.csv')
degree_type_phrase_dict = _load_type_dict('functions/configuration_files/degree_type_phrase_dict.csv')

# Split every tokenized title into a leading "degree" part and a trailing
# "subject" part. Tokens are consumed from the front of the title:
# 1. 'in' is added to the degree part and ends the degree part
# 2. a token found in the acronym list or the word dictionary is added to the
#    degree part and scanning continues with the next token
# 3. any other token ends the degree part; it and everything after it form
#    the subject part
# The lookup set is built once; rebuilding a list for every token made the
# loop cost grow with (number of tokens) x (vocabulary size).
degree_tokens = set(degree_type_word_dict) | set(acronym_list)

degree_name_list = []
subject_name_list = []
for row in edu_tokens:
    split_at = 0  # number of leading tokens that belong to the degree part
    for token in row:
        if token == 'in':
            split_at += 1
            break
        if token not in degree_tokens:
            break
        split_at += 1

    degree_name_list.append(' '.join(row[:split_at]))
    subject_name_list.append(' '.join(row[split_at:]))

# Last-resort categories, matched as substrings of the subject part.
# ('certificate' used to be listed twice; only the later value, 'certificate',
# ever took effect, so the dead 'bootcamp' entry was removed.)
last_dict = {
        'immersive':'bootcamp',
        'bootcamp':'bootcamp',
        'boot camp':'bootcamp',
        'license':'license',
        'licensure':'license',
        'certification':'certificate',
        'certificate':'certificate',
        }

# Word keywords that actually carry a type (blank `type` cells are NaN)
typed_keywords = {
    keyword: kind
    for keyword, kind in degree_type_word_dict.items()
    if str(kind) != 'nan'
}
# Phrase keywords are regular expressions anchored at the start of the degree
# part; compile them once. Non-string keywords (blank CSV cells) are skipped.
phrase_patterns = [
    (re.compile(phrase), kind)
    for phrase, kind in degree_type_phrase_dict.items()
    if isinstance(phrase, str)
]

degree_category_list = []
for degree_name, subject_name in zip(degree_name_list, subject_name_list):
    # First choice: single-word keywords present in the degree part
    name_tokens = set(degree_name.split())
    degree_category = [
        kind for keyword, kind in typed_keywords.items() if keyword in name_tokens
    ]

    # Second choice: phrase patterns matched against the degree part
    if not degree_category:
        degree_category = [
            kind for pattern, kind in phrase_patterns if pattern.match(degree_name)
        ]

    # Last resort: keywords found anywhere in the subject part
    if not degree_category:
        degree_category = [
            kind for keyword, kind in last_dict.items() if keyword in subject_name
        ]

    # De-duplicate, drop blank/NaN types, and sort so the result does not
    # depend on set iteration order (which varies between interpreter runs)
    degree_category_list.append(sorted({
        kind.strip()
        for kind in degree_category
        if str(kind) != 'nan' and str(kind) != ' '
    }))

In [None]:
# Degree categories in priority order: when a title maps to several candidate
# categories, the first entry of this list that is among them is chosen.
dict_test = ['minor',
'all but dissertation',
'juris doctor',
'doctorate',
'associates',
'some education',
'masters',
'bachelors',
'license',
'hs diploma',
'vocational',
'certificate']

# Reduce each row's candidate categories to exactly one final category.
# Exactly one value is appended per row so the result stays aligned with
# `degree_category_list`. Previously a row with several candidates, none of
# them in `dict_test`, appended nothing and shifted every later row.
final_degree_category_list = []
for row in degree_category_list:
    if len(row) > 1:
        # Highest-priority candidate, or 'unknown' if none of them is ranked
        chosen_category = next(
            (category for category in dict_test if category in row), 'unknown')
    elif len(row) == 1:
        chosen_category = row[0]
    else:
        chosen_category = 'unknown'
    final_degree_category_list.append(chosen_category)

In [None]:
# Attach the parsed fields to the education records. `assign` returns a new
# frame instead of writing into the filtered slice, which avoids pandas'
# SettingWithCopyWarning. The candidate categories are stored as their string
# representation.
resume_edu = resume_edu.assign(
    degree_name=degree_name_list,
    subject_name=subject_name_list,
    final_degree_category=final_degree_category_list,
    possible_degree_category=[str(categories) for categories in degree_category_list],
)

# One row per distinct (resume, degree category, subject, graduation year).
# Duplicates are dropped while `resume_id` is still a column:
# `drop_duplicates()` ignores the index, so de-duplicating after `set_index`
# collapsed identical education rows belonging to different resumes.
current_edu = (
    resume_edu[['resume_id', 'final_degree_category', 'subject_name', 'to_year']]
    .rename(columns={'to_year': 'edu_grad_year'})
    .drop_duplicates()
    .set_index('resume_id')
)

In [None]:
# Work history of the relevant resumes, ordered by start year.
# Duplicates are dropped while `resume_id` is still a column:
# `drop_duplicates()` ignores the index, so de-duplicating after `set_index`
# kept a given (job title, start year) pair for one resume only.
current_job_titles = pd.read_csv(directory+'02_resumes_work.csv')
current_job_titles = (
    current_job_titles
    .loc[current_job_titles['resume_id'].isin(relevant_resume_ids['resume_id']),
         ['resume_id', 'cleaned_job_title', 'from_year']]
    .sort_values(by='from_year')
    .rename(columns={'from_year': 'work_start_year'})
    .drop_duplicates()
    .set_index('resume_id')
)

In [11]:
# Pair every job on a resume with every education entry on the same resume
# (both frames are indexed by resume id); jobs without education are kept.
combined_data = pd.merge(current_job_titles, current_edu, how='left',
                         left_index=True, right_index=True, sort=True)

# Missing years become 0 so the comparison below works on plain integers
combined_data['work_start_year'] = combined_data['work_start_year'].fillna(0).astype(int)
combined_data['edu_grad_year'] = combined_data['edu_grad_year'].fillna(0).astype(int)

# Keep only pairs where the job started in or after the graduation year
combined_data = combined_data[
    combined_data['work_start_year'] >= combined_data['edu_grad_year']]

# Drop exact duplicates *per resume*. `drop_duplicates()` ignores the index,
# so the resume id is brought in as a column before looking for duplicates;
# otherwise identical rows from different resumes would be merged into one.
combined_data = combined_data[~combined_data.reset_index().duplicated().to_numpy()]

# Resumes with no surviving row get one placeholder row per job so they are
# still represented. `assign` builds a new frame rather than writing into a
# slice of `current_job_titles`.
filler_df = current_job_titles[
    ~current_job_titles.index.isin(combined_data.index)
].assign(
    final_degree_category='not listed',
    subject_name='not listed',
    edu_grad_year=1900,
)

combined_data = pd.concat([combined_data, filler_df])

NameError: name 'current_job_titles' is not defined

In [11]:
# Dump job title / degree category / subject as table-oriented JSON.
# NOTE(review): a later cell opens this same path in 'w' mode and writes a
# different payload, so this file does not survive a full run - confirm
# whether this export is still needed.
combined_data[['cleaned_job_title', 'final_degree_category', 'subject_name']].to_json(
    path_or_buf=directory + '04_edu_data_bar_chart.json',
    orient='table',
)

In [12]:
# Keep the three descriptive columns and move the index back into a regular column
combined_data = (
    combined_data[['cleaned_job_title', 'final_degree_category', 'subject_name']]
    .reset_index()
)

In [13]:
# Restrict to rows whose job title is in the relevant-title list
combined_data = combined_data[
    combined_data['cleaned_job_title'].isin(relevant_job_titles['cleaned_job_title'])
]

In [18]:
# Number of rows (non-null resume ids) per job title / degree category / subject
temp = (
    combined_data
    .groupby(['cleaned_job_title', 'final_degree_category', 'subject_name'])['resume_id']
    .count()
    .reset_index()
)

# Rank subjects by that count, largest first, inside each job title / degree
# category. Ties share pandas' default (average) rank, truncated to an integer.
temp['ranking'] = (
    temp
    .groupby(['cleaned_job_title', 'final_degree_category'])['resume_id']
    .rank(ascending=False)
    .astype(int)
)

# Subjects ranked beyond 500 are lumped together under 'other'
temp.loc[temp['ranking'] > 500, 'subject_name'] = 'other'

# Keep a long-format copy before `temp` is pivoted
temp2 = temp.copy()

# Re-total the counts (merging the 'other' rows) and spread subjects into columns
temp = (
    temp
    .groupby(['cleaned_job_title', 'final_degree_category', 'subject_name'])['resume_id']
    .sum()
    .unstack('subject_name')
    .reset_index()
)

In [19]:
# Write the distinct job titles, sorted alphabetically, to CSV
# (row labels are converted to strings and written as the first column)
(
    pd.DataFrame(combined_data['cleaned_job_title'].unique())
    .rename(index=str, columns={0: 'cleaned_job_title'})
    .sort_values(by='cleaned_job_title')
    .to_csv(directory + '04_unique_jobs.csv')
)

In [20]:
# Write one JSON object per row of `temp`; cells that are NaN for a row
# are omitted from that row's object.
with open(directory + '04_edu_data_bar_chart.json', 'w') as outfile:
    bar_chart_rows = [
        record.dropna().to_dict() for _label, record in temp.iterrows()
    ]
    json.dump(bar_chart_rows, outfile)

In [21]:
# Per job title and subject: total and largest single count across degree
# categories, sorted descending by title, total, then largest count
(
    temp2
    .groupby(['cleaned_job_title', 'subject_name'])['resume_id']
    .agg(['sum', 'max'])
    .reset_index()
    .sort_values(by=['cleaned_job_title', 'sum', 'max'], ascending=False)
    .to_csv(directory + '04_ranked_subjects.csv', index=False)
)

# END