In [33]:
import json
import os

import pandas as pd

# Custom function in functions folder
from functions.process_edu_titles import *

In [2]:
# Root folder that holds every CSV/JSON file read or written by this notebook.
# Set the CAPSTONE_DATA_DIR environment variable to point the notebook at a
# different location (for example '/mnt/disks/mnt_dir/data/') instead of
# editing this cell; when the variable is unset the original local path is used.
directory = os.environ.get('CAPSTONE_DATA_DIR',
                           '/Users/kwheatley/Desktop/Capstone/gcloud_data/')

# File names are appended to `directory` by plain string concatenation, so
# make sure the folder always ends with a path separator.
directory = os.path.join(directory, '')

# Load Data

In [3]:
# Load education for resumes
resume_edu = pd.read_csv(directory+'02_resumes_education.csv')

# Load the list of relevant job titles
# (the file's single column is labelled `cleaned_job_title`; this frame is not
# used by the filter below and is loaded again in the work-history cell)
relevant_job_titles = pd.read_csv(directory+'03_relevant_job_titles.csv')
relevant_job_titles.columns = ['cleaned_job_title']

# Remove all education not in resume id list
# NOTE(review): `relevant_resume_ids` is not defined in any cell visible in
# this notebook, so on a fresh kernel this statement raises NameError.
# Presumably it is a DataFrame with a `resume_id` column produced by an
# earlier step -- confirm and load it explicitly before this line.
resume_edu = resume_edu[resume_edu.resume_id\
                       .isin(relevant_resume_ids.resume_id)]

# Report how many education rows remain and how many distinct resumes they
# belong to (`count()` skips null resume ids).
print("Number of records:", resume_edu.resume_id.count())
print("Number of unique resume ids:", resume_edu.resume_id.nunique())

Number of records: 259134
Number of unique resume ids: 153067


In [60]:
# Work-history records for every resume.
current_job_titles = pd.read_csv(directory+'02_resumes_work.csv')

# Job titles used in `03_process_salary_and_create_ngram_model`; the file's
# single column is labelled so it can be matched against the work history.
relevant_job_titles = pd.read_csv(directory+'03_relevant_job_titles.csv')
relevant_job_titles.columns = ['cleaned_job_title']

# A work record is kept only when all three conditions hold:
#   * it has a cleaned job title,
#   * that title is one of the relevant job titles,
#   * its resume is one of the relevant resumes
#     (`relevant_resume_ids` must already exist in the kernel).
keep_row = (
    current_job_titles.cleaned_job_title.notnull()
    & current_job_titles.cleaned_job_title.isin(relevant_job_titles.cleaned_job_title)
    & current_job_titles.resume_id.isin(relevant_resume_ids.resume_id)
)

# Narrow to the three columns of interest, drop repeated rows, index the
# result by resume id and give the start-year column its final name.
current_job_titles = (
    current_job_titles.loc[keep_row, ['resume_id', 'cleaned_job_title', 'from_year']]
    .drop_duplicates()
    .set_index('resume_id')
    .rename(columns={'from_year': 'work_start_year'})
)

# Process Education Titles

In [5]:
# Run the project helper `process_edu_titles` on the raw education titles.
# Its result is unpacked into three values, in this order: subject names,
# degree names, degree categories.
# NOTE(review): the helper is defined outside this notebook; presumably each
# result holds one entry per row of `resume_edu`, in the same row order --
# confirm against functions/process_edu_titles.
subject_name_list, degree_name_list, degree_category_list = process_edu_titles(resume_edu.edu_title)

In [7]:
# Add the processed data to the original dataframe.
# `assign` builds a new frame instead of writing columns into `resume_edu`,
# which is itself the result of a boolean filter; setting columns on such a
# frame in place is what triggers pandas' SettingWithCopyWarning.
resume_edu = resume_edu.assign(
    degree_name=degree_name_list,
    subject_name=subject_name_list,
    degree_category=degree_category_list,
)

# Create a new dataframe with processed data: one row per distinct
# (resume, degree category, subject, graduation year) combination.
#
# Duplicates are removed BEFORE `resume_id` becomes the index, because
# `DataFrame.drop_duplicates` ignores the index. De-duplicating afterwards
# would treat identical education records belonging to different resumes as
# duplicates and keep only the first resume's record.
current_edu = (
    resume_edu[['resume_id', 'degree_category', 'subject_name', 'to_year']]
    .drop_duplicates()
    .set_index('resume_id')
    .rename(columns={'degree_category': 'final_degree_category',
                     'to_year': 'edu_grad_year'})
)

# Merge Education and Work History

In [117]:
# Merge the list of processed education titles with the list of resumes.
# Both frames are indexed by resume id, so every job on a resume is paired
# with every education record on the same resume (left join: jobs without any
# education record are kept with missing education columns).
combined_data = pd.merge(current_job_titles, current_edu, how='left',
                         left_index=True, right_index=True, sort=True)

# Missing years become 0 so both columns can be compared as integers.
# Consequences of the 0 placeholder: an education record without a graduation
# year is always kept, and a job without a start year only keeps education
# records that also lack a graduation year.
combined_data['work_start_year'] = combined_data.work_start_year.fillna(0).astype(int)
combined_data['edu_grad_year'] = combined_data.edu_grad_year.fillna(0).astype(int)

# If education end date is after the work start date, the education record
# is removed from the list. This ensures that education is applied to the
# correct timeframes.
combined_data = combined_data[combined_data.work_start_year >= combined_data.edu_grad_year]

# Disabled: back-fill for jobs that lost every education record in the filter
# above. It relies on `combined_data` still being indexed by resume id, so it
# must stay before the index is reset below.
# filler_df = current_job_titles[~current_job_titles.index.isin(combined_data.index)]
# filler_df['final_degree_category'] = 'not listed'
# filler_df['subject_name'] = 'not listed'
# filler_df['edu_grad_year'] = 1900

# combined_data = pd.concat([combined_data,filler_df])

# Turn the resume id back into a column BEFORE removing duplicates.
# `DataFrame.drop_duplicates` ignores the index, so de-duplicating while the
# resume id is still the index would collapse identical job/education rows
# from different resumes into one and undercount resumes downstream.
combined_data = combined_data.reset_index().drop_duplicates().reset_index(drop=True)

# Create Files

In [118]:
# Keep only the rows whose cleaned job title is exactly 'accountant'; every
# file written by the following cell is therefore limited to that one title.
# NOTE(review): this looks like a leftover debugging filter -- confirm whether
# the output files are meant to cover all relevant job titles.
combined_data = combined_data[combined_data.cleaned_job_title == 'accountant']

In [119]:
# Maximum number of subjects kept per (job title, degree category) group;
# every lower-ranked subject in the group is folded into a single `other` bucket.
MAX_SUBJECTS_PER_GROUP = 1000

# Count the rows for each (job title, degree category, subject) combination.
# Rows whose education columns are missing are left out by `groupby`.
final_combined_data = (
    combined_data
    .groupby(['cleaned_job_title', 'final_degree_category', 'subject_name'])
    .resume_id.count()
    .reset_index()
)

# Rank the subjects inside each (job title, degree category) group, most
# frequent first. `method='first'` gives distinct integer ranks 1..n, so
# exactly the top MAX_SUBJECTS_PER_GROUP subjects survive. The default
# `method='average'` gives tied subjects a shared fractional rank which, once
# truncated to int, lets a block of ties straddling the cut-off be kept or
# masked as a whole.
final_combined_data['ranking'] = (
    final_combined_data
    .groupby(['cleaned_job_title', 'final_degree_category'])
    .resume_id.rank(method='first', ascending=False)
    .astype(int)
)
final_combined_data.loc[
    final_combined_data.ranking > MAX_SUBJECTS_PER_GROUP, 'subject_name'
] = 'other'

# Save list of subject rankings. This is used for sorting for each job title.
# `sum` is the subject's total count across degree categories, `max` its
# largest count within a single degree category.
(
    final_combined_data
    .groupby(['cleaned_job_title', 'subject_name'])
    .resume_id.agg(['sum', 'max'])
    .reset_index()
    .sort_values(by=['cleaned_job_title', 'sum', 'max'], ascending=False)
    .to_csv(directory+'04_ranked_subjects.csv', index=False)
)

# Re-aggregate (this merges all the rows relabelled `other`) and pivot so each
# (job title, degree category) row has one column per subject.
final_combined_data = (
    final_combined_data
    .groupby(['cleaned_job_title', 'final_degree_category', 'subject_name'])
    .resume_id.sum()
    .unstack('subject_name')
    .reset_index()
)

# Save data records: one JSON object per row, omitting the subjects that do
# not occur for that row (NaN after the pivot).
with open(directory+'04_edu_data_bar_chart.json', 'w') as outfile:
    json.dump([row.dropna().to_dict() for index, row in final_combined_data.iterrows()], outfile)

# Save distinct list of jobs, sorted alphabetically. The (string) index is
# written to the file as well, as in the original output format.
unique_jobs = (
    pd.DataFrame(final_combined_data.cleaned_job_title.unique())
    .rename(index=str, columns={0: 'cleaned_job_title'})
    .sort_values(by='cleaned_job_title')
)
unique_jobs.to_csv(directory+'04_unique_jobs.csv')

# END