In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import json

In [39]:
# Base folder for every file this notebook reads and writes. Paths are built
# below by plain string concatenation (directory + filename), so the trailing
# slash is required.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a local copy of the data.
directory = '/Users/kwheatley/Desktop/Capstone/gcloud_data/'

In [40]:
# Input files, all located under `directory`.
resume_ids_csv = directory+'03_relevant_resume_ids.csv'
job_titles_csv = directory+'03_relevant_job_titles.csv'
resumes_work_csv = directory+'02_resumes_work.csv'

# Relevant resume ids produced by 03_create_ngram_model; the file's single
# column is given a fixed name regardless of its header.
relevant_resume_ids = pd.read_csv(resume_ids_csv)
relevant_resume_ids.columns = ['resume_id']

# Relevant job titles produced by 03_create_ngram_model, renamed the same way.
relevant_job_titles = pd.read_csv(job_titles_csv)
relevant_job_titles.columns = ['cleaned_job_title']

# Work-history rows (jobs); filtered and reshaped in the following cell.
current_job_titles = pd.read_csv(resumes_work_csv)

In [41]:
# Turn the work-history rows into (current title -> next title) transitions:
# one row per pair of consecutive, different, relevant jobs on the same resume.

# Keep only rows that have a job title and whose title is on the relevant list.
has_title = current_job_titles.cleaned_job_title.notnull()
is_relevant = current_job_titles.cleaned_job_title\
    .isin(relevant_job_titles.cleaned_job_title)
current_job_titles = current_job_titles[has_title & is_relevant]

# Order each resume's jobs chronologically and mark the columns as "current".
current_job_titles = current_job_titles\
    .sort_values(by=['resume_id','from_year','to_year'])\
    .rename(columns={'cleaned_job_title': 'curr_cleaned_job_title',
                     'from_year': 'curr_from_year'})

# Shift within each resume so the current and the next job share a row.
# Missing values are left as NaN rather than replaced by the string "None":
# a string inside the year column makes later numeric comparisons such as
# `next_from_year > 2013` raise TypeError, whereas NaN simply compares False.
jobs_by_resume = current_job_titles.groupby('resume_id')
next_title = jobs_by_resume['curr_cleaned_job_title'].shift(-1)
next_from_year = jobs_by_resume['curr_from_year'].shift(-1)
current_job_titles = current_job_titles.assign(
    next_cleaned_job_title=next_title,
    next_from_year=next_from_year)

# Drop rows with no next job on the resume, and rows where the next job title
# is the same as the current one. The null check is needed explicitly because
# `title != NaN` evaluates to True.
has_next_job = current_job_titles.next_cleaned_job_title.notnull()
title_changed = current_job_titles.curr_cleaned_job_title \
    != current_job_titles.next_cleaned_job_title
current_job_titles = current_job_titles[has_next_job & title_changed]

# Keep only the transition columns and remove exact duplicate rows.
# (Non-inplace drop_duplicates avoids mutating a slice of the previous frame.)
current_job_titles = current_job_titles\
    [['resume_id','curr_cleaned_job_title','next_cleaned_job_title','curr_from_year','next_from_year']]\
    .drop_duplicates()

In [42]:
# Checkpoint of the transition table, taken before the year filter below
# narrows `current_job_titles`.
# On a fresh kernel the snapshot does not exist yet, so create it first;
# previously this cell only worked if a now-commented-out line had been run
# earlier in the session (NameError on Restart & Run All).
# Re-running this cell later restores `current_job_titles` from the snapshot.
if 'current_job_titles_copy' not in globals():
    current_job_titles_copy = current_job_titles.copy()
current_job_titles = current_job_titles_copy.copy()

In [50]:
# Keep only recent transitions: the current job must have started after 2008
# and the next job after 2013 (the original notes describe these cut-offs as
# 10 and 5 years back respectively).
current_started_recently = current_job_titles['curr_from_year'] > 2008
next_started_recently = current_job_titles['next_from_year'] > 2013
current_job_titles = current_job_titles[current_started_recently & next_started_recently]

In [51]:
parameters = {
    "max_number_records": 2500,  # Max number of rows kept for each current job title
}

# Cap the number of rows per current job title. This is not a random sample:
# for each title the rows with the most recent `curr_from_year` are kept.
# The per-title pieces are collected in a list and concatenated once, instead
# of growing a DataFrame with pd.concat inside the loop (quadratic copying).
capped_groups = [
    group.sort_values(by='curr_from_year', ascending=False)
         .head(parameters['max_number_records'])
    for _, group in current_job_titles.groupby('curr_cleaned_job_title')
]
# pd.concat raises on an empty list, so fall back to an empty frame.
new_job_descriptions = pd.concat(capped_groups) if capped_groups else pd.DataFrame()

# NOTE(review): the later cells visible in this notebook aggregate
# `current_job_titles`, not `new_job_descriptions` - confirm that the capped
# frame is meant to be unused there.
new_job_descriptions.count()

resume_id                 58322
curr_cleaned_job_title    58322
next_cleaned_job_title    58322
curr_from_year            58322
next_from_year            58322
dtype: int64

In [52]:
transition_keys = ['curr_cleaned_job_title', 'next_cleaned_job_title']

# Count how many rows go from each current title to each next title.
grouped_jobs = (
    current_job_titles
    .groupby(transition_keys)['resume_id']
    .count()
    .reset_index(name='value')
)

# Rank the next titles within each current title by count (highest first,
# ties broken by row order) and keep the 15 most common per current title.
rank_within_title = (
    grouped_jobs
    .groupby(['curr_cleaned_job_title'])['value']
    .rank(ascending=False, method='first')
    .astype(int)
)
grouped_jobs = grouped_jobs[rank_within_title <= 15]

# Re-index the surviving counts by (current, next) title, sorted by key.
top_counts = grouped_jobs.groupby(transition_keys)['value'].sum()

# Normalise so the kept transitions of each current title sum to 1.
title_totals = top_counts.groupby(level=0).transform('sum')
grouped_jobs = (top_counts / title_totals).to_frame().reset_index()

In [53]:
# Chain the single-step transition table with itself twice to obtain
# four-title paths: one -> two -> three -> four, each step carrying its own
# transition value.
grouped_jobs.columns = ['one','two','value']


def _relabelled_copy(edges, index_column, labels):
    """Return a copy of `edges` indexed by `index_column` (index named 'idx')
    with its columns renamed to `labels`."""
    relabelled = edges.set_index(edges[index_column].rename('idx'))
    relabelled.columns = labels
    return relabelled


# The same edge table, relabelled for each step of the path.
grouped_jobs1 = _relabelled_copy(grouped_jobs, 'two', ['one','two','value_two'])
grouped_jobs2 = _relabelled_copy(grouped_jobs, 'one', ['two','three','value_three'])
grouped_jobs3 = _relabelled_copy(grouped_jobs, 'one', ['three','four','value_four'])

# Step 1 joined to step 2 on their only shared column, 'two'.
curr_grouped_jobs = grouped_jobs1.merge(grouped_jobs2, on='two')
curr_grouped_jobs = curr_grouped_jobs\
    .set_index(curr_grouped_jobs['three'].rename('idx'))\
    .drop_duplicates()

# Then joined to step 3 on their only shared column, 'three'.
curr_grouped_jobs = curr_grouped_jobs.merge(grouped_jobs3, on='three')

# One row per distinct path; remaining columns are value_two, value_three
# and value_four.
grouped = curr_grouped_jobs.groupby(['one','two','three','four']).first()

In [54]:
def _build_hierarchies(paths):
    """Nest four-level job paths into one tree per starting job title.

    `paths` is a DataFrame indexed by (one, two, three, four) with the columns
    'value_two', 'value_three' and 'value_four' holding the value of the step
    that leads to the second, third and fourth title respectively.

    Returns a dict mapping each starting title to a nested structure
    {"name", "children": [{"name", "value", "children": [...]}]}, where the
    deepest level has only "name" and "value".

    Nodes are looked up by their full path instead of by comparing against
    the previously seen name, so a title that repeats under a different
    parent or branch is still created in the right place.
    """
    trees = {}
    nodes = {}  # path tuple -> already created node at level two / three

    for (parent, child1, child2, child3), row in paths.iterrows():
        root = trees.get(parent)
        if root is None:
            root = trees[parent] = {"name": parent, "children": []}

        child1_key = (parent, child1)
        if child1_key not in nodes:
            nodes[child1_key] = {"name": child1,
                                 "value": round(row['value_two'], 4),
                                 "children": []}
            root["children"].append(nodes[child1_key])

        child2_key = (parent, child1, child2)
        if child2_key not in nodes:
            nodes[child2_key] = {"name": child2,
                                 "value": round(row['value_three'], 4),
                                 "children": []}
            nodes[child1_key]["children"].append(nodes[child2_key])

        # Each index entry is a distinct path, so every row adds one leaf.
        nodes[child2_key]["children"].append({"name": child3,
                                              "value": round(row['value_four'], 4)})

    return trees


# Write one JSON file per starting job title. Writing after all trees are
# built guarantees that the final title gets its file too (a write triggered
# only by "the parent changed" never fires for the last one).
for parent, tree in _build_hierarchies(grouped).items():
    with open(directory+'06_hierarchy_data/'+parent.replace(' ','_')+'_hierarchy.json', 'w') as outfile:
        json.dump(tree, outfile, indent=4)

# End