In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../')

In [3]:
from datetime import datetime
from glob import glob
import os
import numpy as np
import pandas as pd

from tqdm import tqdm

from utils.db_util import create_postgres_engine, make_query
from utils.query_util import query_person_ids_in_organization
from utils.experience_processor import (
    parse_work_date, parse_work_duration, diff_month, get_person_stay_term,
    is_current_job
)

In [4]:
INPUT_DIR = '/home/sjb/Projects/Research/LinkedIn_OB/data/company_level_individual_profiles'
OUTPUT_DIR = '/home/sjb/Projects/Research/LinkedIn_OB/data/company_level_individual_stay_term'

In [5]:
db_conn = create_postgres_engine()

In [6]:
source_files = glob(os.path.join(INPUT_DIR,
                                 '*.csv'))

In [7]:
sample = pd.read_csv(source_files[75])

In [8]:
org_profile_link = sample['org_profile_link'].values[0]
person_ids = sample['person_id'].tolist()

In [9]:
org_person_df = query_person_ids_in_organization(db_conn, org_profile_link, person_ids)

In [10]:
assert len(set(org_person_df['person_id']).symmetric_difference(set(person_ids))) == 0

In [11]:
len(person_ids), org_person_df.shape[0]

(1984, 2374)

In [12]:
def apply_func_when_col_vals_not_null(func,
                                      r,
                                      arg_cols,
                                      cols,):
    check_null = r[cols]
    
    if check_null.isnull().any():
        return np.nan

    else:
        func_args = [r[c] for c in arg_cols]
        return func(*func_args)

In [13]:
org_person_df['date_start_parsed'] = org_person_df['date_start'].apply(
    lambda x: parse_work_date(x, date_type='start_date'))

org_person_df['date_end_parsed'] = org_person_df['date_end'].apply(
    lambda x: parse_work_date(x, date_type='end_date'))

org_person_df['duration_parsed'] = org_person_df.apply(
    lambda r: apply_func_when_col_vals_not_null(
        parse_work_duration,
        r,
        ['duration'],
        ['date_start_parsed', 'date_end_parsed'],
    ), axis=1)

# org_person_df['duration_calc'] = org_person_df.apply(
#     lambda r: apply_func_when_col_vals_not_null(
#         diff_month,
#         r,
#         ['date_end_parsed', 'date_start_parsed'],
#         ['date_start_parsed'],
#     ), axis=1)

In [14]:
# org_person_df[org_person_df[['date_start_parsed', 'date_end_parsed']].isnull().any(axis=1)].shape

In [15]:
person_stay_term_df = org_person_df.groupby('person_id').apply(get_person_stay_term).reset_index()

In [16]:
person_stay_term_df.columns = ['person_id', 'n_months']

In [17]:
person_stay_term_df[person_stay_term_df['n_months'] < 0]

Unnamed: 0,person_id,n_months


In [18]:
org_person_df[org_person_df['person_id'] == '4203562_85_First_1000_0']

Unnamed: 0,person_id,experience_id,org_name,org_profile_link,org_detail,experience_title,experience_location,experience_description,date_start,date_end,duration,is_current,date_start_parsed,date_end_parsed,duration_parsed
1336,4203562_85_First_1000_0,6,CBS Radio/WODS,/company/cbs.com?trk=ppro_cprof,"Public Company; 10,001+ employees; CBS;\nEnter...",Promotions Intern,,,2007,2007,less than a year,False,2007-02-01,2007-01-01,12.0
1337,4203562_85_First_1000_0,7,CBS Radio/WZLX,/company/cbs.com?trk=ppro_cprof,"Public Company; 10,001+ employees; CBS;\nEnter...",Account Executive,,,May 2007,2007,less than a year,False,2007-05-01,2007-01-01,12.0


In [19]:
min_person_stay_term = person_stay_term_df['n_months'].min() 

if min_person_stay_term < 0:
    raise ValueError('min of person stay term is {}, please investigate'.format(
        min_person_stay_term))

In [20]:
person_stay_term_df['is_current_job'] = person_stay_term_df['person_id'].apply(
    lambda x: is_current_job(x, org_person_df))

In [None]:
set(person_stay_term_df['person_id']).symmetric_difference(set(person_ids))

In [None]:
person_stay_term_df['is_current_job'].sum()

In [None]:
person_stay_term_df['n_months'].isnull().sum()

In [None]:
org_profile_link

In [None]:
person_stay_term_df.shape