In [1]:
from glob import glob
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
LIWC_VECTOR_ROOT_DIR = '/home/sjb/Projects/Research/LinkedIn_OB/data/word_features'
LIWC_DISTANCE_SAVE_SUBDIR = 'company_level_liwc/distances'
LIWC_DISTANCE_FILE_FORMAT = '{company_name}_distance.npz'

PERSON_STAY_DIR = '/home/sjb/Projects/Research/LinkedIn_OB/data/company_level_individual_stay_term/'
PERSON_STAY_TERM_FILE_FORMAT = '{company_name}_person_stay_term.csv'

In [3]:
liwc_distance_file_paths = glob(os.path.join(LIWC_VECTOR_ROOT_DIR, 
                                             LIWC_DISTANCE_SAVE_SUBDIR, '*_distance.npz'))

In [4]:
def _extract_company_name_from_distance_file(distance_file_path):
    return os.path.basename(distance_file_path).replace('_distance.npz', '')

In [5]:
len(liwc_distance_file_paths)

1382

In [6]:
COMPANY_PERSON_IDS_KEY = 'company_person_ids'

DISTANCE_KEYS = [
    'cosine_standardized',
    'cosine',
    'euclidean_standardized',
    'euclidean',
    'mahalonobis_standardized',
    'mahalonobis',
    'js_divergence_standardized',
    'js_divergence'
]

In [7]:
dfs = []

In [8]:
expected_liwc_data_keys = set([COMPANY_PERSON_IDS_KEY] + DISTANCE_KEYS)

for liwc_distance_file_path in tqdm(liwc_distance_file_paths):
    
    company_name = _extract_company_name_from_distance_file(liwc_distance_file_path)
    
    distance_data = np.load(liwc_distance_file_path, allow_pickle=True)

    distance_data_keys = set(distance_data.keys())
    
    # sanity check
    assert len(distance_data_keys.symmetric_difference(expected_liwc_data_keys)) == 0
    
    distance_data_dict = {}
    
    for k in expected_liwc_data_keys:
        distance_data_dict[k] = distance_data[k]
    
    distance_data_df = pd.DataFrame(distance_data_dict)
    
    person_stay_file_path = os.path.join(PERSON_STAY_DIR, PERSON_STAY_TERM_FILE_FORMAT).format(company_name=company_name)
    person_stay_df = pd.read_csv(person_stay_file_path)
    person_stay_df['company_person_ids'] = company_name + '__' + person_stay_df['person_id']
    
    # sanity check
    assert person_stay_df.shape[0] == distance_data_df.shape[0]
    assert len(set(person_stay_df['company_person_ids']).symmetric_difference(set(distance_data_df['company_person_ids']))) == 0
    
    company_level_info_df = person_stay_df.merge(distance_data_df, on='company_person_ids').drop('person_id', axis=1)
    company_level_info_df['company_name'] = company_name
    
    dfs.append(company_level_info_df)

100%|██████████| 1382/1382 [00:24<00:00, 56.54it/s]


In [9]:
all_df = pd.concat(dfs)

In [11]:
all_df.shape

(4307998, 12)

In [14]:
all_df.head()

Unnamed: 0,n_months,is_current_job,company_person_ids,euclidean,cosine_standardized,mahalonobis,euclidean_standardized,mahalonobis_standardized,js_divergence,cosine,js_divergence_standardized,company_name
0,36.0,False,the-leukemia-&-lymphoma-society__10053159_202_...,30.4218,0.106071,10.030937,9.225756,10.030937,0.20997,0.013549,0.372366,the-leukemia-&-lymphoma-society
1,6.0,False,the-leukemia-&-lymphoma-society__10099087_202_...,34.184982,0.060918,8.223884,6.896046,8.223884,0.197999,0.015253,0.341049,the-leukemia-&-lymphoma-society
2,3.0,False,the-leukemia-&-lymphoma-society__101376526_202...,43.699405,0.055393,8.590643,6.393712,8.590643,0.194588,0.024669,0.347662,the-leukemia-&-lymphoma-society
3,5.0,False,the-leukemia-&-lymphoma-society__101445195_202...,72.306364,0.057917,6.808498,6.648091,6.808498,0.206825,0.073217,0.278581,the-leukemia-&-lymphoma-society
4,8.0,False,the-leukemia-&-lymphoma-society__101497839_203...,80.857131,0.070323,8.072191,7.248587,8.072191,0.248206,0.087317,0.33853,the-leukemia-&-lymphoma-society


In [15]:
all_df['company_name'].nunique()

1382

In [16]:
all_df.to_pickle('../../../../data/combined_features/stay_term_and_distances.pkl')

In [17]:
all_df.to_csv('../../../../data/combined_features/stay_term_and_distances.csv', index=False)