In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../')

In [55]:
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
%matplotlib inline

from utils.db_util import create_postgres_engine
from utils.query_util import query_org_detail

In [4]:
# Connection/engine object from the project helper; it is handed to
# query_org_detail() for every company lookup further down.
db_conn = create_postgres_engine()

In [5]:
# Pickled feature table that supplies the `company_name` column.
# NOTE(review): absolute, machine-specific path - the notebook cannot run on
# another machine without editing this line.
INPUT_FILE = '/home/sjb/Projects/Research/LinkedIn_OB/data/combined_features/stay_term_and_distances.pkl'

In [6]:
# NOTE(review): unpickling can execute arbitrary code - only load pickle files
# that come from a trusted source.
combined_feature_df = pd.read_pickle(INPUT_FILE)

In [7]:
# Distinct company names; each one is looked up once in the query loop below.
uniq_company_names = combined_feature_df['company_name'].unique()

In [8]:
# An org profile link is assembled as PREFIX + company_name + SUFFIX,
# e.g. '/company/dunkin-donuts?trk=ppro_cprof'.
ORG_PROFILE_SUFFIX = '?trk=ppro_cprof'
ORG_PROFILE_PREFIX = '/company/'

def escape_single_quote(text):
    """Return ``text`` with every single quote doubled.

    Postgres string literals represent a literal ``'`` as ``''``, so the
    returned value can be placed inside a quoted SQL string.
    """
    return text.replace("'", "''")

In [9]:
# Minimum number of occurrences the most frequent org_detail string should
# have before it is considered trustworthy.
MIN_ITEM_COUNT = 100

company_detail_infos = []
# Companies that need a manual look; reported once the loop has finished so a
# single odd company does not abort the (long-running) query loop.
missing_company_names = []      # query returned no rows at all
low_count_company_names = []    # best org_detail seen <= MIN_ITEM_COUNT times

for company_name in tqdm(uniq_company_names):

    org_profile_link = '{}{}{}'.format(ORG_PROFILE_PREFIX, company_name, ORG_PROFILE_SUFFIX)
    # NOTE(review): manual quote-doubling suggests query_org_detail builds its
    # SQL by string formatting; bound parameters there would be safer.
    org_profile_link_escaped = escape_single_quote(org_profile_link)

    org_detail_df = query_org_detail(db_conn, org_profile_link_escaped)
    if org_detail_df.empty:
        # `.iloc[0]` on an empty frame would raise an opaque IndexError.
        missing_company_names.append(company_name)
        continue

    # Keep the org_detail variant with the highest occurrence count.
    org_detail_row = org_detail_df.sort_values('org_detail_count', ascending=False).iloc[0].to_dict()

    # Sanity check. The original bare comparison was evaluated and discarded,
    # so it never checked anything; record the offenders instead.
    if not org_detail_row['org_detail_count'] > MIN_ITEM_COUNT:
        low_count_company_names.append(company_name)

    org_detail_row['company_name'] = company_name
    company_detail_infos.append(org_detail_row)

company_detail_df = pd.DataFrame(company_detail_infos)

if missing_company_names:
    print('WARNING: no org detail rows found for {} companies: {}'.format(
        len(missing_company_names), missing_company_names))
if low_count_company_names:
    print('WARNING: org_detail_count <= {} for {} companies: {}'.format(
        MIN_ITEM_COUNT, len(low_count_company_names), low_count_company_names))

100%|██████████| 1382/1382 [48:06<00:00,  2.09s/it]


In [33]:
# Grab one raw `org_detail` string (row position 3) as a sample to inspect.
temp = company_detail_df['org_detail'].values[3]

In [34]:
# Display the sample string to see the raw ';'-separated layout.
temp

'Public Company; 5001-10,000 employees; COMS;\n\t  \t      Computer Networking industry'

In [38]:
def parse_org_detail(org_detail_string):
    """Split a raw ``org_detail`` string into type, size and industry.

    The string is ';'-separated and is expected to have either three fields
    (type; size; industry) or four fields (type; size; <ignored>; industry).
    Surrounding whitespace of every kept field is stripped.

    Returns:
        pd.Series with keys ``company_type``, ``company_size`` and
        ``company_industry``.

    Raises:
        ValueError: if the string does not split into 3 or 4 fields (the
            offending string is printed first).
    """
    fields = org_detail_string.split(';')
    field_count = len(fields)

    # Guard clause: anything other than 3 or 4 fields is an unknown layout.
    if field_count not in (3, 4):
        print(org_detail_string)
        raise ValueError('split elements size is {} unexpected!'.format(field_count))

    # Type and size always come first and industry always comes last; in the
    # four-field layout the third field is dropped.
    raw_type, raw_size, raw_industry = fields[0], fields[1], fields[-1]

    return pd.Series({
        'company_type': raw_type.strip(),
        'company_size': raw_size.strip(),
        'company_industry': raw_industry.strip(),
    })

In [90]:
# Values of the parsed `company_size` column treated as "large"
# (51-200 employees and above); used below to list the companies outside them.
# NOTE(review): despite the name these are company *size* buckets, not types.
LARGE_COMPANY_TYPES = [
    '10,001+ employees',
    '1001-5000 employees',
    '5001-10,000 employees',
    '501-1000 employees',
    '201-500 employees',
    '51-200 employees',
]

# Manual overrides: org_profile_link -> replacement `org_detail` string in the
# "type; size; industry" layout. They are written into the frame before the
# org_detail column is parsed.
MANUAL_ADJUSTMENT = {
    '/company/dunkin-donuts?trk=ppro_cprof': 'Public Company; 10,001+ employees; Food & Beverages industry',
    '/company/cvs-pharmacy?trk=ppro_cprof': 'Public Company; 10,001+ employees; Pharmaceuticals industry',
    '/company/mci-corporation?trk=ppro_cprof': 'Public Company; 10,001+ employees; Telecommunications industry',
}

In [89]:
# Work on a copy so the queried `company_detail_df` is not modified by the
# overrides and parsing applied to `company_detail_expanded_df`.
company_detail_expanded_df = company_detail_df.copy()

In [92]:
# Overwrite `org_detail` for every row whose profile link has a manual override.
for profile_link, corrected_detail in MANUAL_ADJUSTMENT.items():
    is_overridden_row = company_detail_expanded_df['org_profile_link'] == profile_link
    company_detail_expanded_df.loc[is_overridden_row, 'org_detail'] = corrected_detail

In [100]:
# Parse every `org_detail` string into its three components (one column each),
# then attach those columns to the frame, replacing same-named ones on re-run.
parsed_org_detail_df = company_detail_expanded_df['org_detail'].apply(parse_org_detail)
company_detail_expanded_df = company_detail_expanded_df.assign(**parsed_org_detail_df)

In [105]:
# How many companies fall into each parsed size bucket.
company_detail_expanded_df['company_size'].value_counts()

10,001+ employees        865
1001-5000 employees      258
5001-10,000 employees    218
501-1000 employees        16
201-500 employees         15
Myself Only                5
51-200 employees           3
1-10 employees             1
11-50 employees            1
Name: company_size, dtype: int64

In [102]:
# How many companies there are per parsed company type.
company_detail_expanded_df['company_type'].value_counts()

Public Company             812
Educational Institution    229
Privately Held             198
Nonprofit                   66
Government Agency           59
Partnership                 14
Self-Employed                2
Sole Proprietorship          2
Name: company_type, dtype: int64

In [107]:
# How many companies there are per parsed industry.
company_detail_expanded_df['company_industry'].value_counts()

Higher Education industry                       212
Retail industry                                  94
Financial Services industry                      84
Information Technology and Services industry     79
Hospital & Health Care industry                  64
                                               ... 
Furniture industry                                1
Religious Institutions industry                   1
Consumer Services industry                        1
Legislative Office industry                       1
Museums and Institutions industry                 1
Name: company_industry, Length: 108, dtype: int64

In [114]:
# Destination CSV for the company-level covariates.
# NOTE(review): absolute, machine-specific path - edit before running elsewhere.
DEST_FILE_NAME = '/home/sjb/Projects/Research/LinkedIn_OB/data/covariates/company_level_info.csv'

In [115]:
# The raw string and its occurrence count are superseded by the parsed
# columns, so they are left out of the exported CSV.
raw_detail_columns = ['org_detail', 'org_detail_count']
company_level_info_df = company_detail_expanded_df.drop(columns=raw_detail_columns)
company_level_info_df.to_csv(DEST_FILE_NAME, index=False)

In [117]:
# Names of the companies whose parsed size is not one of LARGE_COMPANY_TYPES.
has_large_size = company_detail_expanded_df['company_size'].isin(LARGE_COMPANY_TYPES)
company_detail_expanded_df.loc[~has_large_size, 'company_name'].values

array(['freelance-graphic-design-services', 'currently-unemployed',
       'self-employed_1100', 'mary-kay-cosmetics_2', 'self-employed_1101',
       'independent-contractor_9', 'amoura-rose-beauty'], dtype=object)