In [None]:
import os
import re

import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.options.display.max_rows = 100

In [None]:
# Earlier input path, kept for reference; the active RAW_FILE below points at
# the "no_walmart" export instead.
# RAW_FILE = '/home/sjb/Projects/Research/LinkedIn_OB/data/word_features/LIWC_idividual_company_mapped.csv'
# CSV of LIWC features that is loaded into `raw_df`.
RAW_FILE = '/home/sjb/Projects/Research/LinkedIn_OB/data/word_features/LIWC_individual_company_mapped_no_walmart.csv'

# Directory the notebook lists and reads `<company>_person_stay_term.csv` files from.
# NOTE(review): both paths are absolute and machine-specific — adjust before running elsewhere.
TARGET_DATA_DIR = '/home/sjb/Projects/Research/LinkedIn_OB/data/company_level_individual_stay_term/'

In [None]:
def get_target_data_company_names():
    """Return the company names that have a stay-term file in TARGET_DATA_DIR.

    A company name is a file name with its trailing ``_person_stay_term.csv``
    removed. Directory entries that do not end with that suffix (for example
    ``.ipynb_checkpoints`` or other stray files) are skipped instead of being
    returned verbatim as if they were company names.

    Returns:
        list[str]: company names in sorted order (``os.listdir`` itself gives
        no ordering guarantee).
    """
    suffix = '_person_stay_term.csv'
    return sorted(
        # Slice the suffix off the end only; str.replace would also remove it
        # if it happened to appear in the middle of a name.
        file_name[:-len(suffix)]
        for file_name in os.listdir(TARGET_DATA_DIR)
        if file_name.endswith(suffix)
    )

In [None]:
# Load the LIWC feature table from RAW_FILE; later cells group it by its
# 'company' column.
raw_df = pd.read_csv(RAW_FILE)

In [None]:
# Base names of the LIWC feature columns (92 entries). check_y_vector_in_liwc
# appends '.y' to each name to select the columns it compares.
# NOTE(review): 'function.' carries a trailing dot — presumably that is how the
# column is spelled in the CSV header; confirm against the file.
LIWC_COLUMN_NAMES = [
    'AllPunc', 'Analytic', 'Apostro', 'Authentic', 'Clout', 'Colon',
    'Comma', 'Dash', 'Dic', 'Exclam', 'OtherP', 'Parenth', 'Period',
    'QMark', 'Quote', 'SemiC', 'Sixltr', 'Tone', 'WPS', 'achieve', 'adj',
    'adverb', 'affect', 'affiliation', 'anger', 'anx', 'article', 'assent',
    'auxverb', 'bio', 'body', 'cause', 'certain', 'cogproc', 'compare',
    'conj', 'death', 'differ', 'discrep', 'drives', 'family', 'feel',
    'female', 'filler', 'focusfuture', 'focuspast', 'focuspresent',
    'friend', 'function.', 'health', 'hear', 'home', 'i', 'informal',
    'ingest', 'insight', 'interrog', 'ipron', 'leisure', 'male', 'money',
    'motion', 'negate', 'negemo', 'netspeak', 'nonflu', 'number', 'percept',
    'posemo', 'power', 'ppron', 'prep', 'pronoun', 'quant', 'relativ',
    'relig', 'reward', 'risk', 'sad', 'see', 'sexual', 'shehe', 'social',
    'space', 'swear', 'tentat', 'they', 'time', 'verb', 'we', 'work',
    'you',
]

In [None]:
# Display how many LIWC feature names are configured.
len(LIWC_COLUMN_NAMES)

In [None]:
def check_person_ids_in_liwc(company_name, liwc_df):
    """Assert that a company's LIWC rows and its stay-term file cover the same people.

    Reads ``<company_name>_person_stay_term.csv`` from ``TARGET_DATA_DIR`` and
    compares its ``person_id`` column with the ids parsed out of
    ``liwc_df['Filename.x']``. An id is parsed from a file name by dropping
    everything up to and including the last ``__`` and then a trailing
    ``.txt``.

    Args:
        company_name: Company whose stay-term file is loaded.
        liwc_df: LIWC rows for that company; must have a ``Filename.x`` column
            holding strings.

    Raises:
        AssertionError: If the two tables have different row counts, or if the
            two sets of person ids are not identical.
    """
    target_path = os.path.join(TARGET_DATA_DIR, f'{company_name}_person_stay_term.csv')
    target_df = pd.read_csv(target_path)

    # Raw strings: '\.' inside a plain literal is an invalid escape sequence
    # and triggers a warning on newer Python versions.
    prefix_pattern = re.compile(r'^.*__')
    suffix_pattern = re.compile(r'\.txt$')

    def _extract_person_id(file_name):
        # Strip the leading '...__' part first, then the '.txt' extension.
        return suffix_pattern.sub('', prefix_pattern.sub('', file_name))

    liwc_person_ids = set(liwc_df['Filename.x'].apply(_extract_person_id))
    # NOTE(review): read_csv infers the dtype of person_id; if the ids were
    # purely numeric they would load as numbers and never equal the string ids
    # parsed above — confirm the ids are non-numeric strings.
    target_person_ids = set(target_df['person_id'])

    assert target_df.shape[0] == liwc_df.shape[0], (
        f'{company_name}: stay-term file has {target_df.shape[0]} rows '
        f'but LIWC data has {liwc_df.shape[0]}'
    )

    mismatched_ids = liwc_person_ids.symmetric_difference(target_person_ids)
    assert len(mismatched_ids) == 0, (
        f'{company_name}: {len(mismatched_ids)} person ids appear in only one source, '
        f'e.g. {sorted(mismatched_ids, key=str)[:10]}'
    )

In [None]:
def check_y_vector_in_liwc(company_name, liwc_df):
    """Verify that every '.y' LIWC column is constant across the rows of liwc_df.

    Each row of the '<name>.y' columns (one per entry of LIWC_COLUMN_NAMES) is
    compared against the first row; the check passes when the largest absolute
    difference is exactly zero.

    Args:
        company_name: Only used to label the error message.
        liwc_df: Frame containing a '<name>.y' column for every LIWC name.

    Raises:
        ValueError: If the largest absolute difference is not equal to zero.
    """
    y_frame = liwc_df[[f'{name}.y' for name in LIWC_COLUMN_NAMES]]
    reference_row = y_frame.iloc[0]

    # Column-wise maximum first, then the maximum over those column maxima.
    abs_deviation = (y_frame - reference_row).abs()
    max_diff = abs_deviation.max().max()

    if max_diff == 0:
        return

    raise ValueError('Raise Error for {} - max_diff is {}'.format(company_name, max_diff))


In [None]:
# Validate each company's slice of the LIWC table; either check raises on the
# first company that fails.
company_groups = raw_df.groupby('company')
for company_name, company_df in tqdm(company_groups):
    # Person ids must match the company's stay-term file.
    check_person_ids_in_liwc(company_name, company_df)

    # The '.y' columns must be identical on every row of the company.
    check_y_vector_in_liwc(company_name, company_df)

In [None]:
# One-off export, left disabled: writes every non-'walmart' row of raw_df to
# the same "no_walmart" CSV path that RAW_FILE points to.
# raw_df[raw_df['company'] != 'walmart'].to_csv('/home/sjb/Projects/Research/LinkedIn_OB/data/word_features/LIWC_individual_company_mapped_no_walmart.csv', 
#                                               index=False)

In [None]:
# Company names derived from the file names found in TARGET_DATA_DIR.
target_data_company_names = get_target_data_company_names()

In [None]:
# Companies that occur in the LIWC table but not among the target-data names.
set(raw_df['company']).difference(target_data_company_names)

In [None]:
# Target-data companies absent from the LIWC table, with 'walmart' excluded,
# as a sorted Series.
temp = pd.Series(
    list(set(target_data_company_names).difference(raw_df['company'], ['walmart']))
).sort_values()

In [None]:
# Write the names to 'temp.csv' (relative to the working directory) as a single
# 'company_name' column, without the index.
temp.to_frame('company_name').to_csv('temp.csv', index=False)