In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import sys
sys.path.append('../../')

In [3]:
import json
import os
from glob import glob
import numpy as np
import pandas as pd

from utils.parse_util import HTMLFileReader, LinkedInProfileParser
from utils.parse_pipeline import parse_profile
import webbrowser

In [4]:
pd.set_option('max_colwidth', 100)

In [5]:
LOG_ROOT = '/home/sjb/Projects/Research/LinkedIn_OB/data/parse_log/'
PARSED_ROOT = '/home/sjb/Projects/Research/LinkedIn_OB/data/parsed_profiles/'

### Log

In [6]:
os.listdir(LOG_ROOT)

['First_1000_1', 'First_1000_0', '2000', '1000_1500']

In [7]:
def read_log_file(sub_dir):

    log_fpaths = glob(os.path.join(sub_dir, '*.json'))
    for fpath in log_fpaths:
        with open(fpath, 'r') as f:
            content = json.load(f)
        yield fpath, content

In [8]:
def _get_zip_dir_from_path(file_path):
    return file_path.split('/')[-2]
    
def _get_parent_dir_from_path(file_path):
    return file_path.split('/')[-1].split('_')[0]

In [9]:
log_items = []

for sub_dir in os.listdir(LOG_ROOT):
    
    sub_dir_full = os.path.join(LOG_ROOT, sub_dir)
    log_file_contents = read_log_file(sub_dir_full)

    for fpath, log_content in log_file_contents:
        log_item = log_content.copy()
        log_item['empty_html_list'] = len(log_item['empty_html_list'])
        log_item['parse_error'] = len(log_item['parse_error'])
        log_item['file_path'] = fpath
        log_item['zip_dir'] = _get_zip_dir_from_path(fpath)
        log_item['parent_dir'] = _get_parent_dir_from_path(fpath)
        
        log_items.append(log_item)

In [10]:
log_items = pd.DataFrame(log_items)

In [11]:
log_items.shape

(1546, 7)

In [12]:
log_items.head()

Unnamed: 0,empty_html_list,parse_error,total_time,num_files,file_path,zip_dir,parent_dir
0,0,1,10.866657,30255,/home/sjb/Projects/Research/LinkedIn_OB/data/parse_log/First_1000_1/882_log.json,First_1000_1,882
1,0,16,15.747177,42652,/home/sjb/Projects/Research/LinkedIn_OB/data/parse_log/First_1000_1/516_log.json,First_1000_1,516
2,0,20,13.235617,31244,/home/sjb/Projects/Research/LinkedIn_OB/data/parse_log/First_1000_1/759_log.json,First_1000_1,759
3,0,1,14.786534,39993,/home/sjb/Projects/Research/LinkedIn_OB/data/parse_log/First_1000_1/634_log.json,First_1000_1,634
4,0,7,14.204176,38477,/home/sjb/Projects/Research/LinkedIn_OB/data/parse_log/First_1000_1/746_log.json,First_1000_1,746


In [14]:
log_items['zip_dir'].value_counts()

First_1000_1    500
First_1000_0    500
1000_1500       435
2000            111
Name: zip_dir, dtype: int64

In [19]:
temp = log_items[['empty_html_list', 'parse_error', 'num_files']].copy()

In [21]:
temp['total_error'] = temp['empty_html_list'] + temp['parse_error']

In [24]:
a = temp.sum()

In [27]:
(a / a['num_files']) * 100

empty_html_list      0.048783
parse_error          0.021085
num_files          100.000000
total_error          0.069868
dtype: float64

In [None]:
(temp / temp['num_files']) * 100

### Sample Log

In [None]:
log_file = '122_log.json'
log_path = os.path.join(LOG_DIR, log_file)
log_path

In [None]:
with open(log_path, 'r') as f:
    log_content = json.load(f)

In [None]:
log_content.keys()

In [None]:
log_content['parse_error']

In [None]:
errored_files = list(log_content['parse_error'].keys())
errored_files

#### Error file check

In [None]:
webbrowser.open('file://' + errored_files[4])

In [None]:
html_reader = HTMLFileReader(errored_files[0])
html_reader.load()
clean_html = html_reader.get_clean_html_text()

In [None]:
li_profile = LinkedInProfileParser(clean_html)
li_profile.parse()

In [None]:
li

### Parsed Dirs

In [None]:
subdirs = os.listdir(PARSED_DIR)

In [None]:
len(subdirs)

### Parsed Profile

In [None]:
parsed_file_paths = glob(os.path.join(PARSED_DIR, '*/*.json'))

In [None]:
len(parsed_file_paths)

In [None]:
sample_parsed_fp = np.random.choice(parsed_file_paths)

In [None]:
with open(sample_parsed_fp, 'r') as f:
    sample_parse_content = json.load(f)

In [None]:
sample_parse_content.keys()

In [None]:
sample_parse_content['summary']