In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

In [3]:
# Put the directory two levels up on the import path (presumably so the
# project's `utils` package imported further down resolves).
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path every time.
if '../../' not in sys.path:
    sys.path.append('../../')

In [94]:
from glob import glob
import os
import shutil

import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from tqdm import tqdm

from utils.parse_util import HTMLFileReader, LinkedInProfileParser

In [5]:
# Folder (relative path) whose entries are listed below as profile directories.
DATA_DIR = '../../../../data/sample_unzip/'

#### `DATA_DIR` currently contains the unzipped contents of `2000.zip`

In [6]:
# Collect the profile sub-directories of DATA_DIR.
# Besides skipping `.zip` archives, only real directories are kept: a stray
# file (e.g. a hidden OS file) would otherwise break the integer conversion
# of the names and the per-directory listing done in later cells.
profile_dirs = [
    name
    for name in os.listdir(DATA_DIR)
    if not name.endswith('.zip')
    and os.path.isdir(os.path.join(DATA_DIR, name))
]

In [7]:
# Number of profile directories found.
len(profile_dirs)

111

In [8]:
# Directory names converted to integers, in ascending order.
profile_dirs_check = sorted(int(name) for name in profile_dirs)

In [9]:
# The list is sorted ascending, so its two ends are the smallest and the
# largest directory number.
profile_dirs_min, profile_dirs_max = profile_dirs_check[0], profile_dirs_check[-1]
(profile_dirs_min, profile_dirs_max)

(2026, 2136)

In [10]:
# Every integer from the smallest to the largest directory number, inclusive.
consec_dirs = [*range(profile_dirs_min, profile_dirs_max + 1)]

In [11]:
# Numbers present in exactly one of the two collections; an empty set means
# the directory numbers form an unbroken range.
set(consec_dirs) ^ set(profile_dirs_check)

set()

#### The symmetric difference is empty, so the folders are numbered consecutively from 2026 to 2136

In [12]:
# Order the directory names numerically rather than lexicographically.
profile_dirs = sorted(profile_dirs, key=int)

### Get Basic Stats

In [13]:
# Walk every profile directory once, recording how many files it holds and
# accumulating the set of file extensions seen across all directories.
infos = []
uniq_extensions = set()

for profile_dir in tqdm(profile_dirs):
    files = os.listdir(os.path.join(DATA_DIR, profile_dir))
    infos.append({'dir': profile_dir, 'n_files': len(files)})
    uniq_extensions.update(os.path.splitext(f)[1] for f in files)

100%|██████████| 111/111 [00:05<00:00, 21.01it/s]


In [14]:
# Distinct file extensions found across all profile directories.
uniq_extensions

{'.html'}

In [15]:
# One row per profile directory: its name and the number of files inside it.
stat_df = pd.DataFrame(data=infos)
stat_df.head(n=5)

Unnamed: 0,dir,n_files
0,2026,9329
1,2027,49261
2,2028,49331
3,2029,49152
4,2030,48780


In [16]:
# Total number of files across all profile directories.
# Reuses the `stat_df` frame built from `infos` in the previous cell instead
# of constructing a second, identical DataFrame just to sum one column.
stat_df['n_files'].sum()

5096381

### Test parsing sample profiles

In [395]:
# Profile directory (under DATA_DIR) that the sample file is taken from.
sample_profile_dir = '2026'

In [407]:
# File name of the single profile page used to try out the parser.
sample_file_name = '101290520.html'

In [408]:
# Full path of the sample page: <DATA_DIR>/<profile dir>/<file name>.
sample_path_parts = (DATA_DIR, sample_profile_dir, sample_file_name)
sample_file_path = os.path.join(*sample_path_parts)

In [409]:
# Read the sample file with the project's HTMLFileReader and fetch its
# cleaned HTML text.
# NOTE(review): what "clean" removes is defined in utils.parse_util, which is
# not visible here — confirm before relying on it.
html_reader = HTMLFileReader(sample_file_path)
html_reader.load()
clean_html = html_reader.get_clean_html_text()

In [410]:
# Build a parser over the cleaned HTML and run it.
li_profile = LinkedInProfileParser(clean_html)

li_profile.parse()

# Formatted result of the parse; displayed in a later cell as a nested dict.
li_profile_dict = li_profile.get_formatted_content()

In [411]:
# Look up an <a> element with class "company-profile-public" inside the
# parsed experience section.
# NOTE(review): `root.find` looks like a BeautifulSoup lookup, which would
# yield None when nothing matches — confirm, since the next cell
# dereferences the result unconditionally.
temp = li_profile.experience_section.root.find('a', {'class':'company-profile-public'})

In [412]:
# Inspect the Python type of the link's `href` attribute value.
type(temp.attrs['href'])

str

In [413]:
# Display the formatted profile as a nested dict.
# NOTE(review): the rendered output contains a real person's name, location
# and work history — clear or redact it before sharing this notebook.
li_profile_dict

{'header': {'name': {'given_name': 'Mary K', 'family_name': 'Leonard'},
  'title': 'Student at Grand Valley State University',
  'location': 'Holland, Michigan',
  'industry': None},
 'overview': {'current': None,
  'past': [{'role': 'Student Research Assistant',
    'at': 'Community Research Institute;Johnson Center for Philanthropy'},
   {'role': 'Receptionist/Intake Specialist', 'at': 'J & M Services'},
   {'role': 'Administrative Assistant (Adecco Staffing)',
    'at': 'Herman Miller'},
   {'role': 'Accounting Clerk', 'at': 'Louis Padnos Iron & Metal Company'},
   {'role': 'Commodity Accounting Supervisor',
    'at': 'Louis Padnos Iron & Metal Company'}],
  'education': ['Grand Valley State University',
   'Grand Rapids Community College'],
  'connections': '31',
  'recommendations': None,
  'websites': None},
 'summary': None,
 'specialty': None,
 'experience': [{'title': 'Student Research Assistant',
   'org_summary': 'Community Research Institute;Johnson Center for Philanthropy'

In [414]:
# Education section of the parsed profile, rendered as a DataFrame.
li_profile.education_section.get_formatted_content_as_df()

Unnamed: 0,org_name,degree,major,period.date_start,period.date_end,education_detail.Grade,education_detail.text,education_detail.Activities and Societies
0,Grand Valley State University,Bachelor of Science (B.S.),Statistics,2006,2013,3.75,Major: Statistics\nMinor: Mathematics\nTook th...,Honor Society of Phi Kappa Phi\nMu Sigma Rho S...
1,Grand Rapids Community College,Associates of Arts,MACRO for transfer to 4-year college,1996,2005,4.0,"Took general education requirement courses, as...",Phi Theta Kappa Honor Society


In [415]:
# Summary section content; nothing is displayed for this sample profile
# (the formatted dict above has 'summary': None).
li_profile.summary_section.get_formatted_content()

In [416]:
# Experience section of the parsed profile, rendered as a DataFrame.
li_profile.experience_section.get_formatted_content_as_df()

Unnamed: 0,title,org_summary,company_profile,org_detail,location,description,period.date_start,period.date_end,period.duration,period.text
0,Student Research Assistant,Community Research Institute;Johnson Center fo...,,,"Pew Campus at Grand Valley State University, G...",Assisted researchers with cleaning and merging...,May 2008,September 2009,(1 year 5 months),
1,Receptionist/Intake Specialist,J & M Services,,,"Holland, Michigan",Staffing Company; Processed job applicants for...,September 2001,July 2002,(11 months),
2,Administrative Assistant (Adecco Staffing),Herman Miller,/company/herman-miller?trk=ppro_cprof,"Public Company; 5001-10,000 employees; MLHR;\n...","Zeeland, Michigan",Corporate Catering Department; answered the ph...,November 2000,August 2001,(10 months),
3,Accounting Clerk,Louis Padnos Iron & Metal Company,/company/louis-padnos-iron-&-metal-company?trk...,Privately Held; 201-500 employees;\n\t \t ...,"Holland, Michigan",Filled in for absent Commodity Accounting pers...,October 1997,April 2000,(2 years 7 months),
4,Commodity Accounting Supervisor,Louis Padnos Iron & Metal Company,/company/louis-padnos-iron-&-metal-company?trk...,Privately Held; 201-500 employees;\n\t \t ...,"Holland, Michigan",Matched and input documents for scrap invoices...,January 1985,May 1990,(5 years 5 months),
