In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import sys
sys.path.append('../../')

In [None]:
import pandas as pd
import spacy
from tqdm import tqdm

from db_config import PERSON_SUMMARY_LENGTH_TABLE
from utils.db_util import create_postgres_engine, write_to_db
from utils.query_util import query_person_summary_in_parent_folder

In [None]:
# Pickle file that is read further down with pd.read_pickle; its 'parent_folder'
# entry supplies the folders the run loop iterates over.
# The path is relative, so it resolves against the notebook's working directory.
ALL_PARENT_FOLDER_FILE = '../../data/all_parent_folders.pkl'

In [None]:
# Shared database handle, passed to every query and write in this notebook.
# NOTE(review): presumably a SQLAlchemy engine given the helper's name — confirm in utils.db_util.
db_conn = create_postgres_engine()

In [None]:
# Load spaCy's small English pipeline once; `nlp` is reused below to tokenize summaries.
nlp = spacy.load("en_core_web_sm")

### Sample

In [None]:
# Fetch the person summaries for a single hard-coded parent folder (1337) to experiment on
# before running the full pipeline.
sample_person_df = query_person_summary_in_parent_folder(db_conn, 1337)

In [None]:
# Size of the sample result, to see how much data one parent folder returns.
sample_person_df.shape

### Tokenize

In [None]:

# doc = nlp(u"Apple is looking at buying U.K. startup for $1 billion")
# for token in doc:
#     print(token.text, token.pos_, token.dep_)

In [None]:
# Pick one summary at random to inspect.
# No random_state is passed, so a different text may be chosen on every run.
sample_text = sample_person_df['person_summary'].sample(1).iloc[0]

In [None]:
# Show the sampled summary text.
# NOTE(review): this output may contain personal data — consider clearing it before sharing the notebook.
sample_text

In [None]:
# Number of spaCy tokens in the sampled summary.
len(nlp(sample_text))

In [None]:
# First ten tokens, to eyeball how spaCy splits the text.
list(nlp(sample_text))[:10]

### Pipeline

In [None]:
def get_token_length(text):
    """Return the number of spaCy tokens in ``text``.

    Only the tokenizer of the module-level ``nlp`` pipeline is run (via
    ``nlp.make_doc``). Calling ``nlp(text)`` would additionally run every
    pipeline component (tagging, parsing, NER, ...), none of which is needed
    to count tokens, so this is considerably cheaper when applied to a whole
    column of summaries.

    NOTE(review): the count matches ``len(nlp(text))`` as long as no pipeline
    component merges or splits tokens; the stock ``en_core_web_sm`` pipeline
    loaded in this notebook is not expected to — confirm if components are
    added to ``nlp``.

    Args:
        text: The string to tokenize.

    Returns:
        int: Number of tokens produced by the ``nlp`` tokenizer for ``text``.
    """
    return len(nlp.make_doc(text))

In [None]:
# %%time

# sample_person_df['person_summary'].apply(get_token_length)

In [None]:
def get_person_summary_stat_df(person_df):
    """Build a per-person table of summary lengths.

    Args:
        person_df: DataFrame with at least the columns ``person_id`` and
            ``person_summary``. It is not modified; the result is built on
            a copy.

    Returns:
        A new DataFrame with exactly the columns ``person_id``,
        ``person_summary``, ``word_length`` (result of ``get_token_length``
        on each summary) and ``char_length`` (``.str.len()`` of each
        summary), in that order.
    """
    summaries = person_df['person_summary']
    output_columns = ['person_id', 'person_summary', 'word_length', 'char_length']

    # assign() works on a copy of person_df, so the caller's frame stays untouched.
    with_lengths = person_df.assign(
        word_length=summaries.apply(get_token_length),
        char_length=summaries.str.len(),
    )
    return with_lengths[output_columns]

In [None]:
# Parent folders to process in the run loop, taken from the 'parent_folder' entry of the pickled object.
# SECURITY: pd.read_pickle unpickles the file, and unpickling can execute arbitrary code —
# only load this file if it comes from a trusted source.
all_parent_folders = pd.read_pickle(ALL_PARENT_FOLDER_FILE)['parent_folder']

### Run

In [None]:
# For every parent folder: query its person summaries, compute the length
# statistics and append them to PERSON_SUMMARY_LENGTH_TABLE in the 'linkedin' schema.
# NOTE(review): the write uses if_exists='append', so re-running this cell
# presumably inserts the same rows again — confirm against write_to_db and
# clear the table before a re-run if so.
for parent_folder in tqdm(all_parent_folders):
    person_df = query_person_summary_in_parent_folder(db_conn, parent_folder)

    # tqdm.write (instead of print) keeps log lines from interleaving with
    # and breaking up the progress bar.
    tqdm.write('Running for {}, data size {}'.format(parent_folder, person_df.shape[0]))

    person_summary_stat_df = get_person_summary_stat_df(person_df)

    write_to_db(person_summary_stat_df,
                db_conn,
                PERSON_SUMMARY_LENGTH_TABLE, schema='linkedin', if_exists='append')