In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules
# before each cell executes, so edits to project code are picked up live.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
import time

import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split

# Extend the module search path *before* importing the internal package.
# This line previously ran after the import it appears meant to enable.
# NOTE(review): presumably `commons` lives under /cnvrg — confirm.
sys.path.append('/cnvrg')

# internal package
# NOTE(review): the star import hides which names come from commons.func
# (`reshape_df` is presumably one of them); consider importing explicitly.
from commons.func import *


In [3]:
# Show up to 300 characters per cell so long text columns stay readable.
pd.options.display.max_colwidth = 300

In [4]:
# Load the raw job-title dataset. A forward-slash relative path resolves on
# Linux/macOS as well as Windows; the previous backslash form only worked on
# Windows and was inconsistent with the output path used when saving.
df = pd.read_parquet("datasets/job_title_dataset.parquet")

# Drop rows with a missing value in any column first (the two id columns are
# still present at this point and take part in the check), then discard the
# id columns themselves.
df = df.dropna().drop(columns=['JobTitleId', 'JobId'])

# Remove rows whose JobTitle is '0000' or '0'.
# NOTE(review): these look like placeholder labels — confirm.
df = df[~df.JobTitle.isin(['0000', '0'])]
df.head(2)

Unnamed: 0,ExtJobTitleText,JobTitle,Description
0,Business Development Center Agent,Medical Scientist,"For generations, the Landers family has been in the car business.<br/>Steve Landers Chrysler Dodge Jeep continues to build its brand on family values and a philosophy of serving its guests, team members, and communities.<br/>Every Landers team member is absolutely critical to its success.<br/>Ou..."
1,Technician I,Automotive Technician,"<div class=""earcu_posdescriptionContainer""><div class=""earcu_posdescription""><div class=""earcu_posdescriptionnote""><p>Avis Budget Group is an action-packed, high-energy workplace. We are a global leader in the travel services industry operating two of the most recognized brands in the vehicle re..."


In [5]:
# (rows, columns) of the frame after the initial filtering.
df.shape

(30677, 3)

In [6]:
# Column dtypes, non-null counts and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30677 entries, 0 to 31426
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ExtJobTitleText  30677 non-null  object
 1   JobTitle         30677 non-null  object
 2   Description      30677 non-null  object
dtypes: object(3)
memory usage: 958.7+ KB


In [7]:
# Count of missing values in each column.
df.isnull().sum()

ExtJobTitleText    0
JobTitle           0
Description        0
dtype: int64

In [8]:
# Number of distinct JobTitle labels attached to each unique Description,
# ordered from most labels to fewest.
labels_per_doc = (
    df.groupby('Description')['JobTitle']
    .nunique()
    .sort_values(ascending=False)
    .to_frame()
)
# Show only the descriptions that carry more than one label.
labels_per_doc[labels_per_doc['JobTitle'] > 1]

Unnamed: 0_level_0,JobTitle
Description,Unnamed: 1_level_1


In [9]:
# Check for (near-)empty descriptions: anything shorter than 2 characters.
#
# A boolean mask is used rather than positions collected with enumerate():
# DataFrame.drop matches index *labels*, and the earlier filtering left gaps
# in the labels, so positional offsets would remove the wrong rows or raise
# KeyError as soon as any empty description was found.
empty_mask = df['Description'].map(len) < 2

print(empty_mask.sum())
df = df[~empty_mask]
df.shape

0


(30677, 3)

In [10]:
# Keep the first row for each distinct Description, then discard any row that
# still has a missing value.
df = df.drop_duplicates(subset=['Description']).dropna()

In [11]:
def remove_html(text):
    """Strip HTML markup from ``text`` and return the remaining plain text.

    Args:
        text: A string that may contain HTML markup.

    Returns:
        The text content with all tags removed and newline characters deleted.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so results can
    # differ between environments.
    text = BeautifulSoup(text, 'html.parser').get_text()
    # Newlines are deleted outright (not replaced by a space), so text on
    # adjacent lines is joined directly.
    return text.replace('\n', '')

In [12]:
# Strip the HTML tags from every description and report the wall-clock cost.
t0 = time.time()
df['Description'] = df['Description'].apply(remove_html)
elapsed_minutes = (time.time() - t0) / 60
print('execution time in minutes: ', elapsed_minutes)

execution time in minutes:  0.941144343217214


In [13]:
# NOTE(review): reshape_df is not defined in this notebook (presumably it
# comes from commons.func); judging by the argument names it bounds the number
# of rows kept per group — confirm against its implementation.
df = reshape_df(
    df,
    the_min_amount_of_rows=10,
    the_max_amount_of_rows=1000,
)

In [14]:
# Keep only the rows that still have a Description.
df = df.dropna(subset=['Description'])

In [15]:
# (rows, columns) of the frame after HTML cleaning and reshaping.
df.shape

(25405, 3)

In [16]:
# Preview two random rows. The fixed seed makes the preview reproducible
# under Restart & Run All instead of changing on every execution.
df.sample(2, random_state=42)

Unnamed: 0,ExtJobTitleText,JobTitle,Description
29603,School Speech Language Pathologist,Pediatric Speech Language Pathologist,"We’re looking for an exceptional School Speech Language Pathologist for a full-time position in El Dorado, CA who can start immediately and work through the end of the 2021-2022 school year. This therapist must be able to work on-site with elementary, middle, and high school stude"
8005,DIRECT SUPPORT PROFESSIONAL- DAY SUPPORT-HEALTH SERVICES- NEWTON KS,Direct Support Professional (DSP),"Are you driven to serve and help others in your community? Caregivers and Direct Support Professionals (DSP) are the heart of our company with their compassion, dependability, and care. If you want to make an impact by helping people live their best life, read more below and apply today! Provide..."


In [17]:
# Persist the cleaned dataset as CSV; the index is not written.
clean_csv_path = 'datasets/data_clean.csv'
df.to_csv(clean_csv_path, index=False)