## Load libraries

In [0]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import html2text
from sklearn.model_selection import train_test_split
import html

## Define own functions

In [0]:
def cleanhtml(raw_html):
    """Cleans text using regular expressions"""
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = re.sub('\r', '', cleantext)
    cleantext = re.sub('\n', ' ', cleantext)
    cleantext = re.sub('\xa0', '', cleantext)
    cleantext = re.sub('#*#', '', cleantext)
    cleantext = BeautifulSoup(html.unescape(cleantext)).text
    cleantext = re.sub(r"http[s]?://\S+", "", cleantext)
    cleantext = re.sub(r"\s+", " ", cleantext)  
    cleantext = re.sub(r'''(?i)\b((?:url_|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", cleantext)
    cleantext = str.lower(cleantext)
    return cleantext

In [0]:
def export_clean_balanced_data(data,text_variable,label_variable,sample_size,SEED,export=True):
    """Creates cleaned and balanced training and test dataset"""
    # Casting into String
    data[text_variable] = data[text_variable].astype(str)
    data[text_variable] = [cleanhtml(i) for i in data[text_variable] ]
    
    # Temporary table with 'population' class imbalance
    temp_table = data[label_variable].value_counts()
    
    # Create new column for class frequency
    data['freq'] = [1/temp_table[i] for i in data[label_variable]]
    
    # Get weighted sample
    data=data.sample(n=sample_size, weights='freq', random_state=SEED)
    
    # Change missing value name
    data[text_variable] = ['nann' if i=='nan' else i for i in data[text_variable]]
    data[text_variable] = ['nann' if i=='' else i for i in data[text_variable]]
    
    # Train/Test Split
    train, test = train_test_split(data, random_state=SEED)
    train = train[[label_variable,text_variable]]
    test = test[[label_variable,text_variable]]
    
    # Save out files
    if export:
        train.to_csv('train_{}.csv'.format(text_variable),index=True)
        test.to_csv('test_{}.csv'.format(text_variable),index=True)
    
    return data

## Import dataset

In [0]:
# Import raw datafile
data = pd.read_csv('Data/emscad_v1.csv')

# Inspect dataframe
print(data.head())

                                       title            location department  \
0                           Marketing Intern    US, NY, New York  Marketing   
1  Customer Service - Cloud Video Production      NZ, , Auckland    Success   
2    Commissioning Machinery Assistant (CMA)       US, IA, Wever        NaN   
3          Account Executive - Washington DC  US, DC, Washington      Sales   
4                        Bill Review Manager  US, FL, Fort Worth        NaN   

  salary_range                                    company_profile  \
0          NaN  <h3>We're Food52, and we've created a groundbr...   
1          NaN  <h3>90 Seconds, the worlds Cloud Video Product...   
2          NaN  <h3></h3>\r\n<p>Valor Services provides Workfo...   
3          NaN  <p>Our passion for improving quality of life t...   
4          NaN  <p>SpotSource Solutions LLC is a Global Human ...   

                                         description  \
0  <p>Food52, a fast-growing, James Beard Award-w...  

## Replicate dataset from the study

In [0]:
# Filter for indicator variable
data_study = data[data.in_balanced_dataset=='t']

# Inspection/sanity check 
print(data_study.shape)

# Sanity check for class balance
print(data_study['fraudulent'].value_counts())

(900, 18)
t    450
f    450
Name: fraudulent, dtype: int64


In [0]:
# Define variable names
text_vars = ['description','benefits','requirements','company_profile']
label_var = ['fraudulent']
data_study = data_study[['fraudulent','description','benefits','requirements','company_profile']]



# Clean text variables
for text_variable in text_vars:
    data_study[text_variable] = data_study[text_variable].astype(str)
    data_study[text_variable] = [cleanhtml(i) for i in data_study[text_variable] ]
    # Change missing value name
    data_study[text_variable] = ['nann' if i=='nan' else i for i in data_study[text_variable]]
    data_study[text_variable] = ['nann' if i=='' else i for i in data_study[text_variable]]
    
    
# Inspect new dataset
data_study.head()

Unnamed: 0,fraudulent,description,benefits,requirements,company_profile
144,t,the group has raised a fund for the purchase o...,nann,nann,nann
180,t,sales executive,sales executive,sales executive,nann
493,t,"a newly established company seeks outgoing, fr...",nann,"must have good knowledge of outlook, microsoft...",nann
1152,t,administrative assistantessential job responsi...,nann,nann,nann
1297,f,normal 0 false false false en-us x-none x-none...,nann,normal 0 false false false en-us x-none x-none...,nann


In [0]:
# Save versions of full cleaned study dataset
for current_text_var in text_vars:
    data_study1 = data_study[['fraudulent',current_text_var]]
    data_study1.to_csv('study_{}.csv'.format(current_text_var),index=False)

## Remove duplicate observations

In [0]:
# Everything is defined to be a duplicate if the description is the same!
data = data.drop_duplicates(subset='description')

# Check new dimension
print(data.shape)

(15095, 18)


In [0]:
# Check class imbalance
data['fraudulent'].value_counts()

f    14436
t      659
Name: fraudulent, dtype: int64

## Create and export cleaned dataframes (our own "curated" dataset)

### More balanced sample for each text variable

In [0]:
# Loop to create and export separate cleaned dataframes for each 
text_vars = ['description','company_profile','requirements','benefits']

for text_var in text_vars:
    dataset2 = export_clean_balanced_data(data=data,
                                      text_variable=text_var,
                                      label_variable='fraudulent',sample_size=2000,SEED=3,export=True)

# Check class imbalance in our own "balanced dataset"
print(dataset2['fraudulent'].value_counts())

f    1414
t     586
Name: fraudulent, dtype: int64


### Full dataset

In [0]:
# Save subset of full, complete, imbalanced cleaned dataset
dataset3 = dataset2[['fraudulent',current_text_var]]
dataset3.to_csv('full_{}.csv'.format(current_text_var),index=False)