## Load libraries

In [1]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import html2text
from sklearn.model_selection import train_test_split
import html

## Define own functions

In [2]:
def cleanhtml(raw_html):
    """Strip HTML markup, entities and URLs from a string and lowercase it.

    The steps run in this order:

    1. remove tags (``<...>``) and HTML entities (``&name;``, ``&#123;``,
       ``&#x1f;``) -- they are replaced by nothing, so text on either side
       of a tag is joined without a separator;
    2. remove carriage returns, turn newlines into spaces, remove
       non-breaking spaces and runs of ``#``;
    3. unescape whatever entities are left and let BeautifulSoup extract
       the plain text;
    4. remove ``http://`` / ``https://`` URLs and collapse whitespace;
    5. replace remaining URL-like tokens (``www.``, ``url_``,
       ``domain.tld/...``) with a single space;
    6. lowercase the result.

    Parameters
    ----------
    raw_html : str
        Text that may contain HTML markup.

    Returns
    -------
    str
        The cleaned, lowercased text.
    """
    cleanr = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = re.sub('\r', '', cleantext)
    cleantext = re.sub('\n', ' ', cleantext)
    cleantext = re.sub('\xa0', '', cleantext)
    cleantext = re.sub('#*#', '', cleantext)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns about guessing), so the
    # extracted text could differ from one environment to another.
    cleantext = BeautifulSoup(html.unescape(cleantext), 'html.parser').text
    cleantext = re.sub(r"http[s]?://\S+", "", cleantext)
    cleantext = re.sub(r"\s+", " ", cleantext)
    # Catch URL-like tokens that do not start with an http(s) scheme.
    cleantext = re.sub(r'''(?i)\b((?:url_|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»""'']))''', " ", cleantext)
    # Lowercasing happens last, so the case-sensitive patterns above see
    # the text in its original case.
    cleantext = str.lower(cleantext)
    return cleantext

In [9]:
def export_clean_balanced_data(data,text_variable,label_variable,sample_size,SEED,export=True):
    """Clean one text column, draw a class-weighted sample and split it.

    The input frame is copied first, so the caller's DataFrame is left
    untouched and the function can be called repeatedly on the same frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Source table; must contain ``text_variable`` and ``label_variable``.
    text_variable : str
        Name of the text column to clean with ``cleanhtml``.
    label_variable : str
        Name of the class-label column used to weight the sample.
    sample_size : int
        Number of rows to draw. Rows are drawn with weights equal to the
        inverse frequency of their class; ``DataFrame.sample`` is called
        without ``replace``, so (with the pandas default) rows are not
        repeated and the result is only as balanced as the rarest class
        allows.
    SEED : int
        Random state used for both the sampling and the train/test split.
    export : bool, default True
        When true, write ``train_<text_variable>.csv`` and
        ``test_<text_variable>.csv`` (label and text columns, with index)
        to the current working directory.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with the cleaned text column and the helper
        ``freq`` weight column (all other columns are passed through).
    """
    # Copy so that neither the cleaning nor the helper column leaks back
    # into the caller's frame.
    data = data.copy()

    # Cast to string (missing values become the text 'nan') and clean.
    data[text_variable] = data[text_variable].astype(str).map(cleanhtml)

    # Per-row sampling weight: 1 / (number of rows in that row's class).
    class_counts = data[label_variable].value_counts()
    data['freq'] = 1 / data[label_variable].map(class_counts)

    # Weighted sample
    data = data.sample(n=sample_size, weights='freq', random_state=SEED)

    # Rename missing / empty texts to the placeholder 'nann'.
    text_column = data[text_variable]
    data[text_variable] = text_column.mask(text_column.isin(['nan', '']), 'nann')

    # Train/test split, keeping only the label and text columns.
    train, test = train_test_split(data, random_state=SEED)
    train = train[[label_variable,text_variable]]
    test = test[[label_variable,text_variable]]

    # Save out files
    if export:
        train.to_csv('train_{}.csv'.format(text_variable),index=True)
        test.to_csv('test_{}.csv'.format(text_variable),index=True)

    return data

## Import dataset

In [10]:
# Read the raw CSV into `data` and show its first rows
raw_path = 'Data/emscad_v1.csv'
data = pd.read_csv(raw_path)
print(data.head())

                                       title            location department  \
0                           Marketing Intern    US, NY, New York  Marketing   
1  Customer Service - Cloud Video Production      NZ, , Auckland    Success   
2    Commissioning Machinery Assistant (CMA)       US, IA, Wever        NaN   
3          Account Executive - Washington DC  US, DC, Washington      Sales   
4                        Bill Review Manager  US, FL, Fort Worth        NaN   

  salary_range                                    company_profile  \
0          NaN  <h3>We're Food52, and we've created a groundbr...   
1          NaN  <h3>90 Seconds, the worlds Cloud Video Product...   
2          NaN  <h3></h3>\r\n<p>Valor Services provides Workfo...   
3          NaN  <p>Our passion for improving quality of life t...   
4          NaN  <p>SpotSource Solutions LLC is a Global Human ...   

                                         description  \
0  <p>Food52, a fast-growing, James Beard Award-w...  

## Run the function

In [11]:
# Clean the whole dataset: sample_size equals the number of rows and
# export=False, so no train/test files are written by this call
current_text_var = 'description'

dataset2 = export_clean_balanced_data(
    data=data,
    text_variable=current_text_var,
    label_variable='fraudulent',
    sample_size=len(data),
    SEED=3,
    export=False,
)

print(dataset2['fraudulent'].value_counts())

f    17014
t      866
Name: fraudulent, dtype: int64


In [12]:
# Preview the first rows of the frame returned by export_clean_balanced_data
print(dataset2.head())

                                             title            location  \
10239                  Customer Service Team Lead     US, NJ, Somerset   
14344                  Senior Interaction Designer     GB, LND, London   
5610                                 Optical Sales   US, NY, Plainview   
9327                    Senior Electrical Engineer   US, FL, Lake Mary   
17649  Data Entry Clerk / Administrative Assistant  US, DC, Washington   

           department salary_range  \
10239             NaN          NaN   
14344           Tech           NaN   
5610              NaN          NaN   
9327              NaN          NaN   
17649  Administrative     21-63000   

                                         company_profile  \
10239  <p>Novitex Enterprise Solutions, formerly Pitn...   
14344  <p>10 Minutes With is an educational website d...   
5610                                                 NaN   
9327                                                 NaN   
17649                     

In [13]:
# Write the full cleaned dataset to 'full_<text column>.csv', index included
full_path = 'full_{}.csv'.format(current_text_var)
dataset2.to_csv(full_path, index=True)

In [14]:
text_vars = ['description','company_profile','requirements','benefits']

# For every text column: draw a 2000-row weighted sample and export its
# train/test CSV files (export=True)
for text_var in text_vars:
    dataset2 = export_clean_balanced_data(
        data=data,
        text_variable=text_var,
        label_variable='fraudulent',
        sample_size=2000,
        SEED=3,
        export=True,
    )

# Runs once, after the loop has finished: `dataset2` now holds the sample
# for the last entry of text_vars, so only that one is reported
print(dataset2['fraudulent'].value_counts())

f    1294
t     706
Name: fraudulent, dtype: int64


In [15]:
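## Potential stemming (unfinished sketch, never executed: `word_tokenize` is not imported, and the stemmer is created as `porter` but also called as `ps`)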


#from nltk.stem.porter import PorterStemmer
#sample = open("myfile.txt", "r") 
#s = sample.read() 
#tokens = word_tokenize(s)
#data2 = sample[:100]
#porter = PorterStemmer()
#stemmed = [[ps.stem(token) for token in sentence.split(" ")] for sentence in tokens]
#stemmed = [porter.stem(word) for word in tokens]
#print(stemmed[:100])