In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('fake_job_postings.csv')

In [3]:
df.fillna("Not specified", inplace=True)

In [4]:
def get_rarity_threshold(p):

    if p < 0.01:
        return "very rare"
    elif p < 0.1:
        return "rare"
    elif p < 1:
        return "uncommon"
    elif p < 10:
        return "common"
    else:
        return "common"
    
def assign_rarity(col):
    d = dict(col.value_counts())
    
    # dont include 'not specified' in total count 
    d.pop('Not specified', None)
    total = sum(d.values())  

    mapping = {}

    for k,v in d.items():
        percentage = v * 100 / total
        mapping[k] = get_rarity_threshold(percentage)

    mapping['Not specified'] = 'Not specified'
    
    return col.apply(lambda x: mapping[x])    

def extract_country(location):
    if location != "Not specified":
        country = location.split(',')[0].strip()   
        if len(country):
            return country
    
    return "Not specified"

def extract_state(location):
    if location != "Not specified":
        state = location.split(',')[-1].strip()  
        if len(state):
            return state
    
    return "Not specified"

def process_experience(s):
    if s == 'Internship':
        return 1
    if s == 'Entry level':
        return 2
    if s == 'Associate':
        return 3
    if s == 'Mid-Senior level':
        return 4
    if s in ['Director', 'Executive']:
        return 5
    return 0

def process_education(s):
    
    if s == "Unspecified":
        return "Not specified"
    if "Vocational" in s:
        return "Vocational"
    if "Coursework" in s:
        return "Coursework"
    return s

def rank_education(s):
    if s == 'High School or equivalent':
        return 1
    if s in ['Vocational', 'Professional', 'Coursework', 'Certification']:
        return 2
    if s in ["Associate Degree", "Bachelor's Degree", "Master's Degree", "Doctorate"]:
        return 3
    return 0

In [5]:
df['country'] = df['location'].apply(lambda x: extract_country(x))
df['state'] = df['location'].apply(lambda x: extract_state(x))
df.drop(['location'], axis=1, inplace=True)    
df['country'] = assign_rarity(df['country'])
df['state'] = assign_rarity(df['state'])
df['salary_range'] = df['salary_range'].apply(lambda x: 0 if x == "Not specified" else 1)  
df['required_experience'] = df['required_experience'].apply(lambda x: process_experience(x))
df['required_education'] = df['required_education'].apply(lambda x: process_education(x))
df['required_education'] = df['required_education'].apply(lambda x: rank_education(x))
df['industry'] = assign_rarity(df['industry'])
df['function'] = assign_rarity(df['function'])

In [6]:
df.drop(columns=['department'], inplace=True)

In [7]:
df.columns

Index(['job_id', 'title', 'salary_range', 'company_profile', 'description',
       'requirements', 'benefits', 'telecommuting', 'has_company_logo',
       'has_questions', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'fraudulent', 'country',
       'state'],
      dtype='object')

In [8]:
dummies = pd.get_dummies(df[['employment_type', 'industry','function','country','state']], drop_first=True)
dummies

Unnamed: 0,employment_type_Full-time,employment_type_Not specified,employment_type_Other,employment_type_Part-time,employment_type_Temporary,industry_common,industry_rare,industry_uncommon,industry_very rare,function_common,function_uncommon,country_common,country_rare,country_uncommon,country_very rare,state_common,state_rare,state_uncommon,state_very rare
0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0
1,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
3,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0,0,0
4,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0
17876,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,1,0
17877,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0
17878,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0


In [9]:
df = df.drop(columns=['employment_type', 'industry','function','country','state'])
df = pd.concat([df, dummies], axis=1)
df.head()

Unnamed: 0,job_id,title,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,...,function_common,function_uncommon,country_common,country_rare,country_uncommon,country_very rare,state_common,state_rare,state_uncommon,state_very rare
0,1,Marketing Intern,0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Not specified,0,1,0,...,1,0,1,0,0,0,1,0,0,0
1,2,Customer Service - Cloud Video Production,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,...,1,0,1,0,0,0,1,0,0,0
2,3,Commissioning Machinery Assistant (CMA),0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Not specified,0,1,0,...,0,0,1,0,0,0,0,1,0,0
3,4,Account Executive - Washington DC,0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,...,1,0,1,0,0,0,1,0,0,0
4,5,Bill Review Manager,0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,...,1,0,1,0,0,0,0,0,1,0


### Textual

In [10]:
import enchant

import html
import re

def clean_text(s):
    
    if len(s.strip()) == 0 or s == 'NA' or s == 'N/A' or s == "NaN" or type(s) != str:
        s = "Not specified"
    
    # remove html entites 
    s = html.unescape(s)
    
    # remove non ascii
    pattern = re.compile(r'[^\x00-\x7F]+')
    s = pattern.sub('', s)    
    
    # remove newline, tab etc 
    s = re.sub(r'[\n\r\t]+', '', s)
    
    # remove special stuff enclosed within hashtags
    s = re.sub(r'#\w+#', '', s)
    
    return s

def count_special_characters(s):
    count = 0
    for char in s:
        if not char.isalnum() and char != ' ':
            count += 1
    return count

dictionary = enchant.Dict("en_US") 
def spellchecker(word):

    if not word.isalnum():
        return 0
    
    if dictionary.check(word):
        return 0
    
    return 1

def check_spelling(tokens):
    count = 0
    
    for t in tokens:
        count += spellchecker(t)
        
    return count

In [11]:
text_cols = ['title', 'company_profile', 'description', 'requirements', 'benefits']

for c in text_cols:
    df[c] = df[c].apply(lambda x: clean_text(x))
    print('Done with ' + c)

Done with title
Done with company_profile
Done with description
Done with requirements
Done with benefits


In [12]:
import nltk
# nltk.download('punkt')

from nltk.tokenize import word_tokenize
text_cols_tokens = []
for c in text_cols:
    df[c + '_tokens'] = df[c].apply(lambda x: word_tokenize(x))
    text_cols_tokens.append(c + '_tokens')
    print('Done with ' + c)

Done with title
Done with company_profile
Done with description
Done with requirements
Done with benefits


In [13]:
for c in text_cols_tokens:
    df['spelling_errors_' + c[:-7]] = df[c].apply(lambda x: check_spelling(x))
    print('Done with ' + c)

Done with title_tokens
Done with company_profile_tokens
Done with description_tokens
Done with requirements_tokens
Done with benefits_tokens


In [14]:
for c in text_cols:
    df['special_characters_' + c] = df[c].apply(lambda x: count_special_characters(x))
    print('Done with ' + c)

Done with title
Done with company_profile
Done with description
Done with requirements
Done with benefits


In [15]:
df.head()

Unnamed: 0,job_id,title,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,...,spelling_errors_title,spelling_errors_company_profile,spelling_errors_description,spelling_errors_requirements,spelling_errors_benefits,special_characters_title,special_characters_company_profile,special_characters_description,special_characters_requirements,special_characters_benefits
0,1,Marketing Intern,0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Not specified,0,1,0,...,0,7,11,9,0,0,31,23,19,0
1,2,Customer Service - Cloud Video Production,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,...,0,4,10,7,8,1,34,51,26,22
2,3,Commissioning Machinery Assistant (CMA),0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Not specified,0,1,0,...,1,1,0,0,0,2,10,7,24,0
3,4,Account Executive - Washington DC,0,Our passion for improving quality of life thro...,THE COMPANY: ESRI Environmental Systems Resea...,"EDUCATION:Bachelors or Masters in GIS, busines...",Our culture is anything but corporatewe have a...,0,1,0,...,0,3,25,14,13,1,17,35,23,13
4,5,Bill Review Manager,0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,...,0,12,11,13,0,0,37,35,12,0


In [16]:
for c in text_cols:
    df[c + '_length'] = df[c].apply(lambda x: len(x))
    print('Done with ' + c)

Done with title
Done with company_profile
Done with description
Done with requirements
Done with benefits


In [17]:
df['fraudulent'].value_counts()

0    17014
1      866
Name: fraudulent, dtype: int64

In [18]:
df.head()

Unnamed: 0,job_id,title,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,...,special_characters_title,special_characters_company_profile,special_characters_description,special_characters_requirements,special_characters_benefits,title_length,company_profile_length,description_length,requirements_length,benefits_length
0,1,Marketing Intern,0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,Not specified,0,1,0,...,0,31,23,19,0,16,885,901,852,13
1,2,Customer Service - Cloud Video Production,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,...,1,34,51,26,22,41,1001,1994,1345,1287
2,3,Commissioning Machinery Assistant (CMA),0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,Not specified,0,1,0,...,2,10,7,24,0,39,865,355,1360,13
3,4,Account Executive - Washington DC,0,Our passion for improving quality of life thro...,THE COMPANY: ESRI Environmental Systems Resea...,"EDUCATION:Bachelors or Masters in GIS, busines...",Our culture is anything but corporatewe have a...,0,1,0,...,1,17,35,23,13,33,611,2592,1425,779
4,5,Bill Review Manager,0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,...,0,37,35,12,0,19,1553,1445,752,21


In [19]:
df.drop(columns=['job_id']).to_csv('./cleaned.csv')

In [20]:
df = pd.read_csv('./cleaned.csv')

In [21]:
df.fillna("Not specified", inplace=True)

In [22]:
numerical = ['spelling_errors_title',
       'spelling_errors_company_profile', 'spelling_errors_description',
       'spelling_errors_requirements', 'spelling_errors_benefits',
       'special_characters_title', 'special_characters_company_profile',
       'special_characters_description', 'special_characters_requirements',
       'special_characters_benefits', 'title_length', 'company_profile_length',
       'description_length', 'requirements_length', 'benefits_length']

In [23]:
df_train, df_test = train_test_split(df, test_size=0.3, random_state=4222, stratify=df['fraudulent'])
df_valid, df_test = train_test_split(df_test, test_size=0.5, random_state=4222, stratify=df_test['fraudulent'])

In [24]:
from sklearn.preprocessing import StandardScaler

def normalise_numerical(df):
    scaled_features = StandardScaler().fit_transform(df[numerical])
    scaled_df = pd.DataFrame(scaled_features, index = df.index, columns = numerical)
    df = pd.concat([df.drop(columns=numerical), scaled_df], axis=1)
    return df

In [25]:
df_train = normalise_numerical(df_train)
df_valid = normalise_numerical(df_valid)
df_test = normalise_numerical(df_test)

In [26]:
df_train.drop(columns=['Unnamed: 0'], inplace=True)
df_valid.drop(columns=['Unnamed: 0'], inplace=True)
df_test.drop(columns=['Unnamed: 0'], inplace=True)

In [27]:
print(df_train.iloc[0])

title                                                Part Time Creative Project Manager
salary_range                                                                          0
company_profile                       NoSleepForSheep is a Nashville-based creative ...
description                           Are you passionate about making people happy a...
requirements                          1+ year project management experienceException...
benefits                              Flexible schedule.Opportunity to work with a s...
telecommuting                                                                         0
has_company_logo                                                                      1
has_questions                                                                         1
required_experience                                                                   0
required_education                                                                    0
fraudulent                      

In [28]:
df_train.to_csv("./train.csv")
df_valid.to_csv("./valid.csv")
df_test.to_csv("./test.csv")