In [1]:
# Import libraries
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

In [2]:
# Load the raw job-postings CSV; `original_df` is the frame every later cell inspects.
original_df = pd.read_csv('./original dataset/fake_job_postings.csv')

# Quick preview: dimensions as plain text, then column dtypes and the first rows.
print("Shape:", original_df.shape)
display(original_df.dtypes, original_df.head())

Shape: (17880, 18)


job_id                  int64
title                  object
location               object
department             object
salary_range           object
company_profile        object
description            object
requirements           object
benefits               object
telecommuting           int64
has_company_logo        int64
has_questions           int64
employment_type        object
required_experience    object
required_education     object
industry               object
function               object
fraudulent              int64
dtype: object

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


## Inspect Column Data

In [3]:
# Number of missing (NaN) entries in each column
original_df.isna().sum(axis="index")

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [4]:
# Raise NumPy's summarisation threshold so printed arrays are shown in full
# instead of being abbreviated with "..."; this is a global setting and
# affects every array printed after this cell runs.
np.set_printoptions(threshold=sys.maxsize)

def print_number_unique(col: pd.Series):
    """Print how many distinct non-null values ``col`` contains.

    Args:
        col: Column to summarise. Missing entries (NaN/None) are not counted.
    """
    distinct_count = col.nunique()
    print("Number unique:", distinct_count)

def print_unique_values(col: pd.Series):
    """Print the distinct non-null values of ``col`` in ascending order.

    Args:
        col: Column to list. Missing entries (NaN/None) are dropped before
            the distinct values are collected and sorted.
    """
    present = col.dropna()
    distinct = present.unique()
    print(np.sort(distinct))

In [5]:
# `location`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['location'])
print_unique_values(original_df['location'])
# Observations:
# usually country code (US, GB, AU), region/state code (AZ, LND, NSW), and
#   city (Phoenix, London, Sydney), comma separated
# - sometimes the region/state is empty
# - sometimes there is no city
# - sometimes multiple cities are appended, comma separated or pipe separated
#   - sometimes a state or state code is appended to the extra cities, with or without a comma
# - some carry added details like "See the Requirements section for areas and locations available", "partially in the UK"
# - at least one appears to give a zip code instead of a city, and another "All"
# - inconsistent spacing and capitalization patterns

Number unique: 3105
['AE, , ' 'AE, , Abudhabi' 'AE, , Dubai' 'AE, , Media City | Dubai'
 'AE, AZ, ' 'AE, AZ, Abudhabi' 'AE, DU, ' 'AE, DU, Dubai'
 'AE, DU, Dubai Internet City' 'AE, DU, Dubayy' 'AL, 11, ' 'AM, , Yerevan'
 'AM, ER, Yerevan' 'AR, , ' 'AT, , ' 'AT, , Salzburg' 'AT, 5, '
 'AT, 5, Salzburg' 'AT, 9, ' 'AT, 9, Vienna' 'AU' 'AU, , '
 'AU, , Australia' 'AU, , Australia ' 'AU, , Melbourne' 'AU, , Sydney'
 'AU, , Work from home' 'AU, ACT, ' 'AU, ACT, Canberra' 'AU, NSW, '
 'AU, NSW, 2010' 'AU, NSW, Albury' 'AU, NSW, Artarmon'
 'AU, NSW, Darlinghurst' 'AU, NSW, Newcastle ' 'AU, NSW, North Sydney'
 'AU, NSW, South West Sydney and M5 corridor' 'AU, NSW, Surry Hills'
 'AU, NSW, Sydney' 'AU, NSW, Toronto' 'AU, NSW, sydney' 'AU, NT, '
 'AU, QLD, ' 'AU, QLD, Brisbane' 'AU, QLD, Gold coast' 'AU, SA, Adelaide'
 'AU, TAS, Hobart' 'AU, TAS, Launceston' 'AU, VIC, ' 'AU, VIC, Carlton'
 'AU, VIC, Clayton' 'AU, VIC, Melbourne' 'AU, VIC, Melbourne '
 'AU, VIC, caulfield' 'AU, WA, Como' 'AU, WA, 

In [6]:
# `department`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['department'])
print_unique_values(original_df['department'])
# Observations:
# usually title case
# - several are acronyms
# - sometimes a code, or a code and a name
# - at least one is wrapped in parentheses
# - some refer to locations rather than a department
# - one is the literal " "
# - inconsistent spacing and capitalization

Number unique: 1337
[' ' ' \tCorporate Shared Services' ' Lower Level Management'
 ' Marketing ' ' Moni Technologies' ' R&D' '(Consultant)' '.NET'
 '.net Development' '0' '1221' '130 - Nutritional Yeast Packaging' '1411'
 '20' '20131101' '30517' '3D Art' '4' '49' '5'
 '6 locations in the United States, 3 in Canada and 1 in UK' '640 Labs'
 '@ ecgstudio | process improvement specialists' 'A Techstars Company'
 'ACCOUNTING ' 'ACCT' 'ACES' 'ACT' 'ADMIN' 'ADS' 'AFS12' 'AFS13' 'AFS14'
 'AFS16' 'AFS18' 'AFSAU' 'AFSCC' 'AFSDT' 'AFSDTV' 'AFSHAR' 'AFSHO'
 'AFSHON' 'AFSHOT' 'AFSMCA' 'AFSSA' 'AGENCY CONFIDENTIAL' 'ALM Practice'
 'AMG' 'AMHS' 'AML' 'ANDROIDPIT' 'APP MEDIA' 'AR' 'AX 20140308/09 (2)'
 'Aberdeen ' 'Academic' 'Account' 'Account Handling' 'Account Management'
 'Account team' 'Account/finance' 'Accountant' 'Accounting'
 'Accounting & Finance' 'Accounting / Finance' 'Accounting and Finance'
 'Accounting/Finance' 'Accounting/Payroll' 'Accounts'
 'Acquisitions and Divesments' 'Ad Sales' 'Ad

In [7]:
# `salary_range`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['salary_range'])
print_unique_values(original_df['salary_range'])
# Observations:
# usually two numbers separated by a hyphen
# - some read as day-month or month-day (e.g. '10-Nov', '10-Oct') rather than numbers;
#   presumably numeric ranges mangled by spreadsheet date auto-formatting — TODO confirm
# - some have identical lower and upper bounds (e.g. '0-0', '1000-1000')
# TODO: some values are 20 and below; inspect those job listings to interpret the unit context

Number unique: 874
['0-0' '0-1' '0-1000' '0-100000' '0-110406' '0-115' '0-120000' '0-1200000'
 '0-12500' '0-130000' '0-15000' '0-150000' '0-16000' '0-180000' '0-2000'
 '0-20000' '0-24000' '0-25000' '0-268' '0-30000' '0-32000' '0-34300'
 '0-35000' '0-38000' '0-43500' '0-45000' '0-48000' '0-50000' '0-60000'
 '0-65000' '0-70000' '0-90000' '0-92000' '0-9360000' '0-95000' '10-Nov'
 '10-Oct' '100-120' '100-150' '100-200' '1000-1000' '1000-10000'
 '1000-1100' '1000-1400' '1000-1500' '1000-2000' '1000-20000' '1000-26000'
 '1000-3000' '1000-4000' '1000-5000' '1000-6000' '10000-10000'
 '10000-100000' '10000-12000' '10000-120000' '10000-14000' '10000-15000'
 '10000-16000' '10000-18000' '10000-20000' '10000-22000' '10000-25000'
 '10000-250000' '10000-30000' '100000-110000' '100000-115000'
 '100000-117000' '100000-120000' '100000-125000' '100000-130000'
 '100000-150000' '100000-160000' '100000-170000' '100000-180000'
 '100000-200000' '100000-240000' '100000-250000' '100000-300000'
 '100000-400000' 

In [8]:
# `company_profile`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['company_profile'])
print_unique_values(original_df['company_profile'])
# Observations:
# usually multi-sentence descriptions
# - some are short, like "Check us out"
# - newlines appear to have been removed without replacement in places, so a
#   heading's last word runs into the first word of the line below
# - many languages

Number unique: 1709
[' Value Added Team of Creative ProfessionalsNetConstructor is a San Diego based company Co-Founded by Christian Hochfilzer, Leo Baghdassarian and Kris Fredrickson and is composed of a tightly-knit group of talented and experienced professionals in the realm of marketing and media development. At the heart of NetConstructor is the simple belief that results speak for themselves. We understand that regardless of how aesthetically pleasing an advertisement or website may be, or how much theoretical sense the technique makes, unless the campaign produces the desired results, it has failed.Our Mission:At NetConstructor, we pride ourselves in utilizing technology to develop innovative ROI-focused marketing and media solutions that help our clients embrace the constantly evolving business place. We aim to build and nurture mutually beneficial partnerships with our clients and take an immense amount of pride in the work we perform and campaigns we launch on behalf of each 

In [9]:
# `description`: distinct non-null count only (values are not listed).
print_number_unique(original_df['description'])
# TODO: only one description is NA; look at that row for context

Number unique: 14801


In [10]:
# `requirements`: distinct non-null count only (values are not listed).
print_number_unique(original_df['requirements'])

Number unique: 11967


In [11]:
# `benefits`: distinct non-null count only (values are not listed).
print_number_unique(original_df['benefits'])

Number unique: 6204


In [12]:
# `employment_type`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['employment_type'])
print_unique_values(original_df['employment_type'])
# Observations:
# - 'Other' is an explicit category, separate from the missing (NaN) entries

Number unique: 5
['Contract' 'Full-time' 'Other' 'Part-time' 'Temporary']


In [13]:
# `required_experience`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['required_experience'])
print_unique_values(original_df['required_experience'])
# Observations:
# - 'Not Applicable' is an explicit category, separate from the missing (NaN) entries

Number unique: 7
['Associate' 'Director' 'Entry level' 'Executive' 'Internship'
 'Mid-Senior level' 'Not Applicable']


In [14]:
# `required_education`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['required_education'])
print_unique_values(original_df['required_education'])
# Observations:
# - 'Unspecified' is an explicit category, separate from the missing (NaN) entries
# - three 'Vocational' variants: 'Vocational', 'Vocational - Degree', 'Vocational - HS Diploma'

Number unique: 13
['Associate Degree' "Bachelor's Degree" 'Certification' 'Doctorate'
 'High School or equivalent' "Master's Degree" 'Professional'
 'Some College Coursework Completed' 'Some High School Coursework'
 'Unspecified' 'Vocational' 'Vocational - Degree'
 'Vocational - HS Diploma']


In [15]:
# `industry`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['industry'])
print_unique_values(original_df['industry'])
# Observations:
# consistent title case

Number unique: 131
['Accounting' 'Airlines/Aviation' 'Alternative Dispute Resolution'
 'Animation' 'Apparel & Fashion' 'Architecture & Planning' 'Automotive'
 'Aviation & Aerospace' 'Banking' 'Biotechnology' 'Broadcast Media'
 'Building Materials' 'Business Supplies and Equipment' 'Capital Markets'
 'Chemicals' 'Civic & Social Organization' 'Civil Engineering'
 'Commercial Real Estate' 'Computer & Network Security' 'Computer Games'
 'Computer Hardware' 'Computer Networking' 'Computer Software'
 'Construction' 'Consumer Electronics' 'Consumer Goods'
 'Consumer Services' 'Cosmetics' 'Defense & Space' 'Design' 'E-Learning'
 'Education Management' 'Electrical/Electronic Manufacturing'
 'Entertainment' 'Environmental Services' 'Events Services'
 'Executive Office' 'Facilities Services' 'Farming' 'Financial Services'
 'Fishery' 'Food & Beverages' 'Food Production' 'Fund-Raising' 'Furniture'
 'Gambling & Casinos' 'Government Administration' 'Government Relations'
 'Graphic Design' 'Health, We

In [16]:
# `function`: distinct non-null count, then the sorted distinct values.
print_number_unique(original_df['function'])
print_unique_values(original_df['function'])
# Observations:
# - 'Other' is an explicit category, separate from the missing (NaN) entries

Number unique: 37
['Accounting/Auditing' 'Administrative' 'Advertising' 'Art/Creative'
 'Business Analyst' 'Business Development' 'Consulting' 'Customer Service'
 'Data Analyst' 'Design' 'Distribution' 'Education' 'Engineering'
 'Finance' 'Financial Analyst' 'General Business' 'Health Care Provider'
 'Human Resources' 'Information Technology' 'Legal' 'Management'
 'Manufacturing' 'Marketing' 'Other' 'Product Management' 'Production'
 'Project Management' 'Public Relations' 'Purchasing' 'Quality Assurance'
 'Research' 'Sales' 'Science' 'Strategy/Planning' 'Supply Chain'
 'Training' 'Writing/Editing']


## Cleanup, Transform, Export

In [17]:
# TODO: use a module for cleanup and/or transform

In [18]:
# TODO: export to CSV