In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the job postings, decoding the file as ISO-8859-1 (Latin-1).
# NOTE(review): text shown later in this notebook contains sequences such as
# "â" and "Â", which may mean the file is really UTF-8 — confirm the encoding.
jobs_data = pd.read_csv("Job_Frauds.csv", encoding="iso-8859-1")
jobs_data.shape

(17880, 16)

In [3]:
jobs_data.columns

Index(['Job Title', 'Job Location', 'Department', 'Range_of_Salary', 'Profile',
       'Job_Description', 'Requirements', 'Job_Benefits', 'Telecomunication',
       'Comnpany_Logo', 'Type_of_Employment', 'Experience', 'Qualification',
       'Type_of_Industry', 'Operations', 'Fraudulent'],
      dtype='object')

In [4]:
# Collect the names of all columns whose dtype is not "object"
numerical_columns = [
    column for column, dtype in jobs_data.dtypes.items() if dtype != "object"
]
numerical_columns

['Telecomunication', 'Comnpany_Logo', 'Fraudulent']

We can see that there are only two numerical columns besides the target "Fraudulent" column.

In [5]:
# Correlation of every non-object column with the target column
numerical_df = jobs_data.loc[:, numerical_columns]
numerical_df.corr().loc[:, "Fraudulent"]

Telecomunication    0.034523
Comnpany_Logo      -0.261971
Fraudulent          1.000000
Name: Fraudulent, dtype: float64

There is a moderate negative correlation (about -0.26) between the "Comnpany_Logo" feature and the "Fraudulent" class, whereas the correlation between the "Fraudulent" class and the "Telecomunication" feature is very low (about 0.03).

In [6]:
jobs_data.isnull().sum()

Job Title                 0
Job Location            346
Department            11547
Range_of_Salary       15012
Profile                3308
Job_Description           1
Requirements           2695
Job_Benefits           7210
Telecomunication          0
Comnpany_Logo             0
Type_of_Employment     3471
Experience             7050
Qualification          8105
Type_of_Industry       4903
Operations             6455
Fraudulent                0
dtype: int64

In [7]:
# Drop every column whose fraction of missing values reaches a given threshold
def nullthresh_remover(df, threshold):
    """Keep only the columns whose share of null values is below ``threshold``.

    Args:
        df: DataFrame to filter.
        threshold: Fraction of nulls (0.0-1.0) at which a column is dropped;
            a column is kept only if its null fraction is strictly smaller.

    Returns:
        A new, independent DataFrame holding the retained columns in their
        original order.
    """
    # The mean of the boolean null mask is the per-column fraction of nulls.
    null_fraction = df.isnull().mean()
    keep_mask = (null_fraction < threshold).to_numpy()
    # Return an explicit copy so that assigning columns on the result later
    # does not raise SettingWithCopyWarning or depend on the original frame.
    return df.loc[:, keep_mask].copy()

In [8]:
# Keep only the columns whose fraction of null values is below 60%
NULL_FRACTION_THRESHOLD = 0.6
jobs_data = nullthresh_remover(jobs_data, NULL_FRACTION_THRESHOLD)
jobs_data.columns.tolist()

['Job Title',
 'Job Location',
 'Profile',
 'Job_Description',
 'Requirements',
 'Job_Benefits',
 'Telecomunication',
 'Comnpany_Logo',
 'Type_of_Employment',
 'Experience',
 'Qualification',
 'Type_of_Industry',
 'Operations',
 'Fraudulent']

The list above shows the columns whose fraction of null values is below the 60% threshold ("Department" and "Range_of_Salary" were dropped), together with the target "Fraudulent" column.
Note that we have 13 features left.

In [9]:
jobs_data.Fraudulent.value_counts()

0    17014
1      866
Name: Fraudulent, dtype: int64

The target column is very imbalanced: the fraudulent class (866 postings) is roughly 20 times smaller than the non-fraudulent class (17,014 postings). We will therefore need an evaluation metric that is robust to class imbalance (for example precision, recall or F1-score rather than plain accuracy).

In [10]:
jobs_data

Unnamed: 0,Job Title,Job Location,Profile,Job_Description,Requirements,Job_Benefits,Telecomunication,Comnpany_Logo,Type_of_Employment,Experience,Qualification,Type_of_Industry,Operations,Fraudulent
0,Marketing Intern,"US, NY, New York","We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,Other,Internship,,,Marketing,0
1,Customer Service - Cloud Video Production,"NZ, , Auckland","90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,Commissioning Machinery Assistant (CMA),"US, IA, Wever",Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,,,,,,0
3,Account Executive - Washington DC,"US, DC, Washington",Our passion for improving quality of life thro...,THE COMPANY: ESRI â Environmental Systems Re...,"EDUCATION:Â Bachelorâs or Masterâs in GIS,...",Our culture is anything but corporateâwe hav...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,Bill Review Manager,"US, FL, Fort Worth",SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,Account Director - Distribution,"CA, ON, Toronto",Vend is looking for some awesome new talent to...,Just in case this is the first time youâve v...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,Full-time,Mid-Senior level,,Computer Software,Sales,0
17876,Payroll Accountant,"US, PA, Philadelphia",WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting-Â Desire to have ...,Health &amp; WellnessMedical planPrescription ...,0,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,Project Cost Control Staff Engineer - Cost Con...,"US, TX, Houston",We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,Full-time,,,,,0
17878,Graphic Designer,"NG, LA, Lagos",,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [11]:
#Importing the necessary libraries
import re
import string
import nltk
from nltk.corpus import stopwords

# NOTE: this rebinds the name `stopwords` from the imported nltk corpus reader
# to the English stop words returned by `.words("english")`; `replace2` below
# reads this module-level name.
stopwords = stopwords.words("english")

In [12]:
def clean_entry1(entry, tokenizer, min_token_len=3):
    """Normalise a short text field and drop very short tokens.

    Runs of whitespace are collapsed, ASCII punctuation is removed, the text
    is tokenized and the surviving tokens are joined back with single spaces.
    Letter case is left untouched.

    Args:
        entry: Value to clean. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN) produce an empty string
            rather than the literal tokens "None"/"nan".
        tokenizer: Callable that takes a string and returns an iterable of
            string tokens.
        min_token_len: Minimum number of characters a token must have to be
            kept. The default of 3 keeps tokens longer than two characters.

    Returns:
        The cleaned entry as a single space-separated string.
    """
    # NaN is the only float that is not equal to itself.
    if entry is None or (isinstance(entry, float) and entry != entry):
        return ""

    text = re.sub(r"\s+", " ", str(entry))  # Collapse whitespace runs
    # Delete every ASCII punctuation character
    text = text.translate(str.maketrans("", "", string.punctuation))

    return " ".join(
        token for token in tokenizer(text) if len(token) >= min_token_len
    )

def replace1(row):
    """Clean every entry of a pandas Series with ``clean_entry1``.

    The notebook calls this through ``DataFrame.apply`` without an ``axis``
    argument, so despite its name ``row`` receives one whole column at a time.

    Args:
        row: pandas Series of raw text entries.

    Returns:
        A new Series with the same index holding the cleaned entries; the
        input Series is not modified.
    """
    # Series.map works for any index. Writing back with ``row[i]`` using an
    # enumerate position is label-based and only correct for a default
    # 0..n-1 index.
    return row.map(lambda entry: clean_entry1(entry, nltk.word_tokenize))

In [13]:
# Clean the short free-text columns (punctuation and short tokens removed)
short_text_columns = ["Job Title", "Job Location"]
jobs_data[short_text_columns] = jobs_data[short_text_columns].copy().apply(replace1)
jobs_data.head(2)

Unnamed: 0,Job Title,Job Location,Profile,Job_Description,Requirements,Job_Benefits,Telecomunication,Comnpany_Logo,Type_of_Employment,Experience,Qualification,Type_of_Industry,Operations,Fraudulent
0,Marketing Intern,New York,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,Other,Internship,,,Marketing,0
1,Customer Service Cloud Video Production,Auckland,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [14]:
def clean_entry2(entry, tokenizer, stopwords):
    """Normalise a long text field and remove stop words and short tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    digit is replaced by a space, whitespace runs are collapsed, and the
    result is tokenized. Stop words and tokens of two characters or fewer are
    discarded.

    Args:
        entry: Value to clean. Non-string values are converted with ``str``.
            Missing values (``None`` or a float NaN) produce an empty string
            rather than the literal tokens "none"/"nan".
        tokenizer: Callable that takes a string and returns an iterable of
            string tokens.
        stopwords: Collection of lower-case words to discard.

    Returns:
        The cleaned entry as a single space-separated string.
    """
    # NaN is the only float that is not equal to itself.
    if entry is None or (isinstance(entry, float) and entry != entry):
        return ""

    text = str(entry).lower()  # Lowercase words
    text = re.sub("[^a-zA-Z0-9]", " ", text)  # Replace non-alphanumerics
    text = re.sub(r"\s+", " ", text).rstrip()  # Collapse whitespace runs

    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    kept = [
        token
        for token in tokenizer(text)
        if token not in stopword_set and len(token) > 2
    ]
    return " ".join(kept)

def replace2(row):
    """Clean every entry of a pandas Series with ``clean_entry2``.

    The notebook calls this through ``DataFrame.apply`` without an ``axis``
    argument, so despite its name ``row`` receives one whole column at a time.
    The module-level ``stopwords`` collection is used as the stop-word list.

    Args:
        row: pandas Series of raw text entries.

    Returns:
        A new Series with the same index holding the cleaned entries; the
        input Series is not modified.
    """
    # Series.map works for any index. Writing back with ``row[i]`` using an
    # enumerate position is label-based and only correct for a default
    # 0..n-1 index.
    return row.map(
        lambda entry: clean_entry2(entry, nltk.word_tokenize, stopwords)
    )

In [15]:
# Clean the long free-text columns (lower-cased, stop words removed)
long_text_columns = ["Profile", "Job_Description", "Requirements", "Job_Benefits"]
jobs_data[long_text_columns] = jobs_data[long_text_columns].copy().apply(replace2)
jobs_data.head(2)

Unnamed: 0,Job Title,Job Location,Profile,Job_Description,Requirements,Job_Benefits,Telecomunication,Comnpany_Logo,Type_of_Employment,Experience,Qualification,Type_of_Industry,Operations,Fraudulent
0,Marketing Intern,New York,food52 created groundbreaking award winning co...,food52 fast growing james beard award winning ...,experience content management systems major pl...,,0,1,Other,Internship,,,Marketing,0
1,Customer Service Cloud Video Production,Auckland,seconds worlds cloud video production service ...,organised focused vibrant awesome passion cust...,expect key responsibility communicate client s...,get usthrough part seconds team gain experienc...,0,1,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0


In [16]:
jobs_data.Type_of_Employment.value_counts()

Full-time    11620
Contract      1524
Part-time      797
Temporary      241
Other          227
Name: Type_of_Employment, dtype: int64

We should use a one-hot encoder for the above

In [17]:
jobs_data.Experience.value_counts()

Mid-Senior level    3809
Entry level         2697
Associate           2297
Not Applicable      1116
Director             389
Internship           381
Executive            141
Name: Experience, dtype: int64

We should use an ordinal encoder for the above

In [18]:
jobs_data.Qualification.value_counts()

Bachelor's Degree                    5145
High School or equivalent            2080
Unspecified                          1397
Master's Degree                       416
Associate Degree                      274
Certification                         170
Some College Coursework Completed     102
Professional                           74
Vocational                             49
Some High School Coursework            27
Doctorate                              26
Vocational - HS Diploma                 9
Vocational - Degree                     6
Name: Qualification, dtype: int64

We should use a one-hot encoder for the above

In [20]:
jobs_data.Type_of_Industry.value_counts()

Information Technology and Services    1734
Computer Software                      1376
Internet                               1062
Marketing and Advertising               828
Education Management                    822
                                       ... 
Shipbuilding                              1
Sporting Goods                            1
Museums and Institutions                  1
Wine and Spirits                          1
Ranching                                  1
Name: Type_of_Industry, Length: 131, dtype: int64

In [21]:
jobs_data.Operations.value_counts()

Information Technology    1749
Sales                     1468
Engineering               1348
Customer Service          1229
Marketing                  830
Administrative             630
Design                     340
Health Care Provider       338
Other                      325
Education                  325
Management                 317
Business Development       228
Accounting/Auditing        212
Human Resources            205
Project Management         183
Finance                    172
Consulting                 144
Writing/Editing            132
Art/Creative               132
Production                 116
Product Management         114
Quality Assurance          111
Advertising                 90
Business Analyst            84
Data Analyst                82
Public Relations            76
Manufacturing               74
General Business            68
Research                    50
Legal                       47
Strategy/Planning           46
Training                    38
Supply C