In [288]:
# Import and read the data
import pandas as pd
import numpy as np
# Load the raw postings table; low_memory=False makes pandas read the file in one pass
# instead of chunk-wise, so column dtypes are inferred from the whole column.
raw_csv_path = 'fake_jobs.csv'
df = pd.read_csv(raw_csv_path, low_memory=False)

In [289]:
# (rows, columns) of the raw table as loaded from the CSV.
df.shape

(17880, 18)

In [290]:
# Report the share of empty cells per column (0.0 = fully populated, 1.0 = entirely empty).
# Columns above 40% missing: salary_range, department, required_education and benefits;
# required_experience (~39%) and function (~36%) sit just under that threshold.
# Plan: drop department, salary_range and benefits and try to repair the rest.
# NOTE(review): the drop cell that follows also removes company_profile, which this plan does not mention.
# Fields of interest (original note: "Loi"): location, job title, job description.
missing_share = df.isnull().mean()
print("Missing values distribution: ", missing_share, sep="\n")

Missing values distribution: 
job_id                 0.000000
title                  0.000000
location               0.019351
department             0.645805
salary_range           0.839597
company_profile        0.185011
description            0.000056
requirements           0.150727
benefits               0.403244
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.194128
required_experience    0.394295
required_education     0.453300
industry               0.274217
function               0.361018
fraudulent             0.000000
dtype: float64


In [291]:
# Remove the columns judged too sparse to repair.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps this cell
# idempotent: re-running it after the columns are already gone no longer raises KeyError.
# NOTE(review): company_profile is only ~18.5% empty in the missing-value report above;
# confirm that dropping it alongside the three sparse columns is intended.
df = df.drop(
    columns=['department', 'salary_range', 'benefits', 'company_profile'],
    errors='ignore',
)

In [292]:
# Shape check: the column count should now be 4 lower, confirming the columns have been dropped.
df.shape

(17880, 14)

In [293]:
# Share of missing values per remaining column after the column drop.
df.isna().mean()

job_id                 0.000000
title                  0.000000
location               0.019351
description            0.000056
requirements           0.150727
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.194128
required_experience    0.394295
required_education     0.453300
industry               0.274217
function               0.361018
fraudulent             0.000000
dtype: float64

In [294]:
# Select postings where education, experience AND function are all missing at once:
# with none of the three present there is nothing to infer the others from.
all_missing_columns = ['required_education', 'required_experience', 'function']
rows = df[df[all_missing_columns].isna().all(axis=1)]
rows.shape

(4587, 14)

In [295]:
# Build the working frame by removing the unrecoverable rows; `df` itself is left untouched.
drop_rows = rows.index.tolist()
df_clean = df.drop(index=drop_rows)

In [296]:
# (rows, columns) after removing the rows that had all three key fields missing.
df_clean.shape

(13293, 14)

In [297]:
# Share of missing values per column once the unrecoverable rows are gone.
df_clean.isna().mean()

job_id                 0.000000
title                  0.000000
location               0.008125
description            0.000000
requirements           0.077560
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.043858
required_experience    0.185285
required_education     0.264651
industry               0.093508
function               0.140525
fraudulent             0.000000
dtype: float64

In [298]:
# Frequency of each experience level (value_counts skips missing values by default).
df_clean['required_experience'].value_counts()

Mid-Senior level    3809
Entry level         2697
Associate           2297
Not Applicable      1116
Director             389
Internship           381
Executive            141
Name: required_experience, dtype: int64

In [299]:
# Education requirements observed among 'Mid-Senior level' postings, most frequent first.
mid_senior_rows = df_clean['required_experience'] == 'Mid-Senior level'
df_clean.loc[mid_senior_rows, 'required_education'].value_counts()

Bachelor's Degree                    1994
Unspecified                           367
Master's Degree                       158
High School or equivalent             123
Certification                          44
Associate Degree                       43
Professional                           30
Vocational                             14
Doctorate                              11
Some College Coursework Completed       5
Vocational - Degree                     3
Some High School Coursework             1
Vocational - HS Diploma                 1
Name: required_education, dtype: int64

In [300]:
# Distinct experience levels in order of first appearance; NaN is included as its own entry.
df_clean['required_experience'].unique()

array(['Internship', 'Not Applicable', 'Mid-Senior level', 'Associate',
       'Entry level', 'Executive', nan, 'Director'], dtype=object)

In [301]:
# Distinct experience levels, in order of first appearance, without the missing-value entry.
# dropna() replaces the former `lst.pop(6)`, which assumed NaN was the 7th distinct value:
# with any other data order it would have removed a real level and left NaN in the list.
lst = list(df_clean['required_experience'].dropna().unique())
lst

['Internship',
 'Not Applicable',
 'Mid-Senior level',
 'Associate',
 'Entry level',
 'Executive',
 'Director']

In [302]:
# Education -> count mapping for 'Mid-Senior level' postings; value_counts sorts by
# descending count, so the first key is the most frequent education requirement.
mid_senior_rows = df_clean['required_experience'] == 'Mid-Senior level'
dic = dict(df_clean.loc[mid_senior_rows, 'required_education'].value_counts())
dic

{"Bachelor's Degree": 1994,
 'Unspecified': 367,
 "Master's Degree": 158,
 'High School or equivalent': 123,
 'Certification': 44,
 'Associate Degree': 43,
 'Professional': 30,
 'Vocational': 14,
 'Doctorate': 11,
 'Some College Coursework Completed': 5,
 'Vocational - Degree': 3,
 'Some High School Coursework': 1,
 'Vocational - HS Diploma': 1}

In [303]:
# First key of the mapping, i.e. the most frequent education value (dicts keep insertion order).
next(iter(dic))

"Bachelor's Degree"

In [304]:
# For every experience level, print the education requirement that occurs most often with it.
for experience_level in lst:
    level_rows = df_clean['required_experience'] == experience_level
    dic = dict(df_clean.loc[level_rows, 'required_education'].value_counts())
    print((experience_level, next(iter(dic))))

('Internship', "Bachelor's Degree")
('Not Applicable', 'High School or equivalent')
('Mid-Senior level', "Bachelor's Degree")
('Associate', "Bachelor's Degree")
('Entry level', 'High School or equivalent')
('Executive', "Bachelor's Degree")
('Director', "Bachelor's Degree")


In [305]:
# One boolean mask per experience level: rows at that level whose education is missing.
# The masks are snapshots taken now; they are not updated when the column is filled later.
education_missing = df_clean['required_education'].isna()
experience_col = df_clean["required_experience"]

filter1 = (experience_col == "Internship") & education_missing
filter2 = (experience_col == 'Not Applicable') & education_missing
filter3 = (experience_col == 'Mid-Senior level') & education_missing
filter4 = (experience_col == 'Associate') & education_missing
filter5 = (experience_col == 'Entry level') & education_missing
filter6 = (experience_col == 'Executive') & education_missing
filter7 = (experience_col == 'Director') & education_missing

In [306]:
# Preview internship postings whose education requirement is still missing.
df_clean.loc[filter1].head()

Unnamed: 0,job_id,title,location,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,Other,Internship,,,Marketing,0
562,563,Production Intern,"US, NY, Brooklyn",Adventure Cow is looking for a production inte...,,0,0,0,Temporary,Internship,,,Production,0
564,565,Wordpress Designer and Expert for Startup | St...,"US, ,",#URL_ab309fb672a2b26317bd303c09c3c6762986d45c2...,,0,1,0,Part-time,Internship,,,Information Technology,0
938,939,Provisions eCommerce Intern,,Do you obsess over great products -- both styl...,You may be a good fit for this position if you...,0,1,1,,Internship,,,General Business,0
1020,1021,Marketing Intern - Paid Position,"GB, LND, London",Fabrily is on the hunt for a brilliant marketi...,"* Self motivated, proactive, well organized an...",0,1,1,Full-time,Internship,,Internet,,0


In [307]:
# Fill the missing education of internship postings with the most frequent value for that level.
# filter1 was computed before the fill, so it still selects the rows that were just repaired.
df_clean['required_education'] = df_clean['required_education'].mask(filter1, "Bachelor's Degree")
df_clean.loc[filter1].head()

Unnamed: 0,job_id,title,location,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,Other,Internship,Bachelor's Degree,,Marketing,0
562,563,Production Intern,"US, NY, Brooklyn",Adventure Cow is looking for a production inte...,,0,0,0,Temporary,Internship,Bachelor's Degree,,Production,0
564,565,Wordpress Designer and Expert for Startup | St...,"US, ,",#URL_ab309fb672a2b26317bd303c09c3c6762986d45c2...,,0,1,0,Part-time,Internship,Bachelor's Degree,,Information Technology,0
938,939,Provisions eCommerce Intern,,Do you obsess over great products -- both styl...,You may be a good fit for this position if you...,0,1,1,,Internship,Bachelor's Degree,,General Business,0
1020,1021,Marketing Intern - Paid Position,"GB, LND, London",Fabrily is on the hunt for a brilliant marketi...,"* Self motivated, proactive, well organized an...",0,1,1,Full-time,Internship,Bachelor's Degree,Internet,,0


In [308]:
# Most frequent education per experience level, as printed by the earlier loop:
#   Internship       -> Bachelor's Degree   (already applied via filter1)
#   Not Applicable   -> High School or equivalent
#   Mid-Senior level -> Bachelor's Degree
#   Associate        -> Bachelor's Degree
#   Entry level      -> High School or equivalent
#   Executive        -> Bachelor's Degree
#   Director         -> Bachelor's Degree
education_fills = [
    (filter2, 'High School or equivalent'),
    (filter3, "Bachelor's Degree"),
    (filter4, "Bachelor's Degree"),
    (filter5, 'High School or equivalent'),
    (filter6, "Bachelor's Degree"),
    (filter7, "Bachelor's Degree"),
]
# Apply each fill only where its mask is set; all other cells keep their current value.
for missing_mask, fill_value in education_fills:
    df_clean['required_education'] = np.where(missing_mask, fill_value, df_clean['required_education'])

In [309]:
# Missing share per column after imputing required_education from required_experience.
df_clean.isna().mean()

job_id                 0.000000
title                  0.000000
location               0.008125
description            0.000000
requirements           0.077560
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.043858
required_experience    0.185285
required_education     0.090198
industry               0.093508
function               0.140525
fraudulent             0.000000
dtype: float64

In [310]:
# Distinct education values, in order of first appearance, without the missing-value entry.
# dropna() replaces the former `lst2.pop(4)`, which assumed NaN was the 5th distinct value:
# with any other data order it would have removed a real category and left NaN in the list.
lst2 = list(df_clean['required_education'].dropna().unique())
lst2

["Bachelor's Degree",
 'High School or equivalent',
 "Master's Degree",
 'Unspecified',
 'Some College Coursework Completed',
 'Vocational',
 'Certification',
 'Associate Degree',
 'Professional',
 'Doctorate',
 'Some High School Coursework',
 'Vocational - Degree',
 'Vocational - HS Diploma']

In [311]:
# Experience level -> count mapping for 'High School or equivalent' postings, most frequent first.
high_school_rows = df_clean['required_education'] == 'High School or equivalent'
dic2 = dict(df_clean.loc[high_school_rows, 'required_experience'].value_counts())
dic2

{'Entry level': 1549,
 'Not Applicable': 590,
 'Associate': 243,
 'Mid-Senior level': 123,
 'Internship': 39,
 'Director': 2,
 'Executive': 1}

In [312]:
# For every education value, print the experience level that occurs most often with it.
for education_level in lst2:
    level_rows = df_clean['required_education'] == education_level
    dic2 = dict(df_clean.loc[level_rows, 'required_experience'].value_counts())
    print((education_level, next(iter(dic2))))

("Bachelor's Degree", 'Mid-Senior level')
('High School or equivalent', 'Entry level')
("Master's Degree", 'Mid-Senior level')
('Unspecified', 'Mid-Senior level')
('Some College Coursework Completed', 'Internship')
('Vocational', 'Associate')
('Certification', 'Mid-Senior level')
('Associate Degree', 'Associate')
('Professional', 'Mid-Senior level')
('Doctorate', 'Mid-Senior level')
('Some High School Coursework', 'Entry level')
('Vocational - Degree', 'Mid-Senior level')
('Vocational - HS Diploma', 'Entry level')


In [313]:
# One boolean mask per education value: rows with that education whose experience is missing.
# required_education here already contains the values imputed in the previous step.
experience_missing = df_clean['required_experience'].isna()
education_col = df_clean["required_education"]

filt1 = (education_col == "Bachelor's Degree") & experience_missing
filt2 = (education_col == 'High School or equivalent') & experience_missing
filt3 = (education_col == "Master's Degree") & experience_missing
filt4 = (education_col == 'Unspecified') & experience_missing
filt5 = (education_col == 'Some College Coursework Completed') & experience_missing
filt6 = (education_col == 'Vocational') & experience_missing
filt7 = (education_col == 'Certification') & experience_missing
filt8 = (education_col == 'Associate Degree') & experience_missing
filt9 = (education_col == 'Professional') & experience_missing
filt10 = (education_col == 'Doctorate') & experience_missing
filt11 = (education_col == 'Some High School Coursework') & experience_missing
filt12 = (education_col == 'Vocational - Degree') & experience_missing
filt13 = (education_col == 'Vocational - HS Diploma') & experience_missing

In [314]:
# Reference table copied from the previous cell's printed output:
# (education value, most frequent experience level for that education).
# These are bare tuple expressions: they assign nothing, and only the last one is echoed
# as the cell output. The imputation cell that follows hard-codes the same pairs.
("Bachelor's Degree", 'Mid-Senior level')
('High School or equivalent', 'Entry level')
("Master's Degree", 'Mid-Senior level')
('Unspecified', 'Mid-Senior level')
('Some College Coursework Completed', 'Internship')
('Vocational', 'Associate')
('Certification', 'Mid-Senior level')
('Associate Degree', 'Associate')
('Professional', 'Mid-Senior level')
('Doctorate', 'Mid-Senior level')
('Some High School Coursework', 'Entry level')
('Vocational - Degree', 'Mid-Senior level')
('Vocational - HS Diploma', 'Entry level')

('Vocational - HS Diploma', 'Entry level')

In [315]:
# Fill missing experience levels with the most frequent level for the row's education value.
# Each pair is (mask of rows to repair, experience level to write).
experience_fills = [
    (filt1, 'Mid-Senior level'),
    (filt2, 'Entry level'),
    (filt3, 'Mid-Senior level'),
    (filt4, 'Mid-Senior level'),
    (filt5, 'Internship'),
    (filt6, 'Associate'),
    (filt7, 'Mid-Senior level'),
    (filt8, 'Associate'),
    (filt9, 'Mid-Senior level'),
    (filt10, 'Mid-Senior level'),
    (filt11, 'Entry level'),
    (filt12, 'Mid-Senior level'),
    (filt13, 'Entry level'),
]
# Rows outside a mask keep whatever value they currently hold.
for missing_mask, fill_value in experience_fills:
    df_clean['required_experience'] = np.where(missing_mask, fill_value, df_clean['required_experience'])

In [316]:
# Re-inspect the internship rows selected by filter1 (a mask computed before the fills).
df_clean.loc[filter1].head()

Unnamed: 0,job_id,title,location,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,Other,Internship,Bachelor's Degree,,Marketing,0
562,563,Production Intern,"US, NY, Brooklyn",Adventure Cow is looking for a production inte...,,0,0,0,Temporary,Internship,Bachelor's Degree,,Production,0
564,565,Wordpress Designer and Expert for Startup | St...,"US, ,",#URL_ab309fb672a2b26317bd303c09c3c6762986d45c2...,,0,1,0,Part-time,Internship,Bachelor's Degree,,Information Technology,0
938,939,Provisions eCommerce Intern,,Do you obsess over great products -- both styl...,You may be a good fit for this position if you...,0,1,1,,Internship,Bachelor's Degree,,General Business,0
1020,1021,Marketing Intern - Paid Position,"GB, LND, London",Fabrily is on the hunt for a brilliant marketi...,"* Self motivated, proactive, well organized an...",0,1,1,Full-time,Internship,Bachelor's Degree,Internet,,0


In [317]:
# Missing share per column after imputing required_experience from required_education.
df_clean.isna().mean()

job_id                 0.000000
title                  0.000000
location               0.008125
description            0.000000
requirements           0.077560
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.043858
required_experience    0.090198
required_education     0.090198
industry               0.093508
function               0.140525
fraudulent             0.000000
dtype: float64

In [318]:
# Keyword masks: rows whose `function` is missing and whose title contains the keyword
# (case-sensitive regex search). The trailing comment names the function each mask maps to.
# All masks are computed here, before any fill, so a title can satisfy several of them.
function_missing = df_clean['function'].isna()
title_text = df_clean['title'].str

teacher = title_text.contains(r'Teacher') & function_missing  # Education
cs = title_text.contains(r'Customer Service') & function_missing  # Customer Service
dev = title_text.contains(r'Developer') & function_missing  # Engineering
eng = title_text.contains(r'Engineer') & function_missing  # Engineering
science = title_text.contains(r'Data Scientist') & function_missing  # Science
design = title_text.contains(r'Design') & function_missing  # Design
sales = title_text.contains(r'Sales') & function_missing  # Sales
mark = title_text.contains(r'Marketing') & function_missing  # Marketing
am = title_text.contains(r'Account Manager') & function_missing  # Sales
ae = title_text.contains(r'Account Executive') & function_missing  # Sales
acc = title_text.contains(r'Accountant') & function_missing  # Finance

In [319]:
# Write the inferred function for each keyword mask, in this exact order.
# The masks were all computed up front, so when a title matches several keywords
# the entry applied last in this list determines the final value.
function_fills = [
    (teacher, 'Education'),
    (cs, 'Customer Service'),
    (dev, 'Engineering'),
    (eng, 'Engineering'),
    (science, 'Science'),
    (design, 'Design'),
    (sales, 'Sales'),
    (mark, 'Marketing'),
    (am, 'Sales'),
    (ae, 'Sales'),
    (acc, 'Finance'),
]
for keyword_mask, function_name in function_fills:
    df_clean['function'] = np.where(keyword_mask, function_name, df_clean['function'])

In [320]:
# Missing share per column after inferring `function` from title keywords.
df_clean.isna().mean()

job_id                 0.000000
title                  0.000000
location               0.008125
description            0.000000
requirements           0.077560
telecommuting          0.000000
has_company_logo       0.000000
has_questions          0.000000
employment_type        0.043858
required_experience    0.090198
required_education     0.090198
industry               0.093508
function               0.070413
fraudulent             0.000000
dtype: float64

In [321]:
# Persist the cleaned table. to_csv writes the row index by default, so the original
# (now non-contiguous) row labels end up as the first, unnamed column of the file.
df_clean.to_csv(path_or_buf="data_clean.csv")

In [322]:
# Show the first 25 rows of the cleaned frame; gaps in the index are the rows dropped earlier.
df_clean.head(25)

Unnamed: 0,job_id,title,location,description,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,0,1,0,Other,Internship,Bachelor's Degree,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,0,1,0,Full-time,Not Applicable,High School or equivalent,Marketing and Advertising,Customer Service,0
3,4,Account Executive - Washington DC,"US, DC, Washington",THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
6,7,Head of Content (m/f),"DE, BE, Berlin",Your Responsibilities: Manage the English-spea...,Your Know-How: ...,0,1,1,Full-time,Mid-Senior level,Master's Degree,Online Media,Management,0
8,9,HP BSM SME,"US, FL, Pensacola",Implementation/Configuration/Testing/Training ...,MUST BE A US CITIZEN.An active TS/SCI clearanc...,0,1,1,Full-time,Associate,Bachelor's Degree,Information Technology and Services,,0
9,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",The Customer Service Associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,0,1,0,Part-time,Entry level,High School or equivalent,Financial Services,Customer Service,0
10,11,ASP.net Developer Job opportunity at United St...,"US, NJ, Jersey City",Position : #URL_86fd830a95a64e2b30ceed829e63fd...,Position : #URL_86fd830a95a64e2b30ceed829e63fd...,0,0,0,Full-time,Mid-Senior level,Bachelor's Degree,Information Technology and Services,Information Technology,0
12,13,"Applications Developer, Digital","US, CT, Stamford","The Applications Developer, Digital will devel...",Requirements:4 – 5 years’ experience in develo...,0,1,0,Full-time,Associate,Bachelor's Degree,Management Consulting,Information Technology,0
13,14,Installers,"US, FL, Orlando","Event Industry Installers Needed!! (Orlando, F...","Valid driver's license,Somewhat Clean driving ...",0,1,1,Full-time,Not Applicable,Unspecified,Events Services,Other,0
