# ANALYZING FAKE JOB POSTINGS
# BY - KARTIKSE & SHALINIA

In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
pd.set_option("mode.chained_assignment", None)
import numpy as np
import matplotlib.pyplot as plt

# Reading Data

In [2]:
data = pd.read_csv("fake_job_postings.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'fake_job_postings.csv'

In [None]:
data.shape

# Data Cleaning

### Data Cleaning Step 1 - Removing blanks from requirements

In [None]:
data = data[(data["requirements"].notna())]

In [None]:
data.shape

### Data Cleaning Step 2 - Changing blank values to "Not Provided" in all columns

In [None]:
# DataFrame.replace returns a new frame rather than editing in place, so the
# result must be assigned back; otherwise empty / whitespace-only cells are
# never turned into NaN and the following fillna step cannot relabel them.
data = data.replace(r'^\s*$', np.nan, regex=True)

In [None]:
data.fillna('not-provided', inplace=True)

In [None]:
data.shape

### Data Cleaning Step 3 - Removing "Other" from employment_type

In [None]:
data = data[(data.employment_type != "Other")]

In [None]:
data.shape

### Data Cleaning Step 4 - Split location into Country, State, and City

In [None]:
data[["location_country", "location_state_temp"]] = data.location.str.split(",", expand = True, n = 1)

In [None]:
data[["location_state", "location_city"]] = data.location_state_temp.str.split(",", expand = True, n = 1)

In [None]:
data = data.drop("location_state_temp", axis=1)

In [None]:
data = data.drop("location", axis=1)

In [None]:
data.shape

### Data Cleaning Step 5 - Split salary range column into minimum salary and maximum salary

In [None]:
data[["salary_min", "salary_max"]] = data.salary_range.str.split("-", expand = True, n = 1)

In [None]:
data = data.drop("salary_range", axis=1)

### Data Cleaning Step 6 - Replace blanks/not-provided salaries with 0 for min and max

In [None]:
data['salary_min'] = data['salary_min'].replace('not' ,'0')

In [None]:
data['salary_max'] = data['salary_max'].replace('provided' ,'0')

### Data Cleaning Step 7 - Converting string salaries to int

In [None]:
data.salary_min = pd.to_numeric(data.salary_min, errors='coerce')
data.salary_max = pd.to_numeric(data.salary_max, errors='coerce')

### Data Cleaning Step 8 - Remove special characters from strings

### Data Cleaning Step 9 - Remove extra white space characters from start and end of string columns

### Data Cleaning Step 10 - Replace uppercase characters with lowercase characters

In [None]:
text_columns = ["title", "department", "company_profile", "description", "requirements", "benefits", "employment_type", "required_experience", "required_education", "industry", "function", "location_country", "location_state", "location_city"]

# Apply cleaning steps 8-10 to every free-text column.
for col in text_columns:
    data[col] = (
        data[col]
        # Step 8 - Remove special characters from strings.
        # regex=True is passed explicitly: pandas >= 2.0 treats the pattern as
        # a literal string by default, which would leave the text untouched.
        .str.replace(r"[^0-9a-zA-Z]+", " ", regex=True)
        # Step 9 - Remove extra white space characters from start and end of string columns
        .str.strip()
        # Step 10 - Replace uppercase characters with lowercase characters
        .str.lower()
    )

### Data Cleaning Step 11 - Remove numbers from locations - country, state, and city

### Data Cleaning Step 12 - Removing locations which don't have country, state, or city

In [None]:
location_cols = ["location_country", "location_state", "location_city"]
for col in location_cols:
    #Step 11 - Remove numbers from locations - country, state, and city
    # The pattern is a raw string (no invalid "\d" escape) and regex=True is
    # explicit because pandas >= 2.0 matches the pattern literally by default.
    data[col] = data[col].str.replace(r'\d+', '', regex=True)
    #Step 12 - Removing locations which don't have country, state, or city
    data = data[(data[col] != "") & (data[col] != " ")]

In [None]:
data

In [None]:
data.shape

### Data Cleaning Step 14 - Convert Data Type of Real/Fraud from int to Boolean

In [None]:
data["fraudulent"] = data["fraudulent"].astype(bool)

## EDA

### EDA 1 - Finding counts of values for every column

In [None]:
# Frequency table of the values in every column, keyed by column name.
all_cols = list(data.columns.values)
counts = {name: data[name].value_counts() for name in all_cols}

# Print each table under its upper-cased column name, separated by blank lines.
for name, frequencies in counts.items():
    print(name.upper(), end="\n\n")
    print(frequencies, end="\n\n\n")

### EDA 2 - Finding ratio of "not provided" with "provided" for every column

In [None]:
# For every column that contains the "not provided" placeholder, record how
# many rows carry it and how many carry a real value.
# The row total is read from the frame itself so it stays correct whenever the
# cleaning steps keep a different number of rows.
not_provided_counts, provided_counts, total = {}, {}, len(data)
for col, col_counts in counts.items():
    # Membership on a Series tests its index, i.e. the distinct values seen.
    if 'not provided' in col_counts:
        missing = col_counts["not provided"]
        not_provided_counts[col] = missing
        provided_counts[col] = total - missing

In [None]:
plt.bar(range(len(not_provided_counts)), list(not_provided_counts.values()), align='center', color=['maroon'], label="not provided")
plt.bar(range(len(provided_counts)), list(provided_counts.values()), bottom=list(not_provided_counts.values()), align='center', color=['green'], label="provided")
plt.xticks(range(len(not_provided_counts)), list(not_provided_counts.keys()), rotation=90)
plt.legend(bbox_to_anchor=(1, 1))


font1 = {'family':'serif','color':'brown','size':18}

plt.ylabel("Counts", fontdict = font1)
plt.xlabel("Features", fontdict = font1)
plt.show()

### EDA 3 - Analysing features

In [None]:
#Feature analysis

### POST EDA DATA CLEANING

### Data Cleaning Step 13 - After feature analysis, drop unnecessary features - has_company_logo, has_questions

In [None]:
unnecessary_cols = ["has_company_logo", "has_questions"]
for col in unnecessary_cols:
    data = data.drop(col, axis=1)

### EDA 4 - Division based on location_country

In [None]:
i, loc_countries = 0, {}
for c in counts["location_country"].index:
    if c != "not provided":
        loc_countries[c] = counts["location_country"][c]
        i += 1
    if i == 14: break

In [None]:
# Bar chart of the number of postings per country.
# dict views are not sequences, so both the labels and the heights are
# materialised as lists before being handed to matplotlib.
country_names = list(loc_countries.keys())
posting_counts = list(loc_countries.values())
positions = range(len(loc_countries))

plt.bar(positions, posting_counts, align='center', color=['purple'])
plt.xticks(positions, country_names, rotation=90)

plt.ylabel("Counts", fontdict = font1)
plt.xlabel("Countries", fontdict = font1)

plt.show()

### EDA 5 - Finding ratio of Real vs Fraud

In [None]:
# value_counts orders its rows by frequency, not by label, so each slice's
# name is derived from the index value it belongs to instead of assuming that
# the first row is always the "real" class.
fraud_counts = counts["fraudulent"]
fraud_counts.plot.pie(labels=["fraud" if is_fraud else "real" for is_fraud in fraud_counts.index], autopct='%1.1f%%')

### EDA 6 - Minimum of salary_min

In [None]:
minimum_of_salary_min = data["salary_min"].min()

In [None]:
minimum_of_salary_min

### EDA 7 - Maximum of salary_min

In [None]:
maximum_of_salary_min = data["salary_min"].max()

In [None]:
maximum_of_salary_min

### EDA 8 - Mean of salary_min

In [None]:
mean_of_salary_min = data["salary_min"].mean()

In [None]:
mean_of_salary_min

### EDA 9 - Minimum of salary_max

In [None]:
minimum_of_salary_max = data["salary_max"].min()

In [None]:
minimum_of_salary_max

### EDA 10 - Maximum of salary_max

In [None]:
maximum_of_salary_max = data["salary_max"].max()

In [None]:
maximum_of_salary_max

### EDA 11 - Mean of salary_max

In [None]:
mean_of_salary_max = data["salary_max"].mean()

In [None]:
mean_of_salary_max

### EDA 12 - Division of required_experience

In [None]:
data["required_experience"].value_counts().plot(kind="pie", autopct='%1.1f%%')

### EDA 13 - Division of required_education

In [None]:
ax = data["required_education"].value_counts().plot(kind="bar", color="darkblue")
ax.set_xlabel("Required Education", fontdict = font1)
ax.set_ylabel("Counts", fontdict = font1)

### EDA 14 - Division of employment_type

In [None]:
ax = data["employment_type"].value_counts().plot(kind="line", color="cyan")
ax.set_xlabel("Employment Type", fontdict = font1)
ax.set_ylabel("Counts", fontdict = font1)

In [None]:
data.head()


In [None]:
data_combined = pd.DataFrame()

In [None]:
data_combined['text'] = data['title']+' '+data['company_profile']+' '+data['description']+' '+data['requirements']+' '+data['benefits']

In [None]:
data_combined

In [None]:
data_combined['fraudulent'] = data['fraudulent']

In [None]:
data_combined

In [None]:
fraud_jobs_text = data_combined[data_combined.fraudulent == True].text

In [None]:
real_jobs_text = data_combined[data_combined.fraudulent == False].text

In [None]:
fraud_jobs_text

In [None]:
real_jobs_text

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
from wordcloud import WordCloud
import string
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, confusion_matrix, recall_score
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsRegressor
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_recall_curve

In [None]:
English

In [None]:
plt.figure(figsize = (16,14))

## Word Cloud - Fake Jobs

In [None]:
wc_fake_jobs = WordCloud(min_font_size = 3, width = 1600, height = 800, stopwords = STOP_WORDS).generate(str(" ".join(fraud_jobs_text)))

In [None]:
plt.imshow(wc_fake_jobs, interpolation="bilinear")

## Word Cloud - Real Jobs

In [None]:
wc_real_jobs = WordCloud(min_font_size = 3, width = 1600, height = 800, stopwords = STOP_WORDS).generate(str(" ".join(real_jobs_text)))

In [None]:
plt.imshow(wc_real_jobs, interpolation="bilinear")

## Finding punctuations and stopwords to remove them from text

In [None]:
punctuations = string.punctuation

In [None]:
stopwords = STOP_WORDS

In [None]:
stopwords

In [None]:
parser = English()


## Defining a tokenizer that lowercases every token and removes stop words and punctuation

In [None]:
def tokenizer(sentence):
    """Split *sentence* into lowercase tokens without stop words or punctuation.

    The text is tokenized with the module-level ``parser``. Each token's
    lowercase form is kept unless it occurs in ``stopwords`` or in
    ``punctuations``. No lemmatization is performed.

    Returns a list of strings.
    """
    return [
        token.lower_
        for token in parser(sentence)
        if not (token.lower_ in stopwords or token.lower_ in punctuations)
    ]

## Model 1 - K Nearest Neighbours

In [None]:
# NOTE(review): this regressor is not referenced again in the visible part of
# the notebook; the KNN model that is actually fitted and evaluated is the
# KNeighborsClassifier created in a later cell — confirm whether this line is
# still needed.
knn_model = KNeighborsRegressor(n_neighbors=3)

## Encoding features

In [None]:
# Encode the boolean target as 0/1 integers.
le = preprocessing.LabelEncoder()
dc_fraud = le.fit_transform(data_combined.fraudulent)

# Represent every posting by the TF-IDF weights of its words.
# The feature matrix must be built from the text only: pairing the text with
# the encoded label would hand the classifier the answer (target leakage), and
# label-encoding whole documents would only yield arbitrary ids with no
# meaningful distance between them for a nearest-neighbour model.
dc_text = TfidfVectorizer(stop_words="english").fit_transform(data_combined.text)

transformed_features = dc_text


## Splitting data into test and train

In [None]:
X_train, X_test, y_train, y_test = train_test_split(transformed_features, dc_fraud, test_size=0.3)

## Creating our model

In [None]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train,y_train)

## Classification Report for KNN

In [None]:
# Testing the classifier
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

## Confusion Matrix for KNN

In [None]:
plot_confusion_matrix(model, X_test, y_test, cmap='Blues', values_format=' ')

## Precision-Recall Curve for KNN

In [None]:
# A precision-recall curve needs a continuous score per sample; hard 0/1
# predictions collapse it to a single operating point.
# classes_ are sorted, so column 1 of predict_proba is the positive class.
knn_scores = model.predict_proba(X_test)[:, 1]
lr_precision, lr_recall, _ = precision_recall_curve(y_test, knn_scores)
plt.plot(lr_recall, lr_precision, marker='.', label='KNN')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()

## Splitting the data into test and train data and creating our predictor class

In [None]:
# Hold out 30% of the postings for testing.
# Stratifying keeps the real/fraud ratio identical in both splits, and the
# fixed seed makes the metrics reported below reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data_combined.text,
    data_combined.fraudulent,
    test_size=0.3,
    stratify=data_combined.fraudulent,
    random_state=42,
)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
class predictors(TransformerMixin):
    """Stateless pipeline step that normalises raw text documents.

    Each document has its surrounding whitespace removed and is lowercased.
    Nothing is learned during fitting and the step exposes no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        # Nothing to learn; return the instance itself.
        return self

    def transform(self, X, **transform_params):
        # Build the cleaned documents one by one, preserving input order.
        cleaned = []
        for document in X:
            cleaned.append(document.strip().lower())
        return cleaned

    def get_params(self, deep=True):
        # No configurable parameters.
        return dict()

## Model 2 - Logistic Regression

In [None]:
# creating our bag of words
vector = CountVectorizer(tokenizer = tokenizer, ngram_range=(1,2))
vector

## Creating a pipeline for Logistic Regression

In [None]:
clf = LogisticRegression(class_weight = 'balanced')

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vector),
                 ('classifier', clf)])

# fitting our model.
pipe.fit(X_train,y_train)

## Classification report for Logistic Regression

In [None]:
y_pred = pipe.predict(X_test)

print(classification_report(y_test,y_pred))

## Confusion Matrix for Logistic Regression

In [None]:
plot_confusion_matrix(pipe, X_test, y_test, cmap='Blues', values_format=' ')
plt.title('Confusion Matrix')
plt.show()

## Precision-Recall Curve for Logistic Regression

In [None]:
# A precision-recall curve needs a continuous score per sample; hard
# predictions collapse it to a single operating point.
# classes_ are sorted (False, True), so column 1 is the fraud probability.
logreg_scores = pipe.predict_proba(X_test)[:, 1]
lr_precision, lr_recall, _ = precision_recall_curve(y_test, logreg_scores)
plt.plot(lr_recall, lr_precision, marker='.', label='Logistic Regression')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()

## Model 3 -  Support Vector Machine

## Create a pipeline using Bag of Words

In [None]:
# Create pipeline using Bag of Words

pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', CountVectorizer(tokenizer = tokenizer, ngram_range=(1,3))),
                 ('classifier', SVC())])

#Training the model.
pipe.fit(X_train,y_train)


## Classification report for SVM

In [None]:
# Predicting with a test dataset
y_pred = pipe.predict(X_test)

print("Classification Report\n")
print(classification_report(y_test, y_pred))

## Confusion Matrix for SVM

In [None]:
plot_confusion_matrix(pipe, X_test, y_test, cmap='Blues', values_format=' ')
plt.title('Confusion Matrix')
plt.show()

## Precision-Recall Curve for SVM

In [None]:
# A precision-recall curve needs a continuous score per sample; hard
# predictions collapse it to a single operating point.
# The SVC is fitted without probability estimates, so the signed distance to
# the decision boundary is used as the score.
svm_scores = pipe.decision_function(X_test)
lr_precision, lr_recall, _ = precision_recall_curve(y_test, svm_scores)
# The legend label previously ended with a stray "|" character.
plt.plot(lr_recall, lr_precision, marker='.', label='Support Vector Machine')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()

## Model 4 - Decision Tree

## Create a pipeline for Decision Tree

In [None]:
clf = DecisionTreeClassifier(class_weight = 'balanced')

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vector),
                 ('classifier', clf)])

# fitting our model.
pipe.fit(X_train,y_train)
     

## Classification report for Decision Tree

In [None]:
y_pred = pipe.predict(X_test)

print(classification_report(y_test,y_pred))

## Confusion Matrix for Decision Tree

In [None]:
plot_confusion_matrix(pipe, X_test, y_test, cmap='Blues', values_format=' ')
plt.title('Confusion Matrix')
plt.show()

## Precision-Recall Curve for Decision Tree

In [None]:
# A precision-recall curve needs a score per sample rather than the hard
# prediction, which collapses it to a single operating point.
# classes_ are sorted (False, True), so column 1 is the fraud probability.
tree_scores = pipe.predict_proba(X_test)[:, 1]
lr_precision, lr_recall, _ = precision_recall_curve(y_test, tree_scores)
plt.plot(lr_recall, lr_precision, marker='.', label='Decision Tree')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()

## Model 5 - XGBoost

## Creating a pipeline for XGBoost

In [None]:
# XGBClassifier has no class_weight parameter (it would be passed through and
# ignored); class imbalance is handled with scale_pos_weight, conventionally
# the ratio of negative to positive training samples.
n_fraud = int(y_train.sum())
n_real = len(y_train) - n_fraud
# Fall back to a neutral weight if the training split contains no fraud rows.
clf = XGBClassifier(scale_pos_weight=(n_real / n_fraud) if n_fraud else 1.0)

# Create pipeline using Bag of Words
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', vector),
                 ('classifier', clf)])

# fitting our model.
pipe.fit(X_train,y_train)

## Classification report for XGBoost

In [None]:
y_pred = pipe.predict(X_test)

print(classification_report(y_test,y_pred))

## Confusion Matrix for XGBoost

In [None]:
plot_confusion_matrix(pipe, X_test, y_test, cmap='Blues', values_format=' ')

## Precision-Recall Curve for XGBoost

In [None]:
# A precision-recall curve needs a continuous score per sample; hard
# predictions collapse it to a single operating point.
# classes_ are sorted (False, True), so column 1 is the fraud probability.
xgb_scores = pipe.predict_proba(X_test)[:, 1]
lr_precision, lr_recall, _ = precision_recall_curve(y_test, xgb_scores)
plt.plot(lr_recall, lr_precision, marker='.', label='XGBoost')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
# show the plot
plt.show()