In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 

import re
import nltk
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('wordnet')

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from keras.models import Sequential
from keras.layers import Dense, Dropout

from imblearn.under_sampling import RandomUnderSampler

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# <center><span style="color:#800000;"> DATA PREPROCESSING </span></center>

##### Data Preprocessing part 1: 
1. Drop Columns: salary_range, department. <br>
2. Add country, state for each entry and calculate percentage of fake jobs based on state. 
3. Fill the NaN values with blank spaces in the textual 
4. Create a text column with all the textual categories and drop them from the dataset. 

In [2]:
data = pd.read_csv("/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv")
data.drop(["salary_range", "department"], axis='columns', inplace = True)

#function to  get the country where the job is posted 
def country(text):
    if type(text) != float: #location is not null
         return text.split(',')[0]
    else: return ' '

#adding a column to the original dataset with the country where the job is posted    
data['country'] = data.location.apply(country)

# Creating a new dataset with jobs posted only in the US
data_US = data[data["country"] == 'US']
data_US = data_US.reset_index()
data_US.drop('index', axis = 'columns', inplace = True)

#Adding a column that indicates the state where the job was posted
def state(text):
    if len(text) > 3: return text.split(',')[1]
    else: return ' '    

data_US['state'] = data_US.location.apply(state)
#Creating two seperate datasets for real jobs and fake jobs in the US
data_US_fake = data_US[data_US['fraudulent'] == 1]
# Creating a new attribute, for each state we calculate the pecentage of fake jobs
state_df = data_US.state.value_counts().to_frame().rename(columns = {'state' : 'no of jobs'})
state_df['no of fake jobs'] = data_US_fake.state.value_counts()
state_df['p_fake_jobs'] = (state_df['no of fake jobs'] / state_df['no of jobs'])
#adding a new column to the US dataset, a percentage of fake jobs 
state_df.drop(' ', axis = 'index', inplace = True)
states_percentage = state_df['p_fake_jobs'].to_dict()  #creating a dictionary with each state and the percentage of fake jobs to add the  value to the dataset
data_US['percentage of fake jobs'] = data_US['state'].map(states_percentage)

#Creating a column with all the textual data
data_US_text = data_US[['title', 'location', 'company_profile', 'description', 'requirements', 'benefits',
                        'employment_type', 'required_experience','required_education', 'industry', 'function', 'fraudulent']]
data_US_text = data_US_text.fillna(' ')
data_US_text['text'] = data_US_text['title'] + ' ' + data_US_text['location'] + ' ' + data_US_text['company_profile'] + ' '+ data_US_text['description'] + ' '+ data_US_text['requirements'] + ' '+ data_US_text['benefits'] + ' '+ data_US_text['employment_type'] + ' '+ data_US_text['required_experience'] + ' ' + data_US_text['required_education'] + ' '+ data_US_text['industry'] + ' ' + data_US_text['function']
data_US_text.drop(columns = ['title', 'location', 'company_profile', 'description', 'requirements', 'benefits',
                        'employment_type', 'required_experience','required_education', 'industry', 'function' ], inplace = True)

data_US_text_real = data_US_text[data_US_text['fraudulent'] == 0]
data_US_text_fake = data_US_text[data_US_text['fraudulent'] == 1]

In [3]:
data_US_text.to_csv('data_US_text_not_clean.csv', index=False)

##### Data Preprocessing part 2: 
1. Convert to lower case.
2. Clean text from punctuation, numbers, links (https), symbols etc.
3. Clean text from stopwords.
4. Stemming 

In [4]:
def clean_text(row):
    t = row['text']
    #Lower case
    t = t.lower()
    #Removing punctuation, links, numbers, _/-/@/% etc. 
    t = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|[0-9]|(\w+:\/\/\S+)|^rt|http.+?", "", t)
    #Removing the extra spaces created 
    t = re.sub(' +', ' ', t)
    #English stopwords are removed
    fulltext = t.split()
    stop = stopwords.words('english')
    t = " ".join([w for w in fulltext if w not in (stop)])
    #Stemming
    fulltext_stem = []
    stemmer = PorterStemmer()
    for w in fulltext:
        fulltext_stem.append(stemmer.stem(w))

    return ' '.join(s for s in fulltext_stem)

data_US_text['clean_text'] = data_US_text.apply(clean_text, axis=1)

In [5]:
data_US_text.to_csv('data_US_text.csv', index=False)

In [6]:
data_b = pd.read_csv("/kaggle/working/data_US_text.csv")

##### Converting to numerical form and spitting train/test.

In [7]:
def ready_for_training(text, y, ngram, bbool):
    #Converting the textual data into numerical form to feed to the predicting models, using the Bag of Words approach
    vec = CountVectorizer(ngram_range = ngram, binary = bbool)
    X = vec.fit_transform(text).toarray() #X = vec.fit_transform(data_US_text['clean_text']).toarray()
    #The BoW approach gives each word a score based on its occurence in the text, but does not take into consideration how
    #frequent this word is in all the texts, thus I use TF-IDF that considers the all the texts to assign a weightage to a word.
    tfidf = TfidfTransformer()
    X = tfidf.fit_transform(X).toarray()    
    return train_test_split(X, y, test_size = 0.3, random_state=0) #splitting to train/test

X_train, X_test, y_train, y_test = ready_for_training(data_b['clean_text'], data_b.fraudulent,(1,1), False)
#considering bigrams (1,2) the session crashes. 

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7459, 93480)
(3197, 93480)
(7459,)
(3197,)


***

# <center><span style="color:#800000;"> PREDICTIONS</center>

## <span style="color:#4169E1;">1. Random Forest


### <span style="color:#4169E1;">1.1. Class Weighting

In [8]:
print('Random Forest With Class Weighting')

classifier = RandomForestClassifier(class_weight='balanced')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

Random Forest With Class Weighting
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2998
           1       1.00      0.49      0.66       199

    accuracy                           0.97      3197
   macro avg       0.98      0.75      0.82      3197
weighted avg       0.97      0.97      0.96      3197



### <span style="color:#4169E1;">1.2. Random Undersampling

In [9]:
undersample = RandomUnderSampler(sampling_strategy='majority')
#For example, if we set sampling_strategy to 0.5 in an imbalanced data dataset with 1,000 examples in the majority class and 100 examples in the minority class, then there would be 200 examples for the majority class in the transformed dataset (or 100/200 = 0.5).
undersample1 = RandomUnderSampler(sampling_strategy=0.5)
X_under, y_under = undersample.fit_resample(X_train, y_train)
X_under1, y_under1 = undersample1.fit_resample(X_train, y_train)

In [10]:
print('Random Forest with UnderSampling')
classifier = RandomForestClassifier()
classifier.fit(X_under, y_under)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

Random Forest with UnderSampling
              precision    recall  f1-score   support

           0       0.99      0.90      0.94      2998
           1       0.38      0.92      0.53       199

    accuracy                           0.90      3197
   macro avg       0.69      0.91      0.74      3197
weighted avg       0.96      0.90      0.92      3197



Using the above undersampling technique for random forest, we see that our recall value for the original minority class increases incredibly to a 0.91, however, our precision is lowered to 0.41, the lowest value I have seen so far for this prediction problem. This can be explained as the model is predicting a lot more cases as fake jobs, even cases that are in fact not fake. Thus, the recall score also increasesas the model is predicting as positive a larger percentage of the overall fake jobs. On the other side, the recall score for real jobs is relatively lower as the ratio of the rightfully predicted real jobs over all real jobs in the dataset is smaller.
<br>
**This is due to the undersampling method loosing valuable information from the majority class to balance the dataset.**

In [11]:
print('Random Forest with UnderSampling to 0.5 ratio majority/minority')
classifier.fit(X_under1, y_under1)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

Random Forest with UnderSampling to 0.5 ratio majority/minority
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2998
           1       0.91      0.71      0.80       199

    accuracy                           0.98      3197
   macro avg       0.95      0.85      0.89      3197
weighted avg       0.98      0.98      0.98      3197



**In this case I chose to rather than balance the number of real and fake jobs, to undersample the majority class to a 1/2 ration of majority/minority class. This way it can be observed an imporved f1-score better than the Logistic Regression and Random Forest previously used on the origical imbalanced data. This significantly performs better than the previous undersampling method as it increasies by 50% the amount of data to train the model.**

***

## <span style="color:#4169E1;">2. LOGISTIC REGRESSION

### <span style="color:#4169E1;">2.1. Class Weighting

In [12]:
print('LR With Class Weighting')

classifier = LogisticRegression(class_weight='balanced')
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

LR With Class Weighting
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2998
           1       0.73      0.87      0.79       199

    accuracy                           0.97      3197
   macro avg       0.86      0.92      0.89      3197
weighted avg       0.97      0.97      0.97      3197



### <span style="color:#4169E1;">2.2. Random Undersampling

In [13]:
print('Logistic Regression with UnderSampling to 0.5 ratio majority/minority')

classifier = LogisticRegression()
classifier.fit(X_under1, y_under1)
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

Logistic Regression with UnderSampling to 0.5 ratio majority/minority
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2998
           1       0.85      0.72      0.78       199

    accuracy                           0.97      3197
   macro avg       0.92      0.86      0.88      3197
weighted avg       0.97      0.97      0.97      3197



**Using the same undersampling technique on the training data before using Logistic Regression, we see a big improvement in performance, f-1 score for fake jobs prediction is increased from 0.56 (original imbalanced data) to 0.80 (undersampled data).**

<span style="font-size:18px;" >USING UNDERSAMPLING METHOD WITH 0.5 RATIO IS THE BEST METHOD FOUND TO INCREASE PERFORMANCE WITH THE LOGISTIC REGRESSION AND RANDOM FOREST CLASSIFIER METHODS. 😁

<span style="color:#800000;font-size:22px;" >However I was not able to improve the performance of these models enough to perform better than the Neural Network used previously. Thus, below I am going to work on improving the performance of the Neural Network model. </span> ⬇️

***

## <span style="color:#4169E1;">3. Neural Network

### <span style="color:#4169E1;">3.1. Random Undersampling

In [14]:
model = Sequential()
model.add(Dense(50, input_shape=(X_under1.shape[1],), activation='relu')) 
model.add(Dense(30, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 

model.compile(optimizer='Adam', loss='binary_crossentropy',  metrics=['accuracy'])

#no neeed for early stopping, no of epoch small
history = model.fit(X_under1, y_under1,
                    epochs=10, 
                    batch_size=10,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                4674050   
                                                                 
 dense_1 (Dense)             (None, 30)                1530      
                                                                 
 dense_2 (Dense)             (None, 15)                465       
                                                                 
 dense_3 (Dense)             (None, 1)                 16        
                                                                 
Total params: 4,676,061
Trainable params: 4,676,061
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
y_pred = np.round(model.predict(X_test),0)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      2998
           1       0.83      0.77      0.80       199

    accuracy                           0.98      3197
   macro avg       0.91      0.88      0.89      3197
weighted avg       0.98      0.98      0.98      3197



### <span style="color:#4169E1;">3.2. Random Undersampling

In [16]:
model = Sequential()
model.add(Dense(50, input_shape=(X_train.shape[1],), activation='relu')) 
model.add(Dense(30, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 
weights = {0:1, 1:13}
model.compile(optimizer='Adam', loss='binary_crossentropy',  metrics=['accuracy'])

#no neeed for early stopping, no of epoch small

history = model.fit(X_train, y_train,
                    epochs=10, 
                    batch_size=10,
                    class_weight = weights,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 50)                4674050   
                                                                 
 dense_5 (Dense)             (None, 30)                1530      
                                                                 
 dense_6 (Dense)             (None, 15)                465       
                                                                 
 dense_7 (Dense)             (None, 1)                 16        
                                                                 
Total params: 4,676,061
Trainable params: 4,676,061
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [17]:
y_pred = np.round(model.predict(X_test),0)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2998
           1       0.82      0.84      0.83       199

    accuracy                           0.98      3197
   macro avg       0.91      0.91      0.91      3197
weighted avg       0.98      0.98      0.98      3197

