In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 

import re
import nltk
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from keras.models import Sequential
from keras.layers import Dense, Dropout

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# DATA PREPROCESSING

##### Data Preprocessing part 1: 
1. Drop Columns: salary_range, department. <br>
2. Add country, state for each entry and calculate percentage of fake jobs based on state. 
3. Fill the NaN values with blank spaces in the textual 
4. Create a text column with all the textual categories and drop them from the dataset. 

In [2]:
data = pd.read_csv("/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv")
data.drop(["salary_range", "department"], axis='columns', inplace = True)

#function to  get the country where the job is posted 
def country(text):
    if type(text) != float: #location is not null
         return text.split(',')[0]
    else: return ' '

#adding a column to the original dataset with the country where the job is posted    
data['country'] = data.location.apply(country)

# Creating a new dataset with jobs posted only in the US
data_US = data[data["country"] == 'US']
data_US = data_US.reset_index()
data_US.drop('index', axis = 'columns', inplace = True)

#Adding a column that indicates the state where the job was posted
def state(text):
    if len(text) > 3: return text.split(',')[1]
    else: return ' '    

data_US['state'] = data_US.location.apply(state)
#Creating two seperate datasets for real jobs and fake jobs in the US
data_US_fake = data_US[data_US['fraudulent'] == 1]
# Creating a new attribute, for each state we calculate the pecentage of fake jobs
state_df = data_US.state.value_counts().to_frame().rename(columns = {'state' : 'no of jobs'})
state_df['no of fake jobs'] = data_US_fake.state.value_counts()
state_df['p_fake_jobs'] = (state_df['no of fake jobs'] / state_df['no of jobs'])
#adding a new column to the US dataset, a percentage of fake jobs 
state_df.drop(' ', axis = 'index', inplace = True)
states_percentage = state_df['p_fake_jobs'].to_dict()  #creating a dictionary with each state and the percentage of fake jobs to add the  value to the dataset
data_US['percentage of fake jobs'] = data_US['state'].map(states_percentage)

#Creating a column with all the textual data
data_US_text = data_US[['title', 'location', 'company_profile', 'description', 'requirements', 'benefits',
                        'employment_type', 'required_experience','required_education', 'industry', 'function', 'fraudulent']]
data_US_text = data_US_text.fillna(' ')
data_US_text['text'] = data_US_text['title'] + ' ' + data_US_text['location'] + ' ' + data_US_text['company_profile'] + ' '+ data_US_text['description'] + ' '+ data_US_text['requirements'] + ' '+ data_US_text['benefits'] + ' '+ data_US_text['employment_type'] + ' '+ data_US_text['required_experience'] + ' ' + data_US_text['required_education'] + ' '+ data_US_text['industry'] + ' ' + data_US_text['function']
data_US_text.drop(columns = ['title', 'location', 'company_profile', 'description', 'requirements', 'benefits',
                        'employment_type', 'required_experience','required_education', 'industry', 'function' ], inplace = True)

data_US_text_real = data_US_text[data_US_text['fraudulent'] == 0]
data_US_text_fake = data_US_text[data_US_text['fraudulent'] == 1]

##### Data Preprocessing part 2: 
1. Convert to lower case.
2. Clean text from punctuation, numbers, links (https), symbols etc.
3. Clean text from stopwords.
4. Stemming 

In [3]:
def clean_text(row):
    t = row['text']
    #Lower case
    t = t.lower()
    #Removing punctuation, links, numbers, _/-/@/% etc. 
    t = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|[0-9]|(\w+:\/\/\S+)|^rt|http.+?", "", t)
    #Removing the extra spaces created 
    t = re.sub(' +', ' ', t)
    #English stopwords are removed
    fulltext = t.split()
    stop = stopwords.words('english')
    t = " ".join([w for w in fulltext if w not in (stop)])
    #Stemming
    fulltext = t.split()
    fulltext_stem = []
    stemmer = PorterStemmer()
    for w in fulltext:
        fulltext_stem.append(stemmer.stem(w))

    return ' '.join(s for s in fulltext_stem)

data_US_text['clean_text'] = data_US_text.apply(clean_text, axis=1)

In [4]:
data_US_text.to_csv('data_US_text.csv', index=False)

In [5]:
data_b = pd.read_csv("/kaggle/working/data_US_text.csv")

##### Converting to numerical form and spitting train/test.

In [6]:
def ready_for_training(text, y, ngram, bbool):
    #Converting the textual data into numerical form to feed to the predicting models, using the Bag of Words approach
    vec = CountVectorizer(ngram_range = ngram, binary = bbool)
    X = vec.fit_transform(text).toarray() #X = vec.fit_transform(data_US_text['clean_text']).toarray()
    #The BoW approach gives each word a score based on its occurence in the text, but does not take into consideration how
    #frequent this word is in all the texts, thus I use TF-IDF that considers the all the texts to assign a weightage to a word.
    tfidf = TfidfTransformer()
    X = tfidf.fit_transform(X).toarray()    
    return train_test_split(X, y, test_size=0.3, random_state=0) #splitting to train/test

X_train, X_test, y_train, y_test = ready_for_training(data_b['clean_text'], data_b.fraudulent,(1,1), False)
#considering bigrams (1,2) the session crashes. 

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(7459, 93400)
(3197, 93400)
(7459,)
(3197,)


# PREDICTIONS

In [7]:
def predictions(X_train, X_test, y_train, y_test):
    
    models = [LogisticRegression(), RandomForestClassifier()]
    for classifier in models:
        print('Model used: ' + str(classifier))
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(classification_report(y_test, y_pred))
                

In [8]:
predictions(X_train, X_test, y_train, y_test)

Model used: LogisticRegression()
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2998
           1       1.00      0.40      0.57       199

    accuracy                           0.96      3197
   macro avg       0.98      0.70      0.78      3197
weighted avg       0.96      0.96      0.96      3197

Model used: RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.97      1.00      0.99      2998
           1       1.00      0.60      0.75       199

    accuracy                           0.98      3197
   macro avg       0.99      0.80      0.87      3197
weighted avg       0.98      0.98      0.97      3197



In [9]:
model = Sequential()
model.add(Dense(50, input_shape=(X_train.shape[1],), activation='relu')) 
model.add(Dense(30, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 

model.compile(optimizer='Adam', loss='binary_crossentropy',  metrics=['accuracy'])

#no neeed for early stopping, no of epoch small
history = model.fit(X_train,y_train,
                    epochs=10, 
                    batch_size=10,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                4670050   
                                                                 
 dense_1 (Dense)             (None, 30)                1530      
                                                                 
 dense_2 (Dense)             (None, 15)                465       
                                                                 
 dense_3 (Dense)             (None, 1)                 16        
                                                                 
Total params: 4,672,061
Trainable params: 4,672,061
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
y_pred = np.round(model.predict(X_test),0)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2998
           1       0.92      0.82      0.86       199

    accuracy                           0.98      3197
   macro avg       0.95      0.91      0.93      3197
weighted avg       0.98      0.98      0.98      3197

