## Packages

In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 

import re
import nltk
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

from keras.models import Sequential
from keras.layers import Dense, Dropout

#nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
pip install nlpaug

Collecting nlpaug
  Downloading nlpaug-1.1.11-py3-none-any.whl (410 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gdown>=4.0.0
  Downloading gdown-4.7.1-py3-none-any.whl (15 kB)
Installing collected packages: gdown, nlpaug
Successfully installed gdown-4.7.1 nlpaug-1.1.11
[0mNote: you may need to restart the kernel to use updated packages.


In [3]:
import nlpaug as nlpaug
import nlpaug.augmenter.word as naw

## Data Preprocessing

In [4]:
data = pd.read_csv("/kaggle/input/real-or-fake-fake-jobposting-prediction/fake_job_postings.csv")
data.drop(["salary_range", "department"], axis='columns', inplace = True)

#function to  get the country where the job is posted 
def country(text):
    if type(text) != float: #location is not null
         return text.split(',')[0]
    else: return ' '
#adding a column to the original dataset with the country where the job is posted    
data['country'] = data.location.apply(country)

# Creating a new dataset with jobs posted only in the US
data_US = data[data["country"] == 'US']
data_US = data_US.reset_index()
data_US.drop('index', axis = 'columns', inplace = True)

#Adding a column that indicates the state where the job was posted
def state(text):
    if len(text) > 3: return text.split(',')[1]
    else: return ' '    

data_US['state'] = data_US.location.apply(state)
#Creating two seperate datasets for real jobs and fake jobs in the US
data_US_fake = data_US[data_US['fraudulent'] == 1]
# Creating a new attribute, for each state we calculate the pecentage of fake jobs
state_df = data_US.state.value_counts().to_frame().rename(columns = {'state' : 'no of jobs'})
state_df['no of fake jobs'] = data_US_fake.state.value_counts()
state_df['p_fake_jobs'] = (state_df['no of fake jobs'] / state_df['no of jobs'])
#adding a new column to the US dataset, a percentage of fake jobs 

state_df.drop(' ', axis = 'index', inplace = True)
states_percentage = state_df['p_fake_jobs'].to_dict()  #creating a dictionary with each state and the percentage of fake jobs to add the  value to the dataset
data_US['percentage of fake jobs'] = data_US['state'].map(states_percentage)

#Creating a column with all the textual data
data_US_text = data_US[['title', 'location', 'company_profile', 'description', 'requirements', 'benefits',
                        'employment_type', 'required_experience','required_education', 'industry', 'function', 'fraudulent']]
data_US_text = data_US_text.fillna(' ')
data_US_text['text'] = data_US_text['title'] + ' ' + data_US_text['location'] + ' ' + data_US_text['company_profile'] + ' '+ data_US_text['description'] + ' '+ data_US_text['requirements'] + ' '+ data_US_text['benefits'] + ' '+ data_US_text['employment_type'] + ' '+ data_US_text['required_experience'] + ' ' + data_US_text['required_education'] + ' '+ data_US_text['industry'] + ' ' + data_US_text['function']
data_US_text.drop(columns = ['title', 'location', 'company_profile', 'description', 'requirements', 'benefits',
                        'employment_type', 'required_experience','required_education', 'industry', 'function' ], inplace = True)

del data
del country
del state
del state_df
del data_US
del data_US_fake
del states_percentage

## Train and Test split

In [5]:
X = data_US_text['text']
y = data_US_text.fraudulent
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=0)

## Augment Data

In [6]:
# seperate train data (X and y) in fraudulent and real cases. 
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)
train_no = len(train_df)

train_df_fake = train_df[train_df['fraudulent'] == 1]

del X_train
del X_test
del y_train
del y_test

In [7]:
aug_syn = naw.SynonymAug(aug_src = 'wordnet', lang = 'eng')
aug_emb = naw.ContextualWordEmbsAug(model_path = 'roberta-base', action = "insert")

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...


Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/501M [00:00<?, ?B/s]

In [8]:
def augment(X, y): #input is series
    list_aug_text = []
    list_aug_label = []
    num = 1
    all = len(X)
    for i in X.index:
        print(str(round((num/all)*100)) + '% done')
        num += 1
        aug_text = aug_syn.augment(X[i])
        aug_text = aug_emb.augment(aug_text[0])
        list_aug_text.append(aug_text[0])
        list_aug_label.append(y[i])
  
    list_aug_text_series = pd.Series(list_aug_text)
    list_aug_label_series = pd.Series(list_aug_label)

    X_augmented = X.append(list_aug_text_series, ignore_index = True)
    y_augmented = y.append(list_aug_label_series, ignore_index = True)

    return X_augmented, y_augmented #series

#X_fake_aug, y_fake_aug = augment(train_df_fake.text, train_df_fake.fraudulent)

In [9]:
#import the already augmented data
aug_data = pd.read_csv("/kaggle/input/augmented-data/X_train_fake_aug_df.csv")
aug_data.drop(["Unnamed: 0"], axis='columns', inplace = True)
aug_data['fraudulent'] = 1
aug_no = len(aug_data)
aug_data

Unnamed: 0,text,fraudulent
0,"Lawn and Maintenance Contractors US, MD, Coll...",1
1,"Administrative Assistant US, NV, LAS VEGAS A...",1
2,"SAP Support Advisor US, TX, Houston Aker Solut...",1
3,"Project Subcontracting Manager US, TX, Houston...",1
4,"Clerical Personnel US, OH, Cincinnati Bradley ...",1
...,...,...
1057,"Health + Environmental Professional US, CA, Ba...",1
1058,"Project QA Engineer US, TX, Houston Corporate ...",1
1059,"Class A - CDL Driver - Doubles Endorsed US, OH...",1
1060,"Project Controls Tech US, CO, Denver Staffing ...",1


## Cleaning the text

In [10]:
#CLEANING
def clean_text(row):
    t = row['text']
    #Lower case
    t = t.lower()
    #Removing punctuation, links, numbers, _/-/@/% etc. 
    t = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|[0-9]|(\w+:\/\/\S+)|^rt|http.+?", "", t)
    #Removing the extra spaces created 
    t = re.sub(' +', ' ', t)
    #English stopwords are removed
    fulltext = t.split()
    stop = stopwords.words('english')
    t = " ".join([w for w in fulltext if w not in (stop)])
    #Stemming
    fulltext = t.split()
    fulltext_stem = []
    stemmer = PorterStemmer()
    for w in fulltext:
        fulltext_stem.append(stemmer.stem(w))

    return ' '.join(s for s in fulltext_stem)

In [11]:
# Cleaning aug data
aug_data["clean_text"] = aug_data.apply(clean_text, axis = 1)
aug_data.drop(["text"], axis='columns', inplace = True)

In [12]:
# Cleaning train data
train_df["clean_text"] = train_df.apply(clean_text, axis = 1)
train_df.drop(["text"], axis='columns', inplace = True)

In [13]:
# Cleaning test data
test_df["clean_text"] = test_df.apply(clean_text, axis = 1)
test_df.drop(["text"], axis='columns', inplace = True)

In [14]:
aug_data['fraudulent'] = 1
final_df = aug_data.append(train_df, ignore_index=True)
final_df = final_df.append(test_df, ignore_index=True)

In [15]:
del aug_data
del train_df
del test_df

## Getting the data ready for the models

In [16]:
#Converting the textual data into numerical form to feed to the predicting models, using the Bag of Words approach
vec = CountVectorizer(ngram_range = (1,1), binary = False)
X_final = vec.fit_transform(final_df['clean_text']).toarray() #X = vec.fit_transform(data_US_text['clean_text']).toarray()
#The BoW approach gives each word a score based on its occurence in the text, but does not take into consideration how
#frequent this word is in all the texts, thus I use TF-IDF that considers the all the texts to assign a weightage to a word.
tfidf = TfidfTransformer()
X_final = tfidf.fit_transform(X_final).toarray()

In [17]:
# seperate the augmented data
aug_X = X_final[:aug_no,:]
aug_y = final_df.fraudulent[:aug_no]
# train and test split
X_train = X_final[aug_no:(aug_no + train_no),:]
y_train = final_df.fraudulent[aug_no:(aug_no + train_no)]

X_test = X_final[(aug_no + train_no):, :]
y_test = final_df.fraudulent[(aug_no + train_no):]

del X_final
del final_df

# add the augmented data to the train 
X_train_ = np.append(X_train, aug_X, axis = 0)
del X_train
del aug_X
y_train_ = y_train.append(aug_y, ignore_index=True)
del aug_y
del y_train

## Predictions

In [18]:
def predictions(X_train, X_test, y_train, y_test):
    
    models = [LogisticRegression(), RandomForestClassifier()]
    for classifier in models:
        print('Model used: ' + str(classifier))
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        print(classification_report(y_test, y_pred))
        
predictions(X_train_, X_test, y_train_, y_test)

Model used: LogisticRegression()
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2998
           1       0.95      0.68      0.80       199

    accuracy                           0.98      3197
   macro avg       0.97      0.84      0.89      3197
weighted avg       0.98      0.98      0.98      3197

Model used: RandomForestClassifier()
              precision    recall  f1-score   support

           0       0.98      1.00      0.99      2998
           1       1.00      0.65      0.79       199

    accuracy                           0.98      3197
   macro avg       0.99      0.82      0.89      3197
weighted avg       0.98      0.98      0.98      3197



In [19]:
model = Sequential()
model.add(Dense(50, input_shape=(X_train_.shape[1],), activation='relu')) 
model.add(Dense(30, activation='relu'))
model.add(Dense(15, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.summary() 

model.compile(optimizer='Adam', loss='binary_crossentropy',  metrics=['accuracy'])

history = model.fit(X_train_, y_train_,
                    epochs=8, 
                    batch_size=3,
                    validation_split=0.2,
                    shuffle=True,
                    verbose=1)

y_pred = np.round(model.predict(X_test),0)
print(classification_report(y_test, y_pred))

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 50)                4702300   
                                                                 
 dense_1 (Dense)             (None, 30)                1530      
                                                                 
 dense_2 (Dense)             (None, 15)                465       
                                                                 
 dense_3 (Dense)             (None, 1)                 16        
                                                                 
Total params: 4,704,311
Trainable params: 4,704,311
Non-trainable params: 0
_________________________________________________________________
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
              precision    recall  f1-score   support

           0       0.99      0.99      0.99