In [17]:
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, dataset, TensorDataset
import torch.optim.adam as adam
import torch

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

import os
import time
from tqdm import tqdm
tqdm.pandas()
from collections import Counter
import re

import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize


In [2]:
data = pd.read_csv("fake_job_postings.csv")
data.head(4)

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0


### For the purpose of text classification i am focussing on Job description
### to determine wheter the job description is fraudalent or not

In [3]:
## Taking description and fraudalent column as features and Target
short_data = data[['description', 'fraudulent']]
short_data.head(4)

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0


In [4]:
short_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  17879 non-null  object
 1   fraudulent   17880 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 279.5+ KB


In [5]:
short_data = short_data.dropna()
short_data.isnull().sum()

description    0
fraudulent     0
dtype: int64

In [6]:
short_data.fraudulent.value_counts()

0    17014
1      865
Name: fraudulent, dtype: int64

In [None]:
spaces = [' ', '\t', '\n']
def remove_space(text):
    """
    remove extra spaces and ending space if any
    """
    for space in spaces:
        text = text.replace(space, ' ')
    text = text.strip()
    text = re.sub('\s+', ' ', text)
    return text


def clean_special_punctuations(text):
    """
    Remove special punctuations from the given text.
    """
    for punc in special_punc_mappings:
        if punc in text:
            text = text.replace(punc, special_punc_mappings[punc])
    return text

print(remove_space("Organised - Focused -         Vibrant - Awesome!Do you"))

In [7]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
stop_words = set(stopwords.words('English'))


def remove_hyperlinks(text):
    return re.sub(r'http\S+', ' ', text)

def remove_punctuation(text):
    return re.sub(r'[\"\#\$\%\&\'\(\)\*\+\/\:\;\,\<\=\>\@\[\\\]\^\_\`\{\|\}\~]', '', text)

def rm_html_tags(text):
    text = re.sub(r'<.*?>', '', text)
    return re.sub(r'<br />', '', text)

def space_bt_punct(text):
    pattern = r'([.,!?-])'
    s = re.sub(pattern, r' \1 ', text)  # add whitespaces between punctuation
    s = re.sub(r'\s{2,}', ' ', s)  # remove double whitespaces
    return s

def remove_integers(text):
    return re.sub(r'\d+', '', text)

def rm_whitespaces(text):
    return re.sub('\s+', ' ', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)

def rm_nonascii(text):
    return re.sub(r'[^\x00-\x7f]', r'', text)


def clean_pipeline(text):
    text = text.lower()
    no_link = remove_hyperlinks(text)
    no_html = rm_html_tags(no_link)
    space_punct = space_bt_punct(no_html)
    no_punct = remove_punctuation(space_punct)
    no_number = remove_integers(no_punct)
    no_whitespaces = rm_whitespaces(no_number)
    no_nonasci = rm_nonascii(no_whitespaces)
    #spell_corrected = spell_correction(no_emoji)
    return no_nonasci


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/harshayarravarapu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [8]:
short_data['description'] = short_data['description'].progress_apply(clean_pipeline)


100%|██████████| 17879/17879 [00:03<00:00, 5810.10it/s]


In [9]:
def tokenize_words(text):
    # Tokenizes the text into list of tokens(words)
    # "go until jurong point crazy" --> "go", "until","jurong", "point", "crazy",
    return word_tokenize(text)

def remove_stopwords(text):
    ## Remove stopwords such as is am it there
    return [word for word in text if word not in stop_words]

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in text:
        pos_tag = nltk.pos_tag([word])[0][1][0].upper()
        wordnet_pos = {'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV, 'J': wordnet.ADJ}.get(pos_tag, wordnet.NOUN)
        # Lemmatize the word with the specified POS tag
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        lemmas.append(lemma)
    ## Lemmatize words like running --> run
    # make sure lemmas does not contains stopwords
    return remove_stopwords(lemmas)


def preprocess_pipeline(text):
    tokens = tokenize_words(text)
    no_stopwords = remove_stopwords(tokens)
    lemmas = lemmatize(no_stopwords)
    return ' '.join(lemmas)

In [10]:
short_data['description'] = short_data['description'].progress_apply(preprocess_pipeline)

100%|██████████| 17879/17879 [03:38<00:00, 81.66it/s] 


In [11]:
reviews = short_data.description.values
# merge into single variable, separated by whitespaces
words = ' '.join(reviews)
# obtain list of words
words = words.split()
# build vocabulary
counter = Counter(words)
# only keep top 2000 words
vocab = sorted(counter, key=counter.get, reverse=True)[:2000]
int2word = dict(enumerate(vocab, 2))
int2word[0] = '<PAD>'
int2word[1] = '<UNK>'
word2int = {word: id for id, word in int2word.items()}

In [12]:
reviews_enc = [[word2int[word] if word in word2int else word2int['<UNK>'] for word in review.split()] for review in tqdm(reviews, desc='encoding')]


encoding: 100%|██████████| 17879/17879 [00:00<00:00, 41942.50it/s]


In [13]:
def pad_features(reviews, pad_id, seq_length=128):
    # features = np.zeros((len(reviews), seq_length), dtype=int)
    features = np.full((len(reviews), seq_length), pad_id, dtype=int)

    for i, row in enumerate(reviews):
        start_index = max(0, seq_length - len(row))
        # if seq_length < len(row) then review will be trimmed
        features[i, start_index:] = np.array(row)[:min(seq_length, len(row))]

    return features


seq_length = 128
features = pad_features(reviews_enc, pad_id=word2int['<PAD>'], seq_length=seq_length)


In [15]:
labels = short_data.fraudulent.to_numpy()

# train test split
train_size = .75  # we will use 75% of whole data as train set
val_size = .5  # and we will use 50% of test set as validation set

# stratify will make sure that train and test set have same distribution of labels
train_x, test_x, train_y, test_y = train_test_split(features, labels, test_size=1 - train_size, stratify=labels)

# split test set into validation and test set
val_x, test_x, val_y, test_y = train_test_split(test_x, test_y, test_size=val_size, stratify=test_y)

In [18]:
# define batch size
batch_size = 64

# create tensor datasets
train_dataset = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_dataset = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_dataset = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# create dataloaders
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [19]:
class Rnn_textclassification(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size):
        super(Rnn_textclassification, self).__init__()
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size

        self.embedding_layer = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.RNN(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        embedded = self.embedding_layer(x)
        output, hidden = self.rnn(embedded)
        # Take the last output of the RNN
        last_output = output[:, -1, :]
        final_out = torch.sigmoid(self.fc(last_output))
        return final_out


In [20]:
vocab_size = len(vocab) + 2
output_size = 1
embedding_dim = 100
hidden_dim = 256
num_layers = 2

model = Rnn_textclassification(vocab_size, embedding_dim, hidden_dim, output_size)

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = model.to(device)

In [22]:
lr = 0.0001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [23]:
def train_model(model, criterion, optimizer, train_loader, device):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)
        loss = criterion(outputs.squeeze(), labels.float())

        loss.backward()
        optimizer.step()

        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    return epoch_loss


In [24]:
num_epochs = 10

for epoch in range(num_epochs):
    train_loss = train_model(model, criterion, optimizer, train_loader, device)
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {train_loss:.4f}')

Epoch [1/10], Loss: 0.2558
Epoch [2/10], Loss: 0.1885
Epoch [3/10], Loss: 0.1783
Epoch [4/10], Loss: 0.1646
Epoch [5/10], Loss: 0.1542
Epoch [6/10], Loss: 0.1438
Epoch [7/10], Loss: 0.1355
Epoch [8/10], Loss: 0.1272
Epoch [9/10], Loss: 0.1206
Epoch [10/10], Loss: 0.1123


In [25]:
def calculate_accuracy(model, test_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            predicted = (outputs > 0.5).float()  # Convert probabilities to binary predictions
            correct += (predicted == labels.view_as(predicted)).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return accuracy

# Calculate accuracy
accuracy = calculate_accuracy(model, test_loader, device)
print(f'Accuracy: {accuracy:.2f}')


Accuracy: 0.96
