In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from wordcloud import WordCloud, STOPWORDS
from bs4 import BeautifulSoup
import re
import string
import unicodedata
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from keras.models import Sequential
from keras.layers import Dense

# Download NLTK data files
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [20]:
# Load the dataset
df = pd.read_csv("/content/drive/MyDrive/Colab çalışma/Neıman/textclasifer/fake_job_postings.csv")

# Display the first few rows of the dataset
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [21]:
# Check and drop the columns if they exist
columns_to_drop = ['salary_range', 'job_id']
for column in columns_to_drop:
    if column in df.columns:
        df.drop(column, axis=1, inplace=True)

# Fill missing values with an empty string
df.fillna(" ", inplace=True)

# Check if the columns exist before combining text columns into a single column
text_columns = ['title', 'location', 'department', 'company_profile', 'description',
                'requirements', 'benefits', 'employment_type', 'required_education',
                'industry', 'function']

# Combine text columns into a single column if they exist
df['text'] = ''
for column in text_columns:
    if column in df.columns:
        df['text'] += ' ' + df[column]

# Drop original columns if they exist
for column in text_columns:
    if column in df.columns:
        df.drop(column, axis=1, inplace=True)

# Strip any leading/trailing whitespace from the 'text' column
df['text'] = df['text'].str.strip()

# Display the first few rows of the cleaned dataset
df.head()

Unnamed: 0,telecommuting,has_company_logo,has_questions,required_experience,fraudulent,text
0,0,1,0,Internship,0,"Marketing Intern US, NY, New York Marketing We..."
1,0,1,0,Not Applicable,0,"Customer Service - Cloud Video Production NZ, ..."
2,0,1,0,,0,"Commissioning Machinery Assistant (CMA) US, IA..."
3,0,1,0,Mid-Senior level,0,"Account Executive - Washington DC US, DC, Wash..."
4,0,1,1,Mid-Senior level,0,"Bill Review Manager US, FL, Fort Worth SpotS..."


In [22]:
# Define stopwords and punctuation
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)

# Define a function to clean the text
def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)

    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop]

    # Lemmatize the text
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    return ' '.join(tokens)

# Apply the cleaning function to the text column
df['text'] = df['text'].apply(clean_text)

# Display the first few rows of the preprocessed dataset
df.head()

  text = BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,telecommuting,has_company_logo,has_questions,required_experience,fraudulent,text
0,0,1,0,Internship,0,marketing intern u ny new york marketing food ...
1,0,1,0,Not Applicable,0,customer service cloud video production nz auc...
2,0,1,0,,0,commissioning machinery assistant cma u ia wev...
3,0,1,0,Mid-Senior level,0,account executive washington dc u dc washingto...
4,0,1,1,Mid-Senior level,0,bill review manager u fl fort worth spotsource...


In [23]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the text data
X = tfidf_vectorizer.fit_transform(df['text'])

# Display the shape of the transformed data
print(X.shape)

(17880, 5000)


In [24]:
# Encode the target labels
label_encoder = LabelBinarizer()
y = label_encoder.fit_transform(df['fraudulent'])

# Display the shape of the encoded labels
print(y.shape)

(17880, 1)


In [25]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the training and testing sets
print(X_train.shape, X_test.shape)

(14304, 5000) (3576, 5000)


In [26]:
# Define the model
model = Sequential()
model.add(Dense(512, input_shape=(X_train.shape[1],), activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               2560512   
                                                                 
 dense_1 (Dense)             (None, 256)               131328    
                                                                 
 dense_2 (Dense)             (None, 128)               32896     
                                                                 
 dense_3 (Dense)             (None, 1)                 129       
                                                                 
Total params: 2724865 (10.39 MB)
Trainable params: 2724865 (10.39 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
# Convert the sparse matrix to a dense matrix
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Train the model
history = model.fit(X_train_dense, y_train, epochs=10, batch_size=32, validation_data=(X_test_dense, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [30]:
# Evaluate the Model
loss, accuracy = model.evaluate(X_test_dense, y_test)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

Test Accuracy: 98.04%


In [31]:
# Make Predictions
y_pred = model.predict(X_test_dense)
y_pred_binary = [1 if pred > 0.5 else 0 for pred in y_pred]

print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      3395
           1       0.80      0.82      0.81       181

    accuracy                           0.98      3576
   macro avg       0.90      0.90      0.90      3576
weighted avg       0.98      0.98      0.98      3576

