### Load Libraries and Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
# Read the job-postings dataset; the relative path is resolved against the
# current working directory.
df = pd.read_csv(filepath_or_buffer="fake_job_postings.csv")

### Pre-Processing

In [None]:
# Preview the first rows of the freshly loaded frame as a sanity check.
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [None]:
# Summarise column names, dtypes and non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() would append a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  func

In [None]:
# Count the null entries in each column.
missing_values = df.isna().sum()
missing_values

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [None]:
# Flag every row whose (description, title) pair already appeared earlier in
# the frame, then count the flagged rows.
dedup_mask = df.duplicated(subset=["description", "title"], keep="first")
duplicate_rows = dedup_mask.sum()

duplicate_rows

2093

In [None]:
# Keep only the first posting for each (description, title) pair
# (keep='first' is the pandas default).
df = df.drop_duplicates(
    subset=["description", "title"],
)

In [None]:
# Replace every missing value with a single space so that string operations on
# the text columns do not hit NaN values.
df = df.fillna(" ")

In [None]:
# Select the object-dtype (text) columns once; they are removed from the
# modelling frame and merged into a single free-text column instead.
object_frame = df.select_dtypes(include=[object])
columns_to_drop = object_frame.columns

# df2 keeps the non-object columns of df ...
df2 = df.drop(columns=columns_to_drop)

# ... plus one 'text' column holding each row's object values joined by spaces.
df2['text'] = object_frame.agg(' '.join, axis=1)

In [None]:
# Preview the modelling frame: the non-object columns plus the combined 'text' column.
df2.head()

### Model Experiment


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Ensure the NLTK resources behind the tokenizer, lemmatizer and stop-word list
# imported above are available locally.
# quiet=True suppresses the downloader log, which otherwise writes the local
# nltk_data path (including the OS user name) into the saved notebook output;
# looping also avoids the stray `True` the last bare call would display.
for nltk_resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(nltk_resource, quiet=True)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\csg20\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\csg20\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\csg20\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def run_experiment(dataframe, text_column, label_column, experiment_config, grid_search_configs):
    """Tune and evaluate one TF-IDF + classifier configuration.

    The text is preprocessed, split 70/30 into train and test portions
    (``random_state=42``), the hyper-parameters are grid-searched with 5-fold
    cross-validation on the training portion only, and the best pipeline
    (refitted on the training portion by ``GridSearchCV``) is scored on the
    held-out test portion.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame holding the text and label columns. Side effect: a
        ``'processed_text'`` column is added to (or overwritten on) this frame.
    text_column : str
        Name of the column containing the raw text.
    label_column : str
        Name of the column containing the target labels.
    experiment_config : dict
        Must provide ``'preprocessing'`` (forwarded to ``preprocess_text`` as
        its ``lemmatize`` argument), ``'vectorizer_condition'`` (used as the
        TF-IDF ``ngram_range``) and ``'classifier'`` (an estimator instance).
    grid_search_configs : list of dict
        Each entry provides ``'classifier_name'`` and ``'param_grid'``. The
        first entry whose ``'classifier_name'`` equals the classifier's class
        name is used; its ``'tfidf__ngram_range'`` values are replaced by the
        single ``'vectorizer_condition'`` of ``experiment_config``.

    Returns
    -------
    tuple
        ``(results, confusion_matrices)``: a one-row DataFrame with the columns
        Experiment, Accuracy, Recall, Precision and Best Params, and a dict
        mapping the classifier class name to its test-set confusion matrix.
        Both are empty when no grid-search configuration matches.

    Notes
    -----
    ``preprocess_text`` is not defined in this function; it must already exist
    in the notebook namespace when this is called.
    """
    results = []
    confusion_matrices = {}

    # Preprocess text based on experiment_config
    lemmatize = experiment_config['preprocessing']
    dataframe['processed_text'] = dataframe[text_column].apply(
        lambda x: preprocess_text(x, lemmatize=lemmatize)
    )

    # Identify the grid search configuration for the classifier in experiment_config
    classifier = experiment_config['classifier']
    classifier_name = classifier.__class__.__name__
    grid_search_config = next(
        (config for config in grid_search_configs if config['classifier_name'] == classifier_name),
        None,
    )

    if grid_search_config is None:
        print(f"No grid search configuration found for {classifier_name}")
        return pd.DataFrame(results), confusion_matrices

    # Split BEFORE tuning: the test rows must never influence the choice of
    # hyper-parameters, otherwise the reported metrics are optimistically biased.
    X_train, X_test, y_train, y_test = train_test_split(
        dataframe['processed_text'], dataframe[label_column], test_size=0.3, random_state=42
    )

    # Pipeline with the vectorizer configured as specified in experiment_config
    vec_cond = experiment_config['vectorizer_condition']
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', ngram_range=vec_cond)),
        (classifier_name, classifier)
    ])

    # Pin the n-gram range to the current vectorizer condition so the search
    # only explores the remaining parameters.
    param_grid = {**grid_search_config['param_grid'], 'tfidf__ngram_range': [vec_cond]}

    # Conduct grid search on the training portion only
    print(f"Running grid search for {classifier_name}")
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    print(f"Best parameters for {classifier_name}: {best_params}")

    # With the default refit=True, best_estimator_ is the whole pipeline
    # retrained on (X_train, y_train) with best_params, so no manual rebuild
    # or second fit is required.
    best_pipeline = grid_search.best_estimator_
    y_pred = best_pipeline.predict(X_test)

    # Calculate metrics and confusion matrix on the held-out test portion
    accuracy = accuracy_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred, zero_division=0)
    precision = precision_score(y_test, y_pred, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)

    # Store results
    experiment_name = f"{classifier_name}"
    confusion_matrices[experiment_name] = cm
    results.append({
        'Experiment': experiment_name,
        'Accuracy': accuracy,
        'Recall': recall,
        'Precision': precision,
        'Best Params': best_params
    })

    return pd.DataFrame(results), confusion_matrices

In [None]:
# Hyper-parameter grids, one entry per classifier. Parameter keys are prefixed
# with the pipeline step name ('tfidf' or the classifier's class name).
grid_search_configs = [
    {
        'classifier_name': 'SVC',
        'classifier': SVC(),
        'param_grid': {
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'SVC__C': [0.1, 1, 10],
            'SVC__kernel': ['linear', 'rbf', 'poly'],
            # SVC gamma and coef0 are not searched: optional for exploration,
            # but computationally intensive.
        },
    },
    {
        'classifier_name': 'MultinomialNB',
        'classifier': MultinomialNB(),
        'param_grid': {
            'tfidf__ngram_range': [(1, 1), (1, 2)],
            'tfidf__max_df': [0.5, 0.75, 1.0],
            'tfidf__min_df': [1, 2, 3],
            'tfidf__max_features': [None, 5000, 10000],
            'MultinomialNB__alpha': [0.1, 1.0, 10.0],
        },
    },
]

# Twelve experiment configurations: for each classifier (MultinomialNB first,
# then SVC), first with lemmatization and then without, each over the n-gram
# ranges (1, 1), (1, 3) and (2, 2). Every configuration gets its own fresh
# classifier instance.
experiment_configs = [
    {
        'preprocessing': use_lemmatizer,
        'vectorizer_condition': ngram_range,
        'classifier': classifier_cls(),
    }
    for classifier_cls in (MultinomialNB, SVC)
    for use_lemmatizer in (True, False)
    for ngram_range in ((1, 1), (1, 3), (2, 2))
]

In [None]:
# Experiment 0: first entry of experiment_configs.
results_df, confusion_matrices = run_experiment(
    dataframe=df2,
    text_column='text',
    label_column='fraudulent',
    experiment_config=experiment_configs[0],
    grid_search_configs=grid_search_configs,
)


Running grid search for MultinomialNB
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters for MultinomialNB: {'MultinomialNB__alpha': 0.1, 'tfidf__max_df': 1.0, 'tfidf__max_features': 10000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1)}


In [None]:
# Lift the column-width limit so the 'Best Params' dict is shown in full.
pd.options.display.max_colwidth = None
results_df

Unnamed: 0,Experiment,Accuracy,Recall,Precision,Best Params
0,MultinomialNB,0.982478,0.666667,0.907895,"{'MultinomialNB__alpha': 0.1, 'tfidf__max_df': 1.0, 'tfidf__max_features': 10000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1)}"


In [None]:
# Test-set confusion matrix for experiment 0, keyed by classifier class name.
confusion_matrices

{'MultinomialNB': array([[4516,   14],
        [  69,  138]], dtype=int64)}

In [None]:
# Experiment 1: second entry of experiment_configs.
results_df2, confusion_matrices2 = run_experiment(
    dataframe=df2,
    text_column='text',
    label_column='fraudulent',
    experiment_config=experiment_configs[1],
    grid_search_configs=grid_search_configs,
)

Running grid search for MultinomialNB
Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best parameters for MultinomialNB: {'MultinomialNB__alpha': 0.1, 'tfidf__max_df': 0.5, 'tfidf__max_features': None, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}


In [None]:
# Metrics and best parameters for experiment 1.
results_df2

Unnamed: 0,Experiment,Accuracy,Recall,Precision,Best Params
0,MultinomialNB,0.9867,0.772947,0.909091,"{'MultinomialNB__alpha': 0.1, 'tfidf__max_df': 0.5, 'tfidf__max_features': None, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 3)}"


In [None]:
# Test-set confusion matrix for experiment 1, keyed by classifier class name.
confusion_matrices2

{'MultinomialNB': array([[4514,   16],
        [  47,  160]], dtype=int64)}

In [None]:
# Experiment 2: third entry of experiment_configs.
results_df3, confusion_matrices3 = run_experiment(
    dataframe=df2,
    text_column='text',
    label_column='fraudulent',
    experiment_config=experiment_configs[2],
    grid_search_configs=grid_search_configs,
)

Running grid search for MultinomialNB
Fitting 5 folds for each of 81 candidates, totalling 405 fits
