In [18]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# %matplotlib inline

**Augmentation must be applied only on train set**

Performance xgboost gets worse using augmentation with back translation

In [19]:
data_train = pd.read_csv('../../data/processed/back_translation/augmented_train_data.csv')
data_train.shape

(8328, 24)

In [20]:
data_test = pd.read_csv('../../data/processed/back_translation/data_test.csv')
data_test.shape

(1702, 24)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorize Using TFIDF
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), stop_words='english')
X_text_tfidf_train = tfidf.fit_transform(data_train['cleaned_text'])
X_text_tfidf_test = tfidf.fit_transform(data_test['cleaned_text'])

In [22]:
X_text_tfidf_train.shape, X_text_tfidf_test.shape

((8328, 5000), (1702, 5000))

In [23]:
# vectorize embedding

from sentence_transformers import SentenceTransformer
import xgboost as xgb

# Load model embedding
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

X_text_embedding_train = model.encode(data_train['cleaned_text'].tolist())
X_text_embedding_test = model.encode(data_test['cleaned_text'].tolist())



In [24]:
X_text_embedding_train.shape, X_text_embedding_test.shape

((8328, 384), (1702, 384))

In [25]:
data_train.columns

Index(['title', 'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent', 'country', 'state', 'city', 'combined_text',
       'cleaned_text', 'industry_grouped', 'function_grouped',
       'country_grouped', 'state_grouped', 'city_grouped'],
      dtype='object')

In [26]:
data_train_meta = data_train[['telecommuting','has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry_grouped', 'function_grouped', 'country_grouped', 'state_grouped', 'city_grouped']]
data_test_meta = data_test[['telecommuting','has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry_grouped', 'function_grouped', 'country_grouped', 'state_grouped', 'city_grouped']]

In [27]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Custom order for ordinal features
experience_order = [
    'Internship',
    'Entry level',
    'Associate',
    'Mid-Senior level',
    'Director',
    'Executive',
    'Not Applicable',
    'Unknown'
]

education_order = [
    'Some High School Coursework',
    'High School or equivalent',
    'Vocational - HS Diploma',
    'Some College Coursework Completed',
    'Associate Degree',
    'Vocational',
    'Vocational - Degree',
    'Certification',
    "Bachelor's Degree",
    "Master's Degree",
    'Professional',
    'Doctorate',
    'Unspecified',
    'Unknown'
]

preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal encoding for ordered columns
        ('ord', OrdinalEncoder(categories=[experience_order, education_order]), 
         ['required_experience', 'required_education']),
        
        # One-hot encoding for nominal columns
        ('ohe', OneHotEncoder(drop='first', handle_unknown='ignore'),
         ['employment_type','industry_grouped','function_grouped', 'country_grouped', 'state_grouped', 'city_grouped'])
    ],
    remainder='passthrough'  # Keep other features 
)


In [28]:
y_train = data_train['fraudulent']
data_meta_train_encoded = preprocessor.fit_transform(data_train_meta)

y_test = data_test['fraudulent']
data_meta_test_encoded = preprocessor.transform(data_test_meta)



In [29]:
data_meta_train_encoded.shape, data_meta_test_encoded.shape

((8328, 471), (1702, 471))

In [30]:
# embedding + tfidf

from scipy.sparse import hstack

X_train = hstack([X_text_tfidf_train, data_meta_train_encoded])  
X_test = hstack([X_text_tfidf_test, data_meta_test_encoded])  

In [31]:
X_train.shape, X_test.shape

((8328, 5471), (1702, 5471))

In [32]:
y_train.value_counts()

fraudulent
0    6113
1    2215
Name: count, dtype: int64

### Modelling

In [34]:
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support

In [35]:
# ratio = 12166/586
ratio = 6113/2215
ratio

2.7598194130925506

In [37]:
# model with tfidf
ratio = 6113/2215

clf_xgb = XGBClassifier(scale_pos_weight=ratio)
clf_xgb.fit(X_train, y_train)

y_pred = clf_xgb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.44      0.61      1529
           1       0.16      0.95      0.28       173

    accuracy                           0.50      1702
   macro avg       0.58      0.70      0.45      1702
weighted avg       0.90      0.50      0.58      1702



Augmentation on minority class makes model perform worse