In [8]:
import pandas as pd
import numpy as np

In [9]:
raw_data = pd.read_csv('../../data/processed/preprocessed_job_dataset.csv')
print(raw_data.shape)
raw_data['fraudulent'].value_counts()

(17589, 23)


fraudulent
0    16734
1      855
Name: count, dtype: int64

In [10]:
# minimize only data with labels 0 not fraud
df = raw_data[raw_data['fraudulent']==0]

In [11]:
df.shape

(16734, 23)

In [12]:
df.isnull().sum()

title                  0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
combined_text          0
industry_grouped       0
function_grouped       0
country_grouped        0
state_grouped          0
city_grouped           0
dtype: int64

In [13]:
# Combine relevant text columns
df['combined_text'] = (
    df['title'] + ' ' +
    df['company_profile'] + ' ' +
    df['description'] + ' ' +
    df['requirements'] + ' ' +
    df['benefits']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['combined_text'] = (


In [14]:
import re
from bs4 import BeautifulSoup
import html

def clean_text(text):
    t = BeautifulSoup(text, "html.parser").get_text()
    t = html.unescape(t)
    t = re.sub(r"#URL_[^#]+#", "", t)
    t = re.sub(r"\s+", " ", t).strip()
    
    return t

df['cleaned_text'] = df['combined_text'].map(clean_text)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_text'] = df['combined_text'].map(clean_text)


In [15]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Custom order for ordinal features
experience_order = [
    'Internship',
    'Entry level',
    'Associate',
    'Mid-Senior level',
    'Director',
    'Executive',
    'Not Applicable',
    'Unknown'
]

education_order = [
    'Some High School Coursework',
    'High School or equivalent',
    'Vocational - HS Diploma',
    'Some College Coursework Completed',
    'Associate Degree',
    'Vocational',
    'Vocational - Degree',
    'Certification',
    "Bachelor's Degree",
    "Master's Degree",
    'Professional',
    'Doctorate',
    'Unspecified',
    'Unknown'
]

preprocessor = ColumnTransformer(
    transformers=[
        # Ordinal encoding for ordered columns
        ('ord', OrdinalEncoder(categories=[experience_order, education_order]), 
         ['required_experience', 'required_education']),
        
        # One-hot encoding for nominal columns
        ('ohe', OneHotEncoder(handle_unknown='ignore'),
         ['employment_type','industry_grouped','function_grouped', 'country_grouped', 'state_grouped', 'city_grouped'])
    ],
    remainder='passthrough'  # Keep other features 
)


In [16]:
meta_data = df[['telecommuting','has_company_logo', 'has_questions', 'employment_type', 'required_experience', 'required_education', 'industry_grouped', 'function_grouped', 'country_grouped', 'state_grouped', 'city_grouped', 'fraudulent']]

X_meta = preprocessor.fit_transform(meta_data)

In [17]:
df.shape

(16734, 24)

In [18]:
from sentence_transformers import SentenceTransformer

# Text embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')
text_embeddings = model.encode(df['cleaned_text'].tolist())


  from .autonotebook import tqdm as notebook_tqdm


In [19]:
# Combine into one vector per row

# Convert to dense if it's sparse
if not isinstance(X_meta, np.ndarray):
    X_meta = X_meta.toarray()

combined_features = np.hstack([text_embeddings, X_meta])


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_matrix = cosine_similarity(combined_features)
threshold = 0.99

# Find duplicates
duplicates = np.argwhere(similarity_matrix > threshold)


In [21]:
duplicates

array([[    0,     0],
       [    0,  4363],
       [    0, 10916],
       ...,
       [16733, 12291],
       [16733, 15034],
       [16733, 16733]], shape=(600812, 2))

In [33]:
df.iloc[[0,4363], :]

Unnamed: 0,title,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,...,country,state,city,combined_text,industry_grouped,function_grouped,country_grouped,state_grouped,city_grouped,cleaned_text
0,Marketing Intern,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,No benefits listed,0,1,0,Other,Internship,...,US,NY,New York,"Marketing Intern We're Food52, and we've creat...",Unknown,Marketing,US,NY,New York,"Marketing Intern We're Food52, and we've creat..."
4363,Audience Development Intern,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,No benefits listed,0,1,0,Other,Internship,...,US,NY,New York,"Audience Development Intern We're Food52, and ...",Unknown,Marketing,US,NY,New York,"Audience Development Intern We're Food52, and ..."


In [23]:
# Remove self matches and mirrored pairs
pairs = [(i, j) for i, j in duplicates if i < j]


In [24]:
to_remove = set()

for i, j in pairs:
    to_remove.add(j)  # mark the second as duplicate

to_remove = sorted(to_remove)


In [25]:
duplicates_data = df.iloc[to_remove]
print(duplicates_data)


                                                   title  \
24                                     Customer Service    
25                             H1B SPONSOR FOR L1/L2/OPT   
36                    Graduates: English Teacher Abroad    
40                               English Teacher Abroad    
48                               English Teacher Abroad    
...                                                  ...   
17584                   Account Director - Distribution    
17585                                 Payroll Accountant   
17586  Project Cost Control Staff Engineer - Cost Con...   
17587                                   Graphic Designer   
17588                         Web Application Developers   

                                         company_profile  \
24                           No company profile provided   
25     i28 Technologies has demonstrated expertise in...   
36     We help teachers get safe &amp; secure jobs ab...   
40     We help teachers get safe &amp; 

In [26]:
df = df.reset_index(drop=True)
data_clean = df.drop(index=to_remove)

In [27]:
data_clean.shape

(7730, 24)

In [28]:
data_clean.head()

Unnamed: 0,title,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,...,country,state,city,combined_text,industry_grouped,function_grouped,country_grouped,state_grouped,city_grouped,cleaned_text
0,Marketing Intern,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,No benefits listed,0,1,0,Other,Internship,...,US,NY,New York,"Marketing Intern We're Food52, and we've creat...",Unknown,Marketing,US,NY,New York,"Marketing Intern We're Food52, and we've creat..."
1,Customer Service - Cloud Video Production,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,...,NZ,Unknown,Auckland,Customer Service - Cloud Video Production 90 S...,Marketing and Advertising,Customer Service,NZ,Unknown,Auckland,Customer Service - Cloud Video Production 90 S...
2,Commissioning Machinery Assistant (CMA),Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,No benefits listed,0,1,0,Unknown,Unknown,...,US,IA,Wever,Commissioning Machinery Assistant (CMA) Valor ...,Unknown,Unknown,US,IA,Other,Commissioning Machinery Assistant (CMA) Valor ...
3,Account Executive - Washington DC,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,...,US,DC,Washington,Account Executive - Washington DC Our passion ...,Computer Software,Sales,US,DC,Washington,Account Executive - Washington DC Our passion ...
4,Bill Review Manager,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,...,US,FL,Fort Worth,Bill Review Manager SpotSource Solutions LLC i...,Hospital & Health Care,Health Care Provider,US,FL,Fort Worth,Bill Review Manager SpotSource Solutions LLC i...


In [29]:
df_label_1 = raw_data[raw_data['fraudulent']==1]

In [30]:
df_dedup = pd.concat([data_clean, df_label_1], axis=0)

In [31]:
df_dedup.shape

(8585, 24)

In [32]:
df_dedup.to_csv('../../data/processed/undersampling_no_similar_data.csv', index=False)