In [1]:
import pandas as pd
import re

import nltk
from nltk.corpus import stopwords

from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.svm import SVC

from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix

import joblib

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
df = pd.read_csv('data/train.csv')

In [3]:
df.drop(['salary_range','job_id'], axis = 1, inplace = True)
df.fillna(" ",inplace = True)
df['text'] = df['title'] +  ' ' + df['department'] + ' ' + df['company_profile'] + ' ' + df['description'] + ' ' + df['requirements'] + ' ' + df['benefits'] + ' ' + df['employment_type'] + ' ' + df['required_education'] + ' ' + df['industry'] + ' ' + df['function'] 
df.drop(['title','department','company_profile','description','requirements','benefits','employment_type','required_experience','required_education','industry','function'], axis = 1, inplace = True)

In [4]:
def preprocess(text):
    # Remove URL
    text = re.sub(r'https?://\S+|www\.\S+', '', str(text))
    # Remove emoji
    text = re.sub("["
                  u"\U0001F600-\U0001F64F"  # emoticons
                  u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                  u"\U0001F680-\U0001F6FF"  # transport & map symbols
                  u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                  u"\U00002702-\U000027B0"
                  u"\U000024C2-\U0001F251"
                  "]+", '', str(text), flags=re.UNICODE)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', str(text))
    # Case folding
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, pos='v') for word in filtered_words]
    lemmatized_text = ' '.join(lemmatized_words)
    return lemmatized_text

In [5]:
#Preprocess
df['text'] = df['text'].apply(preprocess)

In [6]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df['text'])

In [24]:
joblib.dump(tfidf.vocabulary_, 'model/vocabulary.joblib')

['model/vocabulary.joblib']

In [9]:
tfidf_vect_df = pd.DataFrame(tfidf_matrix.toarray(), columns = tfidf.get_feature_names_out())

df_tfidf = pd.concat([df, tfidf_vect_df], axis = 1)
df_tfidf = df_tfidf.drop(['text'], axis = 1)

In [10]:
target = df_tfidf['frauds']

features = df_tfidf.drop(columns=['frauds'])

In [11]:
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.30, random_state=42)

In [12]:
from imblearn.over_sampling import BorderlineSMOTE
bsm = BorderlineSMOTE()

X_train_res, y_train_res = bsm.fit_resample(X_train, y_train)

In [13]:
svc_c01 = SVC(kernel='linear', C=0.1)

svc_c01.fit(X_train, y_train)

In [14]:
svc_predc01 = svc_c01.predict(X_val)

confusion_matrixc01 = confusion_matrix(y_val, svc_predc01)

print(confusion_matrixc01)
print(classification_report(y_val, svc_predc01))

[[742   0]
 [ 84  14]]
              precision    recall  f1-score   support

           0       0.90      1.00      0.95       742
           1       1.00      0.14      0.25        98

    accuracy                           0.90       840
   macro avg       0.95      0.57      0.60       840
weighted avg       0.91      0.90      0.87       840



In [None]:
# joblib.dump(svc_c01, 'model/svc_c01.joblib')

In [15]:
svc_c01bs = SVC(kernel='linear', C=0.1)

svc_c01bs.fit(X_train_res, y_train_res)

In [17]:
svc_predc01bs = svc_c01bs.predict(X_val)


confusion_matrixc01bs = confusion_matrix(y_val, svc_predc01bs)

print(confusion_matrixc01bs)
print(classification_report(y_val, svc_predc01bs))

[[625 117]
 [ 19  79]]
              precision    recall  f1-score   support

           0       0.97      0.84      0.90       742
           1       0.40      0.81      0.54        98

    accuracy                           0.84       840
   macro avg       0.69      0.82      0.72       840
weighted avg       0.90      0.84      0.86       840



In [None]:
# joblib.dump(svc_c01bs, 'model/svc_c01bs.joblib')

In [16]:
svc_c1 = SVC(kernel='linear', C=1)

svc_c1.fit(X_train, y_train)

In [18]:
svc_predc1 = svc_c1.predict(X_val)

confusion_matrixc1 = confusion_matrix(y_val, svc_predc1)

print(confusion_matrixc1)
print(classification_report(y_val, svc_predc1))

[[741   1]
 [ 29  69]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       742
           1       0.99      0.70      0.82        98

    accuracy                           0.96       840
   macro avg       0.97      0.85      0.90       840
weighted avg       0.97      0.96      0.96       840



In [None]:
# joblib.dump(svc_c1, 'model/svc_c1.joblib')

In [19]:
svc_c1bs = SVC(kernel='linear', C=1)

svc_c1bs.fit(X_train_res, y_train_res)

In [20]:
svc_predc1bs = svc_c1bs.predict(X_val)

confusion_matrixc1bs = confusion_matrix(y_val, svc_predc1bs)

print(confusion_matrixc1bs)
print(classification_report(y_val, svc_predc1bs))

[[729  13]
 [ 18  80]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       742
           1       0.86      0.82      0.84        98

    accuracy                           0.96       840
   macro avg       0.92      0.90      0.91       840
weighted avg       0.96      0.96      0.96       840



In [None]:
# joblib.dump(svc_c1bs, 'model/svc_c1bs.joblib')

In [21]:
df_val = pd.read_csv('data/val.csv')

In [22]:
# Menampilkan hasil klasifikasi
result_df = pd.DataFrame({
    'telecommuting' : df_val['telecommuting'],
    'has_company_logo' : df_val['has_company_logo'],
    'has_questions' : df_val['has_questions'],
    'text' : df_val['text'],
    'Label Asli': df_val['frauds'],
    'Prediksi Model': svc_predc1bs 
})

result_df['Label Asli'] = result_df['Label Asli'].map({0: 'asli', 1: 'palsu'})
result_df['Prediksi Model'] = result_df['Prediksi Model'].map({0: 'asli', 1: 'palsu'})

result_df['Prediksi Benar'] = (result_df['Label Asli'] == result_df['Prediksi Model'])

print(result_df)

     telecommuting  has_company_logo  has_questions   
0                0                 1              1  \
1                0                 1              1   
2                0                 1              1   
3                0                 0              0   
4                0                 1              1   
..             ...               ...            ...   
835              0                 1              1   
836              0                 1              1   
837              0                 1              0   
838              0                 1              1   
839              0                 1              1   

                                                  text Label Asli   
0    clinical nurse manager human capital usually b...       asli  \
1    branch associate outstanding member service st...       asli   
2    sales associate green street advisors industry...       asli   
3    linux engineer aws look candidates would help ...       as

In [38]:
df_test = pd.read_csv('data/test.csv')

In [39]:
df_test.drop(['salary_range','job_id'], axis = 1, inplace = True)
df_test.fillna(" ",inplace = True)
df_test['text'] = df_test['title'] +  ' ' + df_test['department'] + ' ' + df_test['company_profile'] + ' ' + df_test['description'] + ' ' + df_test['requirements'] + ' ' + df_test['benefits'] + ' ' + df_test['employment_type'] + ' ' + df_test['required_education'] + ' ' + df_test['industry'] + ' ' + df_test['function'] 
df_test.drop(['title','department','company_profile','description','requirements','benefits','employment_type','required_experience','required_education','industry','function'], axis = 1, inplace = True)

In [40]:
#Preprocess
df_test['text'] = df_test['text'].apply(preprocess)

In [41]:
vocabulary = joblib.load('model/vocabulary.joblib')

loaded_vec = TfidfVectorizer(decode_error="replace", vocabulary=vocabulary)

In [42]:
tfidf_test_matrix = loaded_vec.fit_transform(df_test['text'])

In [43]:
tfidf_test_vect_df = pd.DataFrame(tfidf_test_matrix.toarray(), columns = loaded_vec.get_feature_names_out())

df_tfidf_test = pd.concat([df_test, tfidf_test_vect_df], axis = 1)
df_tfidf_test = df_tfidf_test.drop(['text'], axis = 1)

In [44]:
y = df_tfidf_test['frauds']
X = df_tfidf_test.drop(['frauds'], axis=1)

In [45]:
svc_c01_test = joblib.load("model/svc_c01.joblib")
hasil_svc_c01_test = svc_c01_test.predict(X)

cm_svc_c01_test = confusion_matrix(y, hasil_svc_c01_test)

print(cm_svc_c01_test)
print(classification_report(y, hasil_svc_c01_test))

[[1050    0]
 [ 137   13]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1050
           1       1.00      0.09      0.16       150

    accuracy                           0.89      1200
   macro avg       0.94      0.54      0.55      1200
weighted avg       0.90      0.89      0.84      1200



In [46]:
svc_c01bs_test = joblib.load("model/svc_c01bs.joblib")

hasil_svc_c01bs_test = svc_c01bs_test.predict(X)

cm_svc_c01bs_test = confusion_matrix(y, hasil_svc_c01bs_test)

print(cm_svc_c01bs_test)
print(classification_report(y, hasil_svc_c01bs_test))

[[899 151]
 [ 22 128]]
              precision    recall  f1-score   support

           0       0.98      0.86      0.91      1050
           1       0.46      0.85      0.60       150

    accuracy                           0.86      1200
   macro avg       0.72      0.85      0.75      1200
weighted avg       0.91      0.86      0.87      1200



In [47]:
svc_c1_test = joblib.load("model/svc_c1.joblib")

hasil_svc_c1_test = svc_c1_test.predict(X)

cm_svc_c1_test = confusion_matrix(y, hasil_svc_c1_test)

print(cm_svc_c01bs_test)
print(classification_report(y, hasil_svc_c1_test))

[[899 151]
 [ 22 128]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1050
           1       0.98      0.73      0.84       150

    accuracy                           0.96      1200
   macro avg       0.97      0.86      0.91      1200
weighted avg       0.96      0.96      0.96      1200



In [48]:
svc_c1bs_test = joblib.load("model/svc_c1bs.joblib")

hasil_svc_c1bs_test = svc_c1bs_test.predict(X)

cm_svc_c1bs_test = confusion_matrix(y, hasil_svc_c1bs_test)

print(cm_svc_c1bs_test)
print(classification_report(y, hasil_svc_c1bs_test))

[[1040   10]
 [  30  120]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1050
           1       0.92      0.80      0.86       150

    accuracy                           0.97      1200
   macro avg       0.95      0.90      0.92      1200
weighted avg       0.97      0.97      0.97      1200



In [50]:
# Menampilkan hasil klasifikasi
result_df_test = pd.DataFrame({
    'telecommuting' : df_test['telecommuting'],
    'has_company_logo' : df_test['has_company_logo'],
    'has_questions' : df_test['has_questions'],
    'text' : df_test['text'],
    'Label Asli': df_test['frauds'],
    'Prediksi Model': svc_c1bs_test 
})

result_df_test['Label Asli'] = result_df_test['Label Asli'].map({0: 'asli', 1: 'palsu'})
result_df_test['Prediksi Model'] = result_df_test['Prediksi Model'].map({0: 'asli', 1: 'palsu'})

result_df_test['Prediksi Benar'] = (result_df_test['Label Asli'] == result_df_test['Prediksi Model'])

print(result_df)

     telecommuting  has_company_logo  has_questions   
0                0                 1              1  \
1                0                 1              1   
2                0                 1              1   
3                0                 0              0   
4                0                 1              1   
..             ...               ...            ...   
835              0                 1              1   
836              0                 1              1   
837              0                 1              0   
838              0                 1              1   
839              0                 1              1   

                                                  text Label Asli   
0    clinical nurse manager human capital usually b...       asli  \
1    branch associate outstanding member service st...       asli   
2    sales associate green street advisors industry...       asli   
3    linux engineer aws look candidates would help ...       as