In [1]:
import json
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold


In [2]:
df = pd.read_csv('DataLatih_NoStopWord.csv')
df.head()

Unnamed: 0,username,MBTI,IE,NS,TF,JP,full_text
0,DylanSahara,ENFJ,E,N,F,J,about my wedding makeup kali makeup resepsi by...
1,DylanSahara,ENFJ,E,N,F,J,bapak prabowo subianto berkenan hadir memberi
2,DylanSahara,ENFJ,E,N,F,J,halo calon suami hahahaha
3,DylanSahara,ENFJ,E,N,F,J,muka dicoretcoret pakai kostum kayak orang gil...
4,DylanSahara,ENFJ,E,N,F,J,muka cakep dicemongcemongin


In [3]:
df.shape

(9350, 7)

In [4]:
df.username.value_counts()

username
DylanSahara        50
RickyHarun45MF     50
NRamadhani         50
morganoey          50
Haruka_NKGW10      50
                   ..
titokarnavian_     50
budimandjatmiko    50
Freya_JKT48        50
L_MarshaJKT48      50
chelseaislan       50
Name: count, Length: 187, dtype: int64

In [5]:
df.username.nunique()

187

In [6]:
# Ubah label IE ke format biner
df['IE_bin'] = df['IE'].apply(lambda x: 1 if x == 'I' else 0)
df['NS_bin'] = df['NS'].apply(lambda x: 1 if x == 'N' else 0)
df['TF_bin'] = df['TF'].apply(lambda x: 1 if x == 'T' else 0)
df['JP_bin'] = df['JP'].apply(lambda x: 1 if x == 'J' else 0)

In [7]:
df.head()

Unnamed: 0,username,MBTI,IE,NS,TF,JP,full_text,IE_bin,NS_bin,TF_bin,JP_bin
0,DylanSahara,ENFJ,E,N,F,J,about my wedding makeup kali makeup resepsi by...,0,1,0,1
1,DylanSahara,ENFJ,E,N,F,J,bapak prabowo subianto berkenan hadir memberi,0,1,0,1
2,DylanSahara,ENFJ,E,N,F,J,halo calon suami hahahaha,0,1,0,1
3,DylanSahara,ENFJ,E,N,F,J,muka dicoretcoret pakai kostum kayak orang gil...,0,1,0,1
4,DylanSahara,ENFJ,E,N,F,J,muka cakep dicemongcemongin,0,1,0,1


In [20]:
df.isna().sum()

username      0
MBTI          0
IE            0
NS            0
TF            0
JP            0
full_text    72
IE_bin        0
NS_bin        0
TF_bin        0
JP_bin        0
dtype: int64

In [21]:
df.dropna(inplace=True)

In [22]:
# Membuat TF-IDF vectorizer
texts = df['full_text']
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(texts)
y = df[['IE_bin', 'NS_bin', 'TF_bin', 'JP_bin']] 

In [23]:
# Inisialisasi K-Fold
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_class_reports = []
# Menyimpan model terbaik dari setiap label
best_models = {}
label_list = ['IE_bin', 'NS_bin', 'TF_bin', 'JP_bin']
all_best_model_reports = []

for label in label_list:
    print(f"\nTraining SVM untuk label {label} dengan K-Fold...")
    best_score = 0
    best_model = None
    best_report = None
    fold_num = 1

    for train_index, val_index in kf.split(X, y[label]):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold = y[label].iloc[train_index]
        y_val_fold = y[label].iloc[val_index]

        model = SVC(C=10, gamma='scale', kernel='rbf')
        model.fit(X_train_fold, y_train_fold)

        y_pred = model.predict(X_val_fold)
        report = classification_report(y_val_fold, y_pred, output_dict=True)
        f1 = report['weighted avg']['f1-score']
        print(f"Fold {fold_num} - F1 Score: {f1:.4f}")
        fold_num += 1

        if f1 > best_score:
            best_score = f1
            best_model = model
            best_report = report

    # Simpan model terbaik
    best_models[label] = best_model
    joblib.dump(best_model, f'{label}_svm_best_kfold.pkl')

    # Simpan laporan performa
    for kelas in ['0', '1']:
        if kelas in best_report:
            precision = best_report[kelas]['precision']
            recall = best_report[kelas]['recall']
            f1_score_kelas = best_report[kelas]['f1-score']
        else:
            precision = recall = f1_score_kelas = 0.0

        all_best_model_reports.append({
            'Label': label,
            'Kelas': kelas,
            'Precision': round(precision, 3),
            'Recall': round(recall, 3),
            'F1_Score': round(f1_score_kelas, 3),
            'Accuracy': round(best_report['accuracy'], 3)
        })



Training SVM untuk label IE_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6475
Fold 2 - F1 Score: 0.6541
Fold 3 - F1 Score: 0.6440
Fold 4 - F1 Score: 0.6322
Fold 5 - F1 Score: 0.6324

Training SVM untuk label NS_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6316
Fold 2 - F1 Score: 0.6249
Fold 3 - F1 Score: 0.6377
Fold 4 - F1 Score: 0.6412
Fold 5 - F1 Score: 0.6308

Training SVM untuk label TF_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6601
Fold 2 - F1 Score: 0.6908
Fold 3 - F1 Score: 0.6750
Fold 4 - F1 Score: 0.6768
Fold 5 - F1 Score: 0.6725

Training SVM untuk label JP_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6501
Fold 2 - F1 Score: 0.6511
Fold 3 - F1 Score: 0.6513
Fold 4 - F1 Score: 0.6620
Fold 5 - F1 Score: 0.6486


In [24]:
df_all_class_reports_kfold = pd.DataFrame(all_best_model_reports)
df_all_class_reports_kfold.to_excel('kfold_no_stopword.xlsx', index=False)

In [25]:
# Misal list labelnya
labels = ['IE_bin', 'NS_bin', 'TF_bin', 'JP_bin']

# Load data validasi
# Pastikan X_val dan y_val sudah didefinisikan sebelumnya

# Load model yang sudah disimpan
svm_models = {}
for label in labels:
    svm_model = joblib.load(f'{label}_svm_best_kfold.pkl')
    svm_models[label] = svm_model


## Data Tes

In [27]:
df_test = pd.read_csv('DataTest_NoStopWord.csv')

In [28]:
# Ubah label IE ke format biner
df_test['IE_bin'] = df_test['IE'].apply(lambda x: 1 if x == 'I' else 0)
df_test['NS_bin'] = df_test['NS'].apply(lambda x: 1 if x == 'N' else 0)
df_test['TF_bin'] = df_test['TF'].apply(lambda x: 1 if x == 'T' else 0)
df_test['JP_bin'] = df_test['JP'].apply(lambda x: 1 if x == 'J' else 0)

In [29]:
df_test.dropna(inplace=True)

In [30]:
X_test, y_test = df_test["full_text"], df_test[["IE_bin", "NS_bin", "TF_bin", "JP_bin"]]

X_test_vec = vectorizer.transform(X_test) 

In [31]:
# Membuat dictionary untuk menyimpan laporan klasifikasi
test_reports = {
    'SVM': {},
}

for label in labels:
    # Predict dan generate report untuk SVM
    y_pred_svm = svm_models[label].predict(X_test_vec)
    report_svm = classification_report(y_test[label], y_pred_svm, output_dict=True)
    test_reports['SVM'][label] = report_svm

In [32]:
import pandas as pd

# Fungsi untuk mengekstrak metrik dari laporan klasifikasi
def extract_metrics(report):
    metrics = {
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1-score': report['weighted avg']['f1-score'],
        'accuracy': report['accuracy']
    }
    return metrics

# Menyusun hasil evaluasi dalam DataFrame
comparison_list = []

for model_name, model_reports in test_reports.items():
    for label, report in model_reports.items():
        metrics = extract_metrics(report)
        metrics.update({'Model': model_name, 'Label': label})
        comparison_list.append(metrics)

# Konversi list of dictionaries menjadi DataFrame
comparison_df = pd.DataFrame(comparison_list)

# Pivot table untuk format tampilan yang lebih rapi
comparison_table = comparison_df.pivot(index='Label', columns='Model', values=['precision', 'recall', 'f1-score', 'accuracy'])

# Flatten MultiIndex columns
comparison_table.columns = ['_'.join(col).strip() for col in comparison_table.columns.values]

# Reset index
comparison_table = comparison_table.reset_index()

# Membulatkan angka desimal
comparison_table = comparison_table.round(3)

# Menambahkan judul
print("\n=== Akurasi Data Test ===\n")

# Menggunakan pandas Styler untuk menambahkan gridlines
styled_table = comparison_table.style.set_table_styles(
    [
        {'selector': 'th', 'props': [('border', '1px solid black'), ('padding', '5px'), ('font-size', '12pt')]},
        {'selector': 'td', 'props': [('border', '1px solid black'), ('padding', '5px'), ('font-size', '10pt')]},
        {'selector': 'table', 'props': [('border-collapse', 'collapse')]},
    ]
).set_properties(**{'border': '1px solid black', 'padding': '5px'})

# Menambahkan warna latar belakang (opsional)
styled_table = styled_table.background_gradient(cmap='YlGnBu')

# Tampilkan tabel dengan styling di Jupyter Notebook atau lingkungan yang mendukung HTML
styled_table



=== Akurasi Data Test ===



Unnamed: 0,Label,precision_SVM,recall_SVM,f1-score_SVM,accuracy_SVM
0,IE_bin,0.562,0.613,0.575,0.613
1,JP_bin,0.539,0.535,0.523,0.535
2,NS_bin,0.53,0.536,0.532,0.536
3,TF_bin,0.544,0.579,0.541,0.579


In [33]:
# Menyimpan hasil evaluasi per label dan per kelas (0 & 1)
all_class_reports = []

for label in labels:
    # Prediksi pada data test
    y_pred = svm_models[label].predict(X_test_vec)

    # Ambil classification report
    report = classification_report(y_test[label], y_pred, output_dict=True)

    # Cek apakah kedua kelas ada
    for kelas in ['0', '1']:
        if kelas in report:
            precision = report[kelas]['precision']
            recall = report[kelas]['recall']
            f1_score = report[kelas]['f1-score']
        else:
            precision = recall = f1_score = 0.0  # Jika tidak ditemukan

        # Tambahkan ke list
        all_class_reports.append({
            'Label': label,
            'Kelas': kelas,
            'Precision': round(precision, 3),
            'Recall': round(recall, 3),
            'F1_Score': round(f1_score, 3),
            'Accuracy': round(report['accuracy'], 3)
        })

# Buat dataframe
df_all_class_reports_test = pd.DataFrame(all_class_reports)

# Tampilkan di terminal
print("=== Laporan Evaluasi Tiap Label dan Kelas ===")
print(df_all_class_reports_test)

=== Laporan Evaluasi Tiap Label dan Kelas ===
    Label Kelas  Precision  Recall  F1_Score  Accuracy
0  IE_bin     0      0.674   0.822     0.741     0.613
1  IE_bin     1      0.330   0.181     0.234     0.613
2  NS_bin     0      0.432   0.393     0.412     0.536
3  NS_bin     1      0.599   0.637     0.617     0.536
4  TF_bin     0      0.618   0.808     0.700     0.579
5  TF_bin     1      0.428   0.223     0.294     0.579
6  JP_bin     0      0.526   0.695     0.599     0.535
7  JP_bin     1      0.551   0.375     0.446     0.535


In [34]:
df_all_class_reports_test.to_excel('report_before_augment_noStopWord.xlsx',index=False)