In [1]:
import json
import pandas as pd
import random
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV, StratifiedKFold


In [2]:
# Load the training CSV into `df` and preview its first rows.
df = pd.read_csv('Data_Train.csv')
df.head()

Unnamed: 0,full_text,username,MBTI,IE,NS,TF,JP
0,masih about my wedding makeup kali ini makeup ...,DylanSahara,ENFJ,E,N,F,J
1,terima kasih kepada bapak prabowo subianto yan...,DylanSahara,ENFJ,E,N,F,J
2,halo calon suami hahahaha,DylanSahara,ENFJ,E,N,F,J
3,muka dicoretcoret pakai kostum kayak orang gil...,DylanSahara,ENFJ,E,N,F,J
4,saat muka masih cakep belum dicemongcemongin,DylanSahara,ENFJ,E,N,F,J


In [None]:
# (rows, columns) of the training frame.
df.shape

(9350, 7)

In [4]:
# Number of rows contributed by each username in the training frame.
df.username.value_counts()

Unnamed: 0_level_0,count
username,Unnamed: 1_level_1
DylanSahara,50
chelseaolivia92,50
EmilDardak,50
cuttaryofficial,50
elviraelph,50
...,...
hanggini,50
dionwiyoko,50
dimsanggara,50
irisLoco,50


In [5]:
# Number of distinct usernames in the training frame.
df.username.nunique()

187

In [6]:
# Encode every MBTI axis column as a 0/1 target column named '<axis>_bin'.
# The letter listed for an axis maps to 1; any other value maps to 0.
# A vectorised comparison replaces the per-row `apply(lambda ...)` calls.
for axis, positive_letter in {'IE': 'I', 'NS': 'N', 'TF': 'T', 'JP': 'J'}.items():
    df[f'{axis}_bin'] = df[axis].eq(positive_letter).astype('int64')

In [7]:
# Preview the training frame again, now including the *_bin target columns.
df.head()

Unnamed: 0,full_text,username,MBTI,IE,NS,TF,JP,IE_bin,NS_bin,TF_bin,JP_bin
0,masih about my wedding makeup kali ini makeup ...,DylanSahara,ENFJ,E,N,F,J,0,1,0,1
1,terima kasih kepada bapak prabowo subianto yan...,DylanSahara,ENFJ,E,N,F,J,0,1,0,1
2,halo calon suami hahahaha,DylanSahara,ENFJ,E,N,F,J,0,1,0,1
3,muka dicoretcoret pakai kostum kayak orang gil...,DylanSahara,ENFJ,E,N,F,J,0,1,0,1
4,saat muka masih cakep belum dicemongcemongin,DylanSahara,ENFJ,E,N,F,J,0,1,0,1


In [8]:
# Load the test CSV into `df_test`.
df_test = pd.read_csv('Data_Test.csv')

In [9]:
# Preview the first rows of the test frame.
df_test.head()

Unnamed: 0,full_text,username,MBTI,IE,NS,TF,JP
0,lah bagaimana,bismakarisma,ISFJ,I,S,F,J
1,uji coba yok,bismakarisma,ISFJ,I,S,F,J
2,ngeri antusiasku bisa sampai belum tidur gini ...,bismakarisma,ISFJ,I,S,F,J
3,aduh ini badut romantis yang selalu romantis l...,bismakarisma,ISFJ,I,S,F,J
4,lop yu dari sagi,bismakarisma,ISFJ,I,S,F,J


In [10]:
# (rows, columns) of the test frame.
df_test.shape

(2300, 7)

In [11]:
# Number of rows contributed by each username in the test frame.
df_test.username.value_counts()

Unnamed: 0_level_0,count
username,Unnamed: 1_level_1
bismakarisma,50
AndovidaLopez,50
kikysaputrii,50
nessiejudge,50
ekagustiwana,50
CoachJustinL,50
ernestprakasa,50
Arie_Kriting,50
suryainsomnia,50
ArnoldPoernomo,50


In [12]:
# Number of distinct usernames in the test frame.
df_test.username.nunique()

46

In [13]:
# Encode every MBTI axis column of the test frame as a 0/1 target column
# named '<axis>_bin'. The letter listed for an axis maps to 1; any other
# value maps to 0. A vectorised comparison replaces the per-row
# `apply(lambda ...)` calls.
for axis, positive_letter in {'IE': 'I', 'NS': 'N', 'TF': 'T', 'JP': 'J'}.items():
    df_test[f'{axis}_bin'] = df_test[axis].eq(positive_letter).astype('int64')

In [14]:
# Preview the test frame again, now including the *_bin target columns.
df_test.head()

Unnamed: 0,full_text,username,MBTI,IE,NS,TF,JP,IE_bin,NS_bin,TF_bin,JP_bin
0,lah bagaimana,bismakarisma,ISFJ,I,S,F,J,1,0,0,1
1,uji coba yok,bismakarisma,ISFJ,I,S,F,J,1,0,0,1
2,ngeri antusiasku bisa sampai belum tidur gini ...,bismakarisma,ISFJ,I,S,F,J,1,0,0,1
3,aduh ini badut romantis yang selalu romantis l...,bismakarisma,ISFJ,I,S,F,J,1,0,0,1
4,lop yu dari sagi,bismakarisma,ISFJ,I,S,F,J,1,0,0,1


In [17]:
# Build the TF-IDF features for both splits in one cell so the notebook runs
# top to bottom: the vectorizer must exist before the test texts can be
# transformed with it.

# Fit the vectorizer on the training texts only, keeping at most 1000 terms.
texts = df['full_text']
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(texts)

# Training targets: one binary column per MBTI axis.
y = df[['IE_bin', 'NS_bin', 'TF_bin', 'JP_bin']]

# Test texts and targets; the texts are projected onto the vocabulary learned
# from the training texts (transform only, no refit).
X_test, y_test = df_test["full_text"], df_test[["IE_bin", "NS_bin", "TF_bin", "JP_bin"]]

X_test_vec = vectorizer.transform(X_test)


In [18]:
# Stratified 5-fold cross-validation, shuffled with a fixed seed so the splits
# are the same on every run.
# NOTE(review): X was produced by a TF-IDF vectorizer fitted on the whole
# training frame, so each validation fold has already influenced the
# vocabulary/IDF weights used to train on the other folds.
# NOTE(review): the folds are drawn per row without a `groups` argument, and
# each username contributes many rows with identical labels, so rows of the
# same user can land in both the training and validation part of a fold.
# Fold scores are therefore likely optimistic - confirm whether a
# group-aware splitter keyed on `username` is wanted.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# Best-scoring fold model for every label.
best_models = {}
label_list = ['IE_bin', 'NS_bin', 'TF_bin', 'JP_bin']
# One row per (label, class) describing the best fold model of that label.
all_best_model_reports = []

# Metrics recorded for a class that does not appear in a report.
missing_class_metrics = {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0}

for label in label_list:
    print(f"\nTraining SVM untuk label {label} dengan K-Fold...")
    # Start below any reachable F1 so the first fold is always accepted and
    # best_model / best_report can never stay None.
    best_score = float('-inf')
    best_model = None
    best_report = None

    for fold_num, (train_index, val_index) in enumerate(kf.split(X, y[label]), start=1):
        X_train_fold, X_val_fold = X[train_index], X[val_index]
        y_train_fold = y[label].iloc[train_index]
        y_val_fold = y[label].iloc[val_index]

        model = SVC(C=10, gamma='scale', kernel='rbf')
        model.fit(X_train_fold, y_train_fold)

        y_pred = model.predict(X_val_fold)
        report = classification_report(y_val_fold, y_pred, output_dict=True)
        f1 = report['weighted avg']['f1-score']
        print(f"Fold {fold_num} - F1 Score: {f1:.4f}")

        # Keep the model of the fold with the highest weighted F1.
        if f1 > best_score:
            best_score = f1
            best_model = model
            best_report = report

    # Keep the best fold model in memory and persist it to disk.
    best_models[label] = best_model
    joblib.dump(best_model, f'{label}_svm_best_kfold.pkl')

    # Record per-class metrics of the best fold; classification_report keys
    # its per-class entries by the string form of the class label.
    for kelas in ['0', '1']:
        class_metrics = best_report.get(kelas, missing_class_metrics)

        all_best_model_reports.append({
            'Label': label,
            'Kelas': kelas,
            'Precision': round(class_metrics['precision'], 3),
            'Recall': round(class_metrics['recall'], 3),
            'F1_Score': round(class_metrics['f1-score'], 3),
            'Accuracy': round(best_report['accuracy'], 3)
        })



Training SVM untuk label IE_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6238
Fold 2 - F1 Score: 0.6121
Fold 3 - F1 Score: 0.6188
Fold 4 - F1 Score: 0.6418
Fold 5 - F1 Score: 0.6393

Training SVM untuk label NS_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6275
Fold 2 - F1 Score: 0.6065
Fold 3 - F1 Score: 0.5900
Fold 4 - F1 Score: 0.5961
Fold 5 - F1 Score: 0.6041

Training SVM untuk label TF_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6771
Fold 2 - F1 Score: 0.6694
Fold 3 - F1 Score: 0.6657
Fold 4 - F1 Score: 0.6587
Fold 5 - F1 Score: 0.6702

Training SVM untuk label JP_bin dengan K-Fold...
Fold 1 - F1 Score: 0.6148
Fold 2 - F1 Score: 0.6207
Fold 3 - F1 Score: 0.6136
Fold 4 - F1 Score: 0.6161
Fold 5 - F1 Score: 0.6157


In [19]:
# Collect the per-class rows of the best fold models into a frame and export
# it to an Excel file (without the index column).
df_all_class_reports_kfold = pd.DataFrame(all_best_model_reports)
df_all_class_reports_kfold.to_excel('all_class_reports_kfold.xlsx', index=False)

In [20]:
# Target columns, one per MBTI axis.
labels = ['IE_bin', 'NS_bin', 'TF_bin', 'JP_bin']

# Reload the saved per-label models from their pickle files, keyed by label.
svm_models = {
    label: joblib.load(f'{label}_svm_best_kfold.pkl')
    for label in labels
}


## Data Tes

In [21]:
# Classification reports on the test split, nested as
# model name -> label -> report dict.
test_reports = {
    'SVM': {
        label: classification_report(
            y_test[label],
            svm_models[label].predict(X_test_vec),
            output_dict=True,
        )
        for label in labels
    },
}

In [22]:
import pandas as pd

# Helper that reduces a classification report to its headline metrics.
def extract_metrics(report):
    """Return the weighted-average precision, recall and F1 plus the accuracy.

    `report` is a mapping with a 'weighted avg' entry (itself a mapping with
    'precision', 'recall' and 'f1-score') and an 'accuracy' entry. The result
    is a new dict with the keys 'precision', 'recall', 'f1-score' and
    'accuracy', in that order.
    """
    weighted = report['weighted avg']
    metrics = {key: weighted[key] for key in ('precision', 'recall', 'f1-score')}
    metrics['accuracy'] = report['accuracy']
    return metrics

# Flatten the nested {model: {label: report}} mapping into one record per
# (model, label) pair, each holding the headline metrics.
comparison_list = [
    {**extract_metrics(report), 'Model': model_name, 'Label': label}
    for model_name, model_reports in test_reports.items()
    for label, report in model_reports.items()
]

# One row per record.
comparison_df = pd.DataFrame(comparison_list)

# Pivot to one row per label with a (metric, model) column for every pair.
metric_columns = ['precision', 'recall', 'f1-score', 'accuracy']
comparison_table = comparison_df.pivot(index='Label', columns='Model', values=metric_columns)

# Collapse the two-level column index into '<metric>_<model>' names.
comparison_table.columns = ['_'.join(col).strip() for col in comparison_table.columns.values]

# Bring 'Label' back as a regular column and round the metrics to 3 decimals.
comparison_table = comparison_table.reset_index().round(3)

# Title printed above the table.
print("\n=== Akurasi Data Test ===\n")

# CSS rules for header cells, data cells and the table element.
cell_border = ('border', '1px solid black')
cell_padding = ('padding', '5px')
table_styles = [
    {'selector': 'th', 'props': [cell_border, cell_padding, ('font-size', '12pt')]},
    {'selector': 'td', 'props': [cell_border, cell_padding, ('font-size', '10pt')]},
    {'selector': 'table', 'props': [('border-collapse', 'collapse')]},
]

# Apply the grid lines, then colour the cells with a gradient.
styled_table = (
    comparison_table.style
    .set_table_styles(table_styles)
    .set_properties(**{'border': '1px solid black', 'padding': '5px'})
    .background_gradient(cmap='YlGnBu')
)

# Last expression of the cell: rendered as an HTML table by the notebook.
styled_table



=== Akurasi Data Test ===



Unnamed: 0,Label,precision_SVM,recall_SVM,f1-score_SVM,accuracy_SVM
0,IE_bin,0.561,0.604,0.574,0.604
1,JP_bin,0.581,0.573,0.564,0.573
2,NS_bin,0.536,0.541,0.538,0.541
3,TF_bin,0.546,0.573,0.548,0.573


In [23]:
# Per-label, per-class (0 and 1) evaluation rows for the test split.
all_class_reports = []

# Metrics recorded for a class that does not appear in a report.
missing_class_metrics = {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0}

for label in labels:
    # Reuse the test-split report already stored in `test_reports` rather
    # than running the SVM over the whole test set a second time.
    report = test_reports['SVM'][label]

    # classification_report keys its per-class entries by the string form of
    # the class label.
    for kelas in ['0', '1']:
        class_metrics = report.get(kelas, missing_class_metrics)

        all_class_reports.append({
            'Label': label,
            'Kelas': kelas,
            'Precision': round(class_metrics['precision'], 3),
            'Recall': round(class_metrics['recall'], 3),
            'F1_Score': round(class_metrics['f1-score'], 3),
            'Accuracy': round(report['accuracy'], 3)
        })

# One row per (label, class) pair.
df_all_class_reports_test = pd.DataFrame(all_class_reports)

# Print the table as plain text.
print("=== Laporan Evaluasi Tiap Label dan Kelas ===")
print(df_all_class_reports_test)

=== Laporan Evaluasi Tiap Label dan Kelas ===
    Label Kelas  Precision  Recall  F1_Score  Accuracy
0  IE_bin     0      0.674   0.797     0.731     0.604
1  IE_bin     1      0.328   0.204     0.251     0.604
2  NS_bin     0      0.440   0.408     0.424     0.541
3  NS_bin     1      0.604   0.635     0.619     0.541
4  TF_bin     0      0.621   0.765     0.686     0.573
5  TF_bin     1      0.429   0.274     0.335     0.573
6  JP_bin     0      0.557   0.723     0.629     0.573
7  JP_bin     1      0.605   0.424     0.499     0.573


In [24]:
# Export the per-class test report to an Excel file (without the index column).
df_all_class_reports_test.to_excel('report_before_augment.xlsx',index=False)