In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# ====== 0) Configuration ======
# Upper bound on the vocabulary size kept by the TF-IDF vectorizer.
MAX_TFIDF_FEATURES = 1_000
# Requested k for Chi-Square selection. The training cell caps it at the real
# feature count, so when k equals MAX_TFIDF_FEATURES every feature is kept.
JUMLAH_FITUR = 1_000

# ====== 1) Fungsi bantu ======
def clean_text(text):
    """Normalise raw text to lowercase a-z words separated by single spaces.

    Args:
        text: Raw text. Non-string input is tolerated so that a DataFrame
            column with empty cells can be cleaned without crashing:
            ``None`` and float NaN become an empty string, and any other
            non-string value is converted with ``str()`` first.

    Returns:
        str: The lowercased text with every character outside ``a-z`` and
        whitespace replaced by a space, whitespace runs collapsed to one
        space, and leading/trailing whitespace removed.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids needing any import beyond ``re``.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = text.lower()
    # Replace (rather than delete) unwanted characters so that words joined
    # by punctuation or digits stay separate tokens.
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def get_text_col(df):
    """Return the name of the text column found in ``df.columns``.

    'full_text' takes priority over 'text'. Raises KeyError when neither
    name is present.
    """
    for candidate in ('full_text', 'text'):
        if candidate in df.columns:
            return candidate
    raise KeyError("Tidak ditemukan kolom teks. Harap sediakan 'full_text' atau 'text'.")

In [3]:
# ====== 2) Load & preprocess TRAIN ======
df = pd.read_csv("Data_Train.csv")
text_col = get_text_col(df)  # 'full_text' when present, otherwise 'text'
# Store the normalised text next to the raw column, one cleaned string per row.
df['clean_text'] = df[text_col].map(clean_text)


In [6]:
# Dimension labels: stop early if any of the four label columns is missing.
for col in ('IE', 'NS', 'TF', 'JP'):
    if col in df.columns:
        continue
    raise KeyError(f"Kolom label '{col}' tidak ditemukan di Data_Train.csv")

# One label array per dimension, in the fixed order IE, NS, TF, JP.
y_IE, y_NS, y_TF, y_JP = (df[name].values for name in ('IE', 'NS', 'TF', 'JP'))

# 3) Split row positions 80/20 into train/validation. Stratification uses IE
# only, so the class ratios of the other three dimensions are not enforced.
train_idx, val_idx = train_test_split(
    np.arange(df.shape[0]),
    stratify=y_IE,
    test_size=0.2,
    random_state=42,
)


In [7]:
# ====== 4) TF-IDF: fit on TRAIN only, then transform VAL ======
# train_idx / val_idx are row positions used as .loc labels; this relies on df
# still carrying the default 0..n-1 index it received from read_csv.
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X_train_tfidf = vectorizer.fit_transform(df['clean_text'].loc[train_idx]).toarray()
X_val_tfidf = vectorizer.transform(df['clean_text'].loc[val_idx]).toarray()

# ====== 5) Binarise with the per-feature MEDIAN of TRAIN as threshold ======
# A feature becomes 1 only when its TF-IDF weight is strictly above the median.
threshold = np.median(X_train_tfidf, axis=0)
X_train = np.greater(X_train_tfidf, threshold).astype(int)
X_val = np.greater(X_val_tfidf, threshold).astype(int)

# ====== 6) Train/validation labels for each dimension ======
y_train_IE, y_train_NS, y_train_TF, y_train_JP = (
    y[train_idx] for y in (y_IE, y_NS, y_TF, y_JP)
)
y_val_IE, y_val_NS, y_val_TF, y_val_JP = (
    y[val_idx] for y in (y_IE, y_NS, y_TF, y_JP)
)

# dimension name -> (train labels, validation labels)
labels = {
    dim: (y_tr_dim, y_va_dim)
    for dim, y_tr_dim, y_va_dim in [
        ('IE', y_train_IE, y_val_IE),
        ('NS', y_train_NS, y_val_NS),
        ('TF', y_train_TF, y_val_TF),
        ('JP', y_train_JP, y_val_JP),
    ]
}

print(f"Jumlah data training   : {X_train.shape[0]}")
print(f"Jumlah data validasi   : {X_val.shape[0]}")


Jumlah data training   : 7480
Jumlah data validasi   : 1870


In [9]:
# 7) Load the TEST data and clean its text the same way as TRAIN
df_test = pd.read_csv("Data_Test.csv")
text_col_test = get_text_col(df_test)
df_test['clean_text'] = df_test[text_col_test].map(clean_text)

# The test file must provide the same four label columns.
for col in ('IE', 'NS', 'TF', 'JP'):
    if col in df_test.columns:
        continue
    raise KeyError(f"Kolom label '{col}' tidak ditemukan di Data_Test.csv")

# Reuse the vectorizer fitted on TRAIN and the TRAIN medians; nothing is
# re-fitted on the test set.
X_test_tfidf = vectorizer.transform(df_test['clean_text']).toarray()
X_test = np.greater(X_test_tfidf, threshold).astype(int)

# dimension name -> test label array
labels_test = {dim: df_test[dim].values for dim in ('IE', 'NS', 'TF', 'JP')}


In [10]:
# --- 1. Training per dimension ---
trained_models = {}
trained_selectors = {}

# Cap k at the number of available features. When JUMLAH_FITUR is at least the
# feature count, k_eff equals that count and SelectKBest keeps every column.
n_features = X_train.shape[-1]
k_eff = JUMLAH_FITUR if JUMLAH_FITUR < n_features else n_features
print(f"Jumlah fitur TF-IDF aktual: {n_features} | k efektif Chi-Square: {k_eff}")

for dim, (y_tr, y_va) in labels.items():
    print(f"\n=== Training Dimensi {dim} ===")

    # Chi-Square feature selection, fitted on the training split only
    selector = SelectKBest(score_func=chi2, k=k_eff).fit(X_train, y_tr)
    Xtr_sel = selector.transform(X_train)
    Xva_sel = selector.transform(X_val)

    # Linear-kernel SVM on the selected features
    clf = SVC(kernel='linear', random_state=42).fit(Xtr_sel, y_tr)

    # Keep the fitted selector and model so the test cell can reuse them
    trained_selectors[dim] = selector
    trained_models[dim] = clf

    # Validation accuracy for this dimension
    y_pred_val = clf.predict(Xva_sel)
    print("Akurasi (Val):", accuracy_score(y_va, y_pred_val))



Jumlah fitur TF-IDF aktual: 1000 | k efektif Chi-Square: 1000

=== Training Dimensi IE ===
Akurasi (Val): 0.6550802139037433

=== Training Dimensi NS ===
Akurasi (Val): 0.5871657754010695

=== Training Dimensi TF ===
Akurasi (Val): 0.6898395721925134

=== Training Dimensi JP ===
Akurasi (Val): 0.6149732620320856


In [11]:
# --- 2. Testing per dimension ---
print("\n=== Pengujian dengan Data Test ===")
for dim in labels:
    # Fetch the selector/model pair fitted for this dimension during training
    selector, clf = trained_selectors[dim], trained_models[dim]
    y_test_dim = labels_test[dim]

    # Apply the same feature selection to the test matrix, then predict
    Xte_sel = selector.transform(X_test)
    y_pred_test = clf.predict(Xte_sel)

    print(f"\nDimensi {dim} - Akurasi (Test):", accuracy_score(y_test_dim, y_pred_test))
    print("Confusion Matrix (Test):\n", confusion_matrix(y_test_dim, y_pred_test))
    print("Classification Report (Test):\n", classification_report(y_test_dim, y_pred_test))



=== Pengujian dengan Data Test ===

Dimensi IE - Akurasi (Test): 0.6417391304347826
Confusion Matrix (Test):
 [[1359  191]
 [ 633  117]]
Classification Report (Test):
               precision    recall  f1-score   support

           E       0.68      0.88      0.77      1550
           I       0.38      0.16      0.22       750

    accuracy                           0.64      2300
   macro avg       0.53      0.52      0.49      2300
weighted avg       0.58      0.64      0.59      2300


Dimensi NS - Akurasi (Test): 0.5839130434782609
Confusion Matrix (Test):
 [[999 351]
 [606 344]]
Classification Report (Test):
               precision    recall  f1-score   support

           N       0.62      0.74      0.68      1350
           S       0.49      0.36      0.42       950

    accuracy                           0.58      2300
   macro avg       0.56      0.55      0.55      2300
weighted avg       0.57      0.58      0.57      2300


Dimensi TF - Akurasi (Test): 0.5921739130434782