In [1]:
import time

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Read Data

In [2]:
# Load the pre-cleaned CSV; the preview below shows a 'title' column, a
# 'category' column and one numeric column per vocabulary term.
csv_path = 'data/2023_0531-0315_1000_False_detikcom_clean.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,title,category,aa,aaji,aal,aare,aba,abad,abah,abai,...,zohri,zombie,zona,zonasi,zone,zonk,zoo,zs,zulhas,zuppa
0,"Pasar Sawit Dihambat, RI & Malaysia Langsung S...",finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Nggak Nyangka! Negara Tetangga Ini Pesaing RI ...,finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Cuan Ratusan Juta Rupiah dari Bisnis Makanan Beku,finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Genjot Kendaraan Listrik, RI Jaring Produsen M...",finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Insentif buat Bus Listrik Ada, Cek di Sini Pen...",finance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,"Siap-siap UTBK 2023, Intip Tata Tertib UTBK & ...",education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6996,Beasiswa ke Jepang MEXT Scholarship 2024 Jenja...,education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6997,Kurang Waktu Bermain Mandiri Tingkatkan Ganggu...,education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,IPB Masih Buka Program S1 Beasiswa Utusan Daer...,education,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Split Data

In [3]:
# Target: the 'category' label.
# Features: every remaining column except the label itself and the raw
# headline text ('title').
y = df['category']
X = df.drop(columns=['category', 'title'])

In [4]:
# Three hold-out splits of the same X / y (train:test = 90:10, 80:20, 70:30),
# each created with random_state=42.
(
    (X_train_1, X_test_1, y_train_1, y_test_1),  # 90:10
    (X_train_2, X_test_2, y_train_2, y_test_2),  # 80:20
    (X_train_3, X_test_3, y_train_3, y_test_3),  # 70:30
) = [
    train_test_split(X, y, test_size=test_fraction, random_state=42)
    for test_fraction in (0.1, 0.2, 0.3)
]

# ML Model

In [5]:
# Baseline classifiers compared in the cells below. The decision tree and the
# SVM share one fixed seed; GaussianNB is constructed with no arguments.
MODEL_SEED = 0
tree = DecisionTreeClassifier(random_state=MODEL_SEED)
svm = SVC(random_state=MODEL_SEED)
nb = GaussianNB()

In [6]:
# Helper that fits a model, times the fit, and predicts on the test features.
def train_test_model(model, xtrain, xtest, ytrain):
    """Fit ``model`` on the training data and predict on the test features.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Fitted in place by this call.
    xtrain, ytrain
        Training features and labels, passed straight to ``model.fit``.
    xtest
        Test features, passed straight to ``model.predict``.

    Returns
    -------
    tuple
        ``(y_pred, train_time)``: the value returned by ``model.predict(xtest)``
        and the duration of the ``fit`` call in seconds (prediction time is
        not included).
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() is wall-clock time and can jump (e.g. on a system
    # clock adjustment), which could yield a wrong or even negative duration.
    start = time.perf_counter()
    model.fit(xtrain, ytrain)
    train_time = time.perf_counter() - start

    # Predict on the held-out features.
    y_pred = model.predict(xtest)

    return y_pred, train_time

# Helper that scores a set of predictions.
def evaluate_model(ytest, ypred):
    """Return the accuracy of ``ypred`` against the true labels ``ytest``.

    Parameters
    ----------
    ytest
        Ground-truth labels.
    ypred
        Predicted labels, aligned element-wise with ``ytest``.

    Returns
    -------
    float
        Fraction of predictions that match the true label.
    """
    # Only the accuracy is needed, so ask for it directly instead of building
    # the full per-class classification report and picking one entry out of it.
    return float(accuracy_score(ytest, ypred))

# Helper that trains, tests and evaluates several models and tabulates the results.
def model_report(models, xtrain, xtest, ytrain, ytest, names=None):
    """Train and evaluate each model on one split and tabulate the results.

    Parameters
    ----------
    models : iterable
        Estimators to compare. Each one is fitted in place via
        ``train_test_model``.
    xtrain, xtest, ytrain, ytest
        The train/test features and labels of a single split.
    names : iterable of str, optional
        Column label for each model, in the same order as ``models``.
        Defaults to each model's class name, which yields repeated labels
        when several instances of the same class are compared.

    Returns
    -------
    pandas.DataFrame
        One column per model and the rows ``'Accuracy'`` and
        ``'Training Time (s)'``, both rounded to 3 decimals. The column axis
        is named ``'Metrik'``.

    Raises
    ------
    ValueError
        If ``names`` is given but its length differs from ``models``.
    """
    models = list(models)
    if names is None:
        names = [model.__class__.__name__ for model in models]
    else:
        names = list(names)
        if len(names) != len(models):
            raise ValueError(
                f"names has {len(names)} entries but {len(models)} models were given"
            )

    eval_list = []
    for model_name, model in zip(names, models):
        # Fit and predict, then score the predictions.
        y_pred, train_time = train_test_model(model, xtrain, xtest, ytrain)
        accuracy = evaluate_model(ytest, y_pred)

        eval_list.append({
            'Model': model_name,
            'Accuracy': round(accuracy, 3),
            'Training Time (s)': round(train_time, 3),
        })

    # Passing the columns explicitly keeps set_index('Model') working when no
    # models were given, so the result is an empty report rather than a KeyError.
    eval_df = pd.DataFrame(eval_list, columns=['Model', 'Accuracy', 'Training Time (s)'])
    eval_df = eval_df.set_index('Model')
    eval_df.index.name = 'Metrik'

    # Transpose so that models become columns and metrics become rows.
    return eval_df.T

In [7]:
# Baseline models on the 90:10 split.
accuracy_comparison_1 = model_report(
    models=[tree, svm, nb],
    xtrain=X_train_1,
    xtest=X_test_1,
    ytrain=y_train_1,
    ytest=y_test_1,
)
accuracy_comparison_1

Metrik,DecisionTreeClassifier,SVC,GaussianNB
Accuracy,0.779,0.86,0.737
Training Time (s),10.902,161.484,0.848


In [8]:
# Baseline models on the 80:20 split.
accuracy_comparison_2 = model_report(
    models=[tree, svm, nb],
    xtrain=X_train_2,
    xtest=X_test_2,
    ytrain=y_train_2,
    ytest=y_test_2,
)
accuracy_comparison_2

Metrik,DecisionTreeClassifier,SVC,GaussianNB
Accuracy,0.739,0.844,0.736
Training Time (s),8.98,127.778,0.752


In [9]:
# Baseline models on the 70:30 split.
accuracy_comparison_3 = model_report(
    models=[tree, svm, nb],
    xtrain=X_train_3,
    xtest=X_test_3,
    ytrain=y_train_3,
    ytest=y_test_3,
)
accuracy_comparison_3

Metrik,DecisionTreeClassifier,SVC,GaussianNB
Accuracy,0.744,0.843,0.742
Training Time (s),6.977,99.322,0.661


## SVM Model

In [10]:
# One SVC per kernel under comparison, all with the same fixed seed.
svm_linear, svm_poly, svm_rbf = (
    SVC(random_state=0, kernel=kernel_name)
    for kernel_name in ('linear', 'poly', 'rbf')
)

In [11]:
# SVM kernels on the 90:10 split.
svm_comparison_1 = model_report(
    models=[svm_linear, svm_poly, svm_rbf],
    xtrain=X_train_1,
    xtest=X_test_1,
    ytrain=y_train_1,
    ytest=y_test_1,
)
svm_comparison_1

Metrik,SVC,SVC.1,SVC.2
Accuracy,0.826,0.687,0.86
Training Time (s),89.461,161.487,153.827


In [12]:
# model_report labels columns by class name, so all three read "SVC";
# relabel them by kernel, in the order the models were passed.
kernel_labels = ['SVM Linear', 'SVM Polynomial', 'SVM RBF']
svm_comparison_1.columns = kernel_labels
svm_comparison_1

Unnamed: 0,SVM Linear,SVM Polynomial,SVM RBF
Accuracy,0.826,0.687,0.86
Training Time (s),89.461,161.487,153.827


In [13]:
# SVM kernels on the 80:20 split.
svm_comparison_2 = model_report(
    models=[svm_linear, svm_poly, svm_rbf],
    xtrain=X_train_2,
    xtest=X_test_2,
    ytrain=y_train_2,
    ytest=y_test_2,
)
svm_comparison_2

Metrik,SVC,SVC.1,SVC.2
Accuracy,0.821,0.628,0.844
Training Time (s),79.506,128.976,127.156


In [14]:
# model_report labels columns by class name, so all three read "SVC";
# relabel them by kernel, in the order the models were passed.
kernel_labels = ['SVM Linear', 'SVM Polynomial', 'SVM RBF']
svm_comparison_2.columns = kernel_labels
svm_comparison_2

Unnamed: 0,SVM Linear,SVM Polynomial,SVM RBF
Accuracy,0.821,0.628,0.844
Training Time (s),79.506,128.976,127.156


In [15]:
# SVM kernels on the 70:30 split.
svm_comparison_3 = model_report(
    models=[svm_linear, svm_poly, svm_rbf],
    xtrain=X_train_3,
    xtest=X_test_3,
    ytrain=y_train_3,
    ytest=y_test_3,
)
svm_comparison_3

Metrik,SVC,SVC.1,SVC.2
Accuracy,0.825,0.599,0.843
Training Time (s),71.564,99.362,95.035


In [16]:
# model_report labels columns by class name, so all three read "SVC";
# relabel them by kernel, in the order the models were passed.
kernel_labels = ['SVM Linear', 'SVM Polynomial', 'SVM RBF']
svm_comparison_3.columns = kernel_labels
svm_comparison_3

Unnamed: 0,SVM Linear,SVM Polynomial,SVM RBF
Accuracy,0.825,0.599,0.843
Training Time (s),71.564,99.362,95.035
