In [1]:
!pip install optuna



In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.metrics import classification_report,accuracy_score
import optuna
from sklearn.model_selection import train_test_split
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
df=pd.read_csv('data_1C.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,"Aurion C3 Iron Curl Bar with 2 Locks, 3 ft (Si...",Household
1,1,"Presto 06620 11-Inch Electric Skillet Fries, g...",Household
2,2,WHOOSH! Award-Wining Screen Cleaner - Safe for...,Electronics
3,3,ManQ Men's Blended Waist Coat,Clothing & Accessories
4,4,Lace And Me Women's Blended High Waist Tummy &...,Clothing & Accessories


# A. Preprocessing

## Check missing value

In [5]:
df.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
text,0
label,0


## Drop column 'Unnamed: 0'

In [6]:
# drop column 'Unnamed: 0' karena tidak berguna
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,text,label
0,"Aurion C3 Iron Curl Bar with 2 Locks, 3 ft (Si...",Household
1,"Presto 06620 11-Inch Electric Skillet Fries, g...",Household
2,WHOOSH! Award-Wining Screen Cleaner - Safe for...,Electronics
3,ManQ Men's Blended Waist Coat,Clothing & Accessories
4,Lace And Me Women's Blended High Waist Tummy &...,Clothing & Accessories


In [7]:
df.shape

(12606, 2)

## Label encoding

In [8]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
Household,4831
Books,2961
Electronics,2670
Clothing & Accessories,2144


In [9]:
encoder = LabelEncoder()
df['label'] = encoder.fit_transform(df['label'])
df.head()

Unnamed: 0,text,label
0,"Aurion C3 Iron Curl Bar with 2 Locks, 3 ft (Si...",3
1,"Presto 06620 11-Inch Electric Skillet Fries, g...",3
2,WHOOSH! Award-Wining Screen Cleaner - Safe for...,2
3,ManQ Men's Blended Waist Coat,1
4,Lace And Me Women's Blended High Waist Tummy &...,1


## Cleansing

In [10]:
# Download NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize the custom lemmatizer, and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # Using Indonesian stopwords

# Function to cleanse text (remove URLs, special characters, etc.)
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\d+", '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    return text

# Preprocessing function for each comment
def preprocess_text(text):
    # Cleaning
    cleaned_text = clean_text(text)

    # Tokenization using PhraseTokenizer
    tokens = word_tokenize(cleaned_text)

    # Remove stopwords
    tokens_no_stopwords = [word for word in tokens if word not in stop_words]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens_no_stopwords]

    # Combine back into text
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

# Apply preprocessing to all comments in the DataFrame
df['clean_text'] = df['text'].apply(preprocess_text)
df.to_csv("preprocessed_text.csv", index=False)

# Display the preprocessing results
print(df[['clean_text']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                          clean_text
0  aurion c iron curl bar lock ft silver mm thick...
1  presto inch electric skillet fry grill stew ac...
2  whoosh awardwining screen cleaner safe screen ...
3                        manq men blended waist coat
4  lace woman blended high waist tummy thigh shap...


In [11]:
df.head()

Unnamed: 0,text,label,clean_text
0,"Aurion C3 Iron Curl Bar with 2 Locks, 3 ft (Si...",3,aurion c iron curl bar lock ft silver mm thick...
1,"Presto 06620 11-Inch Electric Skillet Fries, g...",3,presto inch electric skillet fry grill stew ac...
2,WHOOSH! Award-Wining Screen Cleaner - Safe for...,2,whoosh awardwining screen cleaner safe screen ...
3,ManQ Men's Blended Waist Coat,1,manq men blended waist coat
4,Lace And Me Women's Blended High Waist Tummy &...,1,lace woman blended high waist tummy thigh shap...


# B. Melakukan pemodelan klasifikasi dengan menggunakan 2 metode Machine Lerning yaitu SVM dan Random Forest; dan 2 metode text representation (vectorization), anda perlu melakukan tuning hyperparameter minimal 2 hyperparameter untuk masing-masing algoritma machine learning.

text representation menggunakan TF-IDF dan CountVectorizer

80% train, 10% validation,  10% test

In [12]:
test_idx = int(len(df)*0.9)
df_test = df.iloc[test_idx:]
df_train_val = df.iloc[:test_idx]

x_train, x_val,y_train, y_val= train_test_split(df_train_val['text'],df_train_val['label'], test_size=0.10)
x_test , y_test = [df_test['text'], df_test['label']]

In [13]:
def validation_report(model, x_val,y_val):
    y_pred_val = model.predict(x_val)
    print('validation classification report : ')
    print(classification_report(y_val, y_pred_val, labels = y_val.unique()))

def test_report(model, x_test, y_test):
    y_pred_test = model.predict(x_test)
    print('test classification report : ')
    print(classification_report(y_test, y_pred_test, labels = y_test.unique()))

In [14]:
cv = CountVectorizer()
tfidf = TfidfVectorizer()

#persiapan dataset
data_type_x = {'train':x_train,'val':x_val,'test':x_test}

dataset = {
    'count_vectorizer':{'vectorizer_model':cv},
    'tfidf_vectorizer':{'vectorizer_model':tfidf}
}

In [15]:
for vector in dataset :
    vector_model = dataset[vector]['vectorizer_model']
    all_data = data_type_x.copy()

    vector_model.fit(all_data['train'])

    for type in ['train','val','test']:
        data_x= all_data[type]
        vectorized_data = vector_model.transform(data_x)
        all_data[type] = vectorized_data.toarray()

    dataset[vector]['data'] = all_data

## CountVectorizer

In [16]:
x_train_data_cv = dataset['count_vectorizer']['data']['train']
x_val_data_cv = dataset['count_vectorizer']['data']['val']
x_test_data_cv = dataset['count_vectorizer']['data']['test']

### Random Forest

In [17]:
rf_cv = RandomForestClassifier(class_weight = 'balanced', random_state=42)
rf_cv.fit(x_train_data_cv,y_train)

In [18]:
validation_report(rf_cv,x_test_data_cv,y_test)

validation classification report : 
              precision    recall  f1-score   support

           3       0.89      0.96      0.92       475
           1       0.97      0.95      0.96       220
           0       0.94      0.94      0.94       301
           2       0.98      0.86      0.92       265

    accuracy                           0.93      1261
   macro avg       0.95      0.93      0.93      1261
weighted avg       0.93      0.93      0.93      1261



### Hypertune parameter random forest

In [46]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 50, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=43,
        class_weight='balanced'
    )

    rf.fit(x_train_data_cv, y_train)

    y_pred = rf.predict(x_val_data_cv)
    score = accuracy_score(y_pred, y_val)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

[I 2024-11-11 09:28:46,184] A new study created in memory with name: no-name-33a6ff9f-ba99-4172-afa7-e796d580e6ea
[I 2024-11-11 09:29:33,219] Trial 0 finished with value: 0.8986784140969163 and parameters: {'n_estimators': 160, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.8986784140969163.
[I 2024-11-11 09:30:47,734] Trial 1 finished with value: 0.8960352422907489 and parameters: {'n_estimators': 242, 'max_depth': 13, 'min_samples_split': 17, 'min_samples_leaf': 3}. Best is trial 0 with value: 0.8986784140969163.
[I 2024-11-11 09:31:13,180] Trial 2 finished with value: 0.879295154185022 and parameters: {'n_estimators': 80, 'max_depth': 18, 'min_samples_split': 7, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.8986784140969163.
[I 2024-11-11 09:31:51,836] Trial 3 finished with value: 0.8960352422907489 and parameters: {'n_estimators': 144, 'max_depth': 13, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 0 with valu

In [47]:
print('Best hyperparameters: ', study.best_params)
print('Best score: ', study.best_value)

Best hyperparameters:  {'n_estimators': 160, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 5}
Best score:  0.8986784140969163


In [48]:
rf_cv_hyper = RandomForestClassifier(class_weight = 'balanced', random_state=42, **study.best_params)
rf_cv_hyper.fit(x_train_data_cv,y_train)

### Perbandingan hasil

model sebelum tuning

In [49]:
test_report(rf_cv,x_test_data_cv,y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.89      0.96      0.92       475
           1       0.97      0.95      0.96       220
           0       0.94      0.94      0.94       301
           2       0.98      0.86      0.92       265

    accuracy                           0.93      1261
   macro avg       0.95      0.93      0.93      1261
weighted avg       0.93      0.93      0.93      1261



model setelah tuning

In [50]:
test_report(rf_cv_hyper,x_test_data_cv,y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.87      0.90      0.89       475
           1       0.87      0.98      0.92       220
           0       0.93      0.90      0.91       301
           2       0.94      0.83      0.88       265

    accuracy                           0.90      1261
   macro avg       0.90      0.90      0.90      1261
weighted avg       0.90      0.90      0.90      1261



### SVM

In [24]:
svm_cv = LinearSVC(class_weight = 'balanced', random_state = 42)
svm_cv.fit(x_train_data_cv,y_train)



In [25]:
validation_report(svm_cv, x_val_data_cv,y_val)

validation classification report : 
              precision    recall  f1-score   support

           3       0.97      0.95      0.96       409
           1       0.95      0.96      0.96       193
           2       0.92      0.92      0.92       254
           0       0.92      0.94      0.93       279

    accuracy                           0.94      1135
   macro avg       0.94      0.94      0.94      1135
weighted avg       0.94      0.94      0.94      1135



### Hypertune SVM

In [53]:
def objective(trial):
    C = trial.suggest_float('C', 0.1, 10.0, log=True)
    tol = trial.suggest_float('tol', 1e-5, 1e-1)
    max_iter = trial.suggest_int('max_iter', 100, 1000)

    svm = LinearSVC(C=C,tol=tol,max_iter=max_iter,class_weight = 'balanced', random_state = 43)

    svm.fit(x_train_data_cv, y_train)

    y_pred = svm.predict(x_val_data_cv)
    score = accuracy_score(y_pred, y_val)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

[I 2024-11-11 09:35:24,201] A new study created in memory with name: no-name-3a2c9bd9-8efc-4df9-ba84-0276aea61344
[I 2024-11-11 09:35:31,032] Trial 0 finished with value: 0.9436123348017621 and parameters: {'C': 0.7734663946607083, 'tol': 0.07991777834439422, 'max_iter': 992}. Best is trial 0 with value: 0.9436123348017621.
[I 2024-11-11 09:35:36,199] Trial 1 finished with value: 0.945374449339207 and parameters: {'C': 0.11752231254748512, 'tol': 0.002557902414020967, 'max_iter': 748}. Best is trial 1 with value: 0.945374449339207.
[I 2024-11-11 09:35:41,261] Trial 2 finished with value: 0.9436123348017621 and parameters: {'C': 0.4939655717195372, 'tol': 0.04198989141627506, 'max_iter': 339}. Best is trial 1 with value: 0.945374449339207.
[I 2024-11-11 09:35:46,079] Trial 3 finished with value: 0.9436123348017621 and parameters: {'C': 0.5223430735315719, 'tol': 0.02502179871337343, 'max_iter': 380}. Best is trial 1 with value: 0.945374449339207.
[I 2024-11-11 09:35:51,212] Trial 4 fini

In [54]:
print('Best hyperparameters: ', study.best_params)
print('Best score: ', study.best_value)

Best hyperparameters:  {'C': 0.11752231254748512, 'tol': 0.002557902414020967, 'max_iter': 748}
Best score:  0.945374449339207


In [55]:
svm_cv_hyper = LinearSVC(class_weight = 'balanced', random_state = 43 ,**study.best_params)
svm_cv_hyper.fit(x_train_data_cv,y_train)



### Perbandingan hasil

model sebelum tuning

In [56]:
test_report(svm_cv,x_test_data_cv, y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.95      0.95      0.95       475
           1       0.96      0.97      0.97       220
           0       0.93      0.97      0.95       301
           2       0.96      0.91      0.94       265

    accuracy                           0.95      1261
   macro avg       0.95      0.95      0.95      1261
weighted avg       0.95      0.95      0.95      1261



model setelah tuning

In [57]:
test_report(svm_cv_hyper,x_test_data_cv, y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.95      0.94      0.95       475
           1       0.96      0.97      0.97       220
           0       0.92      0.97      0.95       301
           2       0.96      0.92      0.94       265

    accuracy                           0.95      1261
   macro avg       0.95      0.95      0.95      1261
weighted avg       0.95      0.95      0.95      1261



## TF-IDF

### Random forest

In [None]:
x_train_data_tf = dataset['tfidf_vectorizer']['data']['train']
x_val_data_tf = dataset['tfidf_vectorizer']['data']['val']
x_test_data_tf = dataset['tfidf_vectorizer']['data']['test']

In [32]:
rf_tf = RandomForestClassifier(class_weight = 'balanced', random_state=42)
rf_tf.fit(x_train_data_tf,y_train)

In [33]:
validation_report(rf_tf,x_val_data_tf,y_val)

validation classification report : 
              precision    recall  f1-score   support

           3       0.91      0.96      0.93       409
           1       0.98      0.93      0.95       193
           2       0.96      0.88      0.92       254
           0       0.94      0.96      0.95       279

    accuracy                           0.94      1135
   macro avg       0.95      0.93      0.94      1135
weighted avg       0.94      0.94      0.94      1135



### Hypertune random forest

In [59]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 10, 300)
    max_depth = trial.suggest_int("max_depth", 2, 20)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 20)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 20)

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        class_weight='balanced'
    )

    rf.fit(x_train_data_tf, y_train)

    y_pred = rf.predict(x_val_data_tf)
    score = accuracy_score(y_pred, y_val)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

[I 2024-11-11 09:38:06,803] A new study created in memory with name: no-name-078dfae5-0216-4dfe-abef-4c3eaed86b8a
[I 2024-11-11 09:39:19,285] Trial 0 finished with value: 0.9004405286343612 and parameters: {'n_estimators': 253, 'max_depth': 16, 'min_samples_split': 11, 'min_samples_leaf': 13}. Best is trial 0 with value: 0.9004405286343612.
[I 2024-11-11 09:39:44,632] Trial 1 finished with value: 0.8634361233480177 and parameters: {'n_estimators': 270, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.9004405286343612.
[I 2024-11-11 09:39:49,554] Trial 2 finished with value: 0.8290748898678414 and parameters: {'n_estimators': 19, 'max_depth': 11, 'min_samples_split': 15, 'min_samples_leaf': 19}. Best is trial 0 with value: 0.9004405286343612.
[I 2024-11-11 09:39:58,885] Trial 3 finished with value: 0.8458149779735683 and parameters: {'n_estimators': 167, 'max_depth': 2, 'min_samples_split': 9, 'min_samples_leaf': 1}. Best is trial 0 with val

In [60]:
print('Best hyperparameters: ', study.best_params)
print('Best score: ', study.best_value)

Best hyperparameters:  {'n_estimators': 253, 'max_depth': 16, 'min_samples_split': 11, 'min_samples_leaf': 13}
Best score:  0.9004405286343612


In [61]:
rf_tf_hyper = RandomForestClassifier(class_weight = 'balanced', random_state=42, **study.best_params)
rf_tf_hyper.fit(x_train_data_tf,y_train)

### Perbandingan hasil

model sebelum tuning

In [62]:
test_report(rf_tf,x_test_data_tf,y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.88      0.96      0.92       475
           1       0.97      0.94      0.96       220
           0       0.94      0.94      0.94       301
           2       0.98      0.85      0.91       265

    accuracy                           0.93      1261
   macro avg       0.94      0.92      0.93      1261
weighted avg       0.93      0.93      0.93      1261



model setelah tuning

In [63]:
test_report(rf_tf_hyper,x_test_data_tf,y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.88      0.88      0.88       475
           1       0.84      0.96      0.90       220
           0       0.91      0.91      0.91       301
           2       0.94      0.83      0.89       265

    accuracy                           0.89      1261
   macro avg       0.89      0.90      0.89      1261
weighted avg       0.89      0.89      0.89      1261



### SVM

In [64]:
svm_tf = LinearSVC(class_weight = 'balanced', random_state = 42)
svm_tf.fit(x_train_data_tf,y_train)

In [65]:
validation_report(svm_tf,x_val_data_tf,y_val)

validation classification report : 
              precision    recall  f1-score   support

           3       0.97      0.97      0.97       409
           1       0.97      0.99      0.98       193
           2       0.96      0.94      0.95       254
           0       0.96      0.96      0.96       279

    accuracy                           0.97      1135
   macro avg       0.97      0.97      0.97      1135
weighted avg       0.97      0.97      0.97      1135



### Hypertune SVM

In [67]:
def objective(trial):
    C = trial.suggest_float('C', 0.1, 10.0, log=True)
    tol = trial.suggest_float('tol', 1e-5, 1e-1)
    max_iter = trial.suggest_int('max_iter', 100, 1000)

    svm = LinearSVC(C=C,tol=tol,max_iter=max_iter,class_weight = 'balanced', random_state = 43)

    svm.fit(x_train_data_tf, y_train)

    y_pred = svm.predict(x_val_data_tf)
    score = accuracy_score(y_pred, y_val)
    return score

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

[I 2024-11-11 09:45:10,939] A new study created in memory with name: no-name-30040f9c-bb48-42c9-981f-0a50707f8b26
[I 2024-11-11 09:45:13,354] Trial 0 finished with value: 0.958590308370044 and parameters: {'C': 0.15273580145342988, 'tol': 0.07168567611193458, 'max_iter': 241}. Best is trial 0 with value: 0.958590308370044.
[I 2024-11-11 09:45:15,905] Trial 1 finished with value: 0.9656387665198238 and parameters: {'C': 3.643865041494412, 'tol': 0.07410110546927456, 'max_iter': 869}. Best is trial 1 with value: 0.9656387665198238.
[I 2024-11-11 09:45:19,336] Trial 2 finished with value: 0.9577092511013215 and parameters: {'C': 0.23252914757056073, 'tol': 0.012123682635600876, 'max_iter': 745}. Best is trial 1 with value: 0.9656387665198238.
[I 2024-11-11 09:45:22,004] Trial 3 finished with value: 0.9647577092511013 and parameters: {'C': 3.361493731042266, 'tol': 0.019746493408895067, 'max_iter': 291}. Best is trial 1 with value: 0.9656387665198238.
[I 2024-11-11 09:45:24,318] Trial 4 fi

In [68]:
print('Best hyperparameters: ', study.best_params)
print('Best score: ', study.best_value)

Best hyperparameters:  {'C': 3.643865041494412, 'tol': 0.07410110546927456, 'max_iter': 869}
Best score:  0.9656387665198238


In [69]:
svm_tf_hyper = LinearSVC(class_weight = 'balanced', random_state = 43, **study.best_params)
svm_tf_hyper.fit(x_train_data_tf,y_train)

### Perbandingan hasil

model sebelum tuning

In [70]:
test_report(svm_tf,x_test_data_tf,y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.95      0.97      0.96       475
           1       0.97      0.98      0.98       220
           0       0.96      0.96      0.96       301
           2       0.98      0.94      0.96       265

    accuracy                           0.96      1261
   macro avg       0.97      0.96      0.97      1261
weighted avg       0.96      0.96      0.96      1261



model setelah tuning

In [71]:
test_report(svm_tf_hyper,x_test_data_tf,y_test)

test classification report : 
              precision    recall  f1-score   support

           3       0.95      0.97      0.96       475
           1       0.97      0.98      0.98       220
           0       0.96      0.95      0.96       301
           2       0.97      0.93      0.95       265

    accuracy                           0.96      1261
   macro avg       0.96      0.96      0.96      1261
weighted avg       0.96      0.96      0.96      1261



# C. Analisa

| Text   Representation | Algoritma   Machine learning | Machine   learning Hyperparameter                                                      | Accuracy | Precision | Recall | F1 Score |
|-----------------------|------------------------------|----------------------------------------------------------------------------------------|----------|-----------|--------|----------|
| CountVectorizer    | SVM                          | 'C': 0.11752231254748512, 'tol': 0.002557902414020967, 'max_iter': 748                | 0.95     | 0.95      | 0.95   | 0.95     |
| CountVectorizer    | Random Forest                | 'n_estimators': 160, 'max_depth': 15, 'min_samples_split': 10, 'min_samples_leaf': 5 | 0.90      | 0.90       | 0.90   | 0.90      |
| TF-IDF   | SVM                          | 'C': 3.643865041494412, 'tol': 0.07410110546927456, 'max_iter': 869                 | 0.96     | 0.96      | 0.96   | 0.96     |
| TF-IDF   | Random Forest                | 'n_estimators': 253, 'max_depth': 16, 'min_samples_split': 11, 'min_samples_leaf': 13 | 0.89      | 0.90       | 0.90    | 0.89      |

### Hasil analisa

- Model SVM dengan vektorisasi TF-IDF mendapatkan skor performa tertinggi, sedangkan model random forest dengan vektorisasi TF-IDF mendapatkan skor performa yang paling rendah.

- Model yang menggunakan vektorisasi TF-IDF menunjukan skor performa yang lebih tinggi dibandingkan dengan menggunakan vektorisasi CountVectorizer. Hal ini menunjukkan bahwa TF-IDF mampu mengekstrak dan merepresentasikan data teks lebih baik dibandingkan dengan CountVectorizer.

# E.
https://drive.google.com/drive/u/0/folders/1tG83LDXedOqkqBgje1p_usH0Rc3utq5P