In [None]:
# Attach Google Drive to the Colab filesystem so files stored under MyDrive become readable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive/


## Read the `.csv` files

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Locations of the training and test CSV files on the mounted Drive.
train_csv_path = "/content/drive/MyDrive/ColabNotebooks/website_defacement/train_data.csv"
test_csv_path = "/content/drive/MyDrive/ColabNotebooks/website_defacement/test_data.csv"

# Read each file into its own DataFrame.
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

## Prepare data

Separate the features (X) and the target variable (y) for both the training and testing datasets.


In [None]:
# The 'HTML' column is the feature and 'Label' is the target, for both datasets.
X_train, y_train = train_df['HTML'], train_df['Label']
X_test, y_test = test_df['HTML'], test_df['Label']

# Report the size of every piece so a row-count mismatch is visible immediately.
shape_report = {
    "Shape of X_train:": X_train.shape,
    "Shape of y_train:": y_train.shape,
    "Shape of X_test:": X_test.shape,
    "Shape of y_test:": y_test.shape,
}
for caption, shape in shape_report.items():
    print(caption, shape)

Shape of X_train: (1230,)
Shape of y_train: (1230,)
Shape of X_test: (307,)
Shape of y_test: (307,)


## Install lazypredict

Install the LazyPredict library if it's not already installed.


In [None]:
%pip install lazypredict

Collecting lazypredict
  Downloading lazypredict-0.2.16-py2.py3-none-any.whl.metadata (13 kB)
Collecting pytest-runner (from lazypredict)
  Downloading pytest_runner-6.0.1-py3-none-any.whl.metadata (7.3 kB)
Collecting mlflow>=2.0.0 (from lazypredict)
  Downloading mlflow-2.22.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.1 (from mlflow>=2.0.0->lazypredict)
  Downloading mlflow_skinny-2.22.1-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow>=2.0.0->lazypredict)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow>=2.0.0->lazypredict)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow>=2.0.0->lazypredict)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow>=2.0.0->lazypredict)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-ski

## Initialize and run lazypredict

Initialize the LazyPredict classifier (`LazyClassifier`), fit it on the training data, and evaluate the candidate models on the test data.


In [None]:
from lazypredict.Supervised import LazyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd  # Import pandas to use DataFrame

# Single-column frames holding the raw HTML; a later cell indexes them by the 'HTML' column.
# They are built from train_df/test_df rather than from X_train/X_test so this cell keeps
# working even after X_train/X_test are rebound to vectorized matrices further down.
X_train_df = train_df[['HTML']]
X_test_df = test_df[['HTML']]

# Passing the raw HTML strings straight to LazyClassifier returned an empty leaderboard
# (presumably every candidate failed on the text column while ignore_warnings=True hid the
# errors), so convert the text to numeric TF-IDF features first. The vocabulary is learned
# from the training split only and reused for the test split.
lazy_tfidf = TfidfVectorizer(max_features=5000)
X_train_lazy = lazy_tfidf.fit_transform(X_train_df['HTML']).toarray()
X_test_lazy = lazy_tfidf.transform(X_test_df['HTML']).toarray()

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# `models` is the per-model score table; `predictions` is LazyClassifier's second return value.
models, predictions = clf.fit(X_train_lazy, X_test_lazy, y_train, y_test)
print(models)

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 645, number of negative: 585
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 255
[LightGBM] [Info] Number of data points in the train set: 1230, number of used features: 1
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524390 -> initscore=0.097638
[LightGBM] [Info] Start training from score 0.097638
Empty DataFrame
Columns: [Accuracy, Balanced Accuracy, ROC AUC, F1 Score, Time Taken]
Index: []


## Review results
Vectorize the HTML with TF-IDF, rerun LazyPredict on the resulting numeric features, and display the performance of the different models.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from lazypredict.Supervised import LazyClassifier
import numpy as np # Import numpy for array conversion

# Learn the TF-IDF vocabulary (capped at 5000 terms) from the training HTML only, then
# apply that same vocabulary to the test HTML.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_df['HTML'])
X_test_tfidf = tfidf.transform(X_test_df['HTML'])

# Convert the sparse matrices to dense arrays. toarray() yields a plain ndarray, whereas
# todense() returns the legacy np.matrix type, which NumPy no longer recommends.
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)

# Fit every candidate model on the dense training features and score it on the test features.
models, predictions = clf.fit(X_train_dense, X_test_dense, y_train, y_test)

print(models)

  0%|          | 0/32 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 645, number of negative: 585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.026830 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 53414
[LightGBM] [Info] Number of data points in the train set: 1230, number of used features: 2305
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524390 -> initscore=0.097638
[LightGBM] [Info] Start training from score 0.097638
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
CalibratedClassifierCV             1.00               1.00     1.00      1.00   
LinearSVC                          1.00               1.00     1.00      1.00   
LogisticRegression                 0.99               0.99     0.99      0.99   
PassiveAggressiveClassifier        0.99  

## Prepare data for training

In [None]:
# Tách input và output
X_train_raw = train_df['HTML']
y_train = train_df['Label']

X_test_raw = test_df['HTML']
y_test = test_df['Label']

# Vector hóa TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(X_train_raw)
X_test = vectorizer.transform(X_test_raw)

### Train `RandomForestClassifier`

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Random forest of 100 trees with a fixed seed, fitted on the training features.
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Predict the test set and print the per-class precision / recall / F1 report.
y_pred_rf = rf.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest:\n", rf_report)


Random Forest:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       147
           1       0.97      0.99      0.98       160

    accuracy                           0.98       307
   macro avg       0.98      0.98      0.98       307
weighted avg       0.98      0.98      0.98       307



### Train `ExtraTreesClassifier`

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees ensemble configured with 100 trees and a fixed seed.
et_params = {"n_estimators": 100, "random_state": 42}
et = ExtraTreesClassifier(**et_params)
et.fit(X_train, y_train)

# Predict the test set and print the per-class report.
y_pred_et = et.predict(X_test)
et_report = classification_report(y_test, y_pred_et)
print("Extra Trees:\n", et_report)


Extra Trees:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       147
           1       0.98      0.99      0.99       160

    accuracy                           0.99       307
   macro avg       0.99      0.99      0.99       307
weighted avg       0.99      0.99      0.99       307



### Train `lightgbm`

In [None]:
import lightgbm as lgb

# LightGBM classifier with 100 boosting rounds and a fixed seed.
lgbm = lgb.LGBMClassifier(random_state=42, n_estimators=100)
lgbm.fit(X_train, y_train)

# Predict the test set and print the per-class report.
y_pred_lgbm = lgbm.predict(X_test)
lgbm_report = classification_report(y_test, y_pred_lgbm)
print("LightGBM:\n", lgbm_report)


[LightGBM] [Info] Number of positive: 645, number of negative: 585
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 50924
[LightGBM] [Info] Number of data points in the train set: 1230, number of used features: 2305
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.524390 -> initscore=0.097638
[LightGBM] [Info] Start training from score 0.097638
LightGBM:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       147
           1       0.97      0.99      0.98       160

    accuracy                           0.98       307
   macro avg       0.98      0.98      0.98       307
weighted avg       0.98      0.98      0.98       307



### Train `xgboost`

In [None]:
import xgboost

# Keep the module under its own name and bind only the estimator to `xgb`, the name the
# comparison and saving cells use. Aliasing the module as `xgb` and then assigning the
# classifier to the same name left `xgb` meaning two different things within one cell.
# NOTE(review): `use_label_encoder` is reported as deprecated in recent XGBoost releases;
# confirm against the installed version before removing it.
xgb = xgboost.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Predict the test set and print the per-class report.
y_pred_xgb = xgb.predict(X_test)
print("XGBoost:\n", classification_report(y_test, y_pred_xgb))


XGBoost:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       147
           1       0.96      0.98      0.97       160

    accuracy                           0.97       307
   macro avg       0.97      0.97      0.97       307
weighted avg       0.97      0.97      0.97       307



## Compare model performance

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Fitted estimators keyed by the display name used in the printed section headers.
models = {
    "Random Forest": rf,
    "Extra Trees": et,
    "LightGBM": lgbm,
    "XGBoost": xgb
}

# Caption / scoring-function pairs, listed in the order they are printed for each model.
metric_table = (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
)

for name, model in models.items():
    # Predict once per model and reuse the predictions for every metric.
    y_pred = model.predict(X_test)
    print(f"\n=== {name} ===")
    for caption, metric in metric_table:
        print(caption, metric(y_test, y_pred))



=== Random Forest ===
Accuracy: 0.9804560260586319
Precision: 0.9695121951219512
Recall: 0.99375
F1 Score: 0.9814814814814815

=== Extra Trees ===
Accuracy: 0.9869706840390879
Precision: 0.9814814814814815
Recall: 0.99375
F1 Score: 0.9875776397515528

=== LightGBM ===
Accuracy: 0.9771986970684039
Precision: 0.9693251533742331
Recall: 0.9875
F1 Score: 0.978328173374613

=== XGBoost ===
Accuracy: 0.9706840390879479
Precision: 0.9631901840490797
Recall: 0.98125
F1 Score: 0.9721362229102167


### Save the models

In [None]:
import joblib
import os

# Absolute path under the Drive mount, the same root the input CSVs are read from.
# The earlier relative "./drive/..." form reached Drive only while the working directory
# was /content; from any other directory makedirs would create a local folder instead and
# the artifacts would not be written to Drive.
output_dir = "/content/drive/MyDrive/ColabNotebooks/train_model_2"
os.makedirs(output_dir, exist_ok=True)

# File name -> fitted object. The vectorizer is saved next to the models because the
# models were trained on features produced by this exact fitted vocabulary.
artifacts = {
    "tfidf_vectorizer.joblib": vectorizer,
    "model_random_forest.joblib": rf,
    "model_extra_trees.joblib": et,
    "model_lightgbm.joblib": lgbm,
    "model_xgboost.joblib": xgb,
}

# joblib.dump returns the list of files it wrote; keep the first entry of each call.
saved_paths = [
    joblib.dump(fitted_obj, os.path.join(output_dir, file_name))[0]
    for file_name, fitted_obj in artifacts.items()
]
saved_paths

['./drive/MyDrive/ColabNotebooks/train_model_2/model_xgboost.joblib']

## Load the model for use
- continue testing on the local machine ...