In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Read the train / dev / test CSV files (paths are relative to the working directory).
df_train, df_dev, df_test = map(
    pd.read_csv,
    ('df_train_clean.csv', 'df_dev_clean.csv', 'df_test_clean.csv'),
)

In [None]:
# Preview the first five training rows to check the loaded columns.
df_train.head(n=5)

Unnamed: 0,content,label,topic,content_clean
0,slide giáo trình đầy đủ .,2,1,slide giáo trình đầy đủ .
1,"nhiệt tình giảng dạy , gần gũi với sinh viên .",2,0,"nhiệt tình giảng dạy , gần gũi với sinh viên ."
2,đi học đầy đủ full điểm chuyên cần .,0,1,đi học đầy đủ full điểm chuyên cần .
3,chưa áp dụng công nghệ thông tin và các thiết ...,0,0,chưa áp dụng công nghệ thông tin và các thiết ...
4,"thầy giảng bài hay , có nhiều bài tập ví dụ ng...",2,0,"thầy giảng bài hay , có nhiều bài tập ví dụ ng..."


In [None]:
# Column names used for the model input text and the target label.
col_text, col_label = 'content_clean', 'label'

In [None]:
# Pull the text column (inputs) and label column (targets) out of each split.
X_train_text, y_train = df_train[col_text], df_train[col_label]
X_dev_text, y_dev = df_dev[col_text], df_dev[col_label]
X_test_text, y_test = df_test[col_text], df_test[col_label]

In [None]:
# --- Text vectorizers to compare: TF-IDF weighting vs. raw bag-of-words counts ---
vectorizer_options = {}
vectorizer_options['tfidf'] = TfidfVectorizer()
vectorizer_options['bow'] = CountVectorizer()

# --- Vectorizer search space: vocabulary size cap and n-gram span ---
vectorizer_param_grid = dict(
    max_features=[3000, 5000, 7000],
    # (1, 3): unigrams through trigrams; (1, 7): unigrams through 7-grams
    ngram_range=[(1, 3), (1, 7)],
)

In [None]:
# --- Hyper-parameter search spaces for the SVM and Logistic Regression models ---
# Keys of each inner dict are estimator parameter names; values are the
# candidate settings tried by the grid search.
model_param_grid = {
    'svm': dict(
        C=[0.1, 1, 10],
        penalty=['l2'],
        loss=['hinge', 'squared_hinge'],
        max_iter=[1000, 2000],
    ),
    'logistic': dict(
        C=[0.1, 1, 10],
        penalty=['l2'],
        solver=['lbfgs', 'saga'],
        max_iter=[200, 500],
    ),
}

In [None]:
def finetune_and_report(vec_name, vectorizer, model_name, model, model_params):
    """Grid-search a vectorizer + classifier pipeline and report dev-set metrics.

    The module-level ``vectorizer_param_grid`` and the supplied
    ``model_params`` are merged into one search space over a two-step
    pipeline (``'vectorizer'`` -> ``'model'``). The search is fitted on the
    module-level ``X_train_text`` / ``y_train`` using 3-fold cross-validation
    scored by macro F1, and the fitted search is then used to predict on
    ``X_dev_text`` / ``y_dev``. Progress, the best parameters, the best
    cross-validation score and a dev-set classification report are printed.

    Parameters
    ----------
    vec_name : str
        Label for the vectorizer; used only in the printed header.
    vectorizer : estimator
        Text vectorizer placed in the ``'vectorizer'`` pipeline step.
    model_name : str
        Label for the classifier; used only in the printed header.
    model : estimator
        Classifier placed in the ``'model'`` pipeline step.
    model_params : dict
        Maps classifier parameter names to lists of candidate values.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    print(f"\n========== {model_name.upper()} với {vec_name.upper()} ==========")

    pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('model', model),
    ])

    # Pipeline parameters are addressed as '<step name>__<parameter name>',
    # so prefix each grid key with the step it belongs to.
    param_grid = {f'vectorizer__{k}': v for k, v in vectorizer_param_grid.items()}
    param_grid.update({f'model__{k}': v for k, v in model_params.items()})

    grid = GridSearchCV(pipe, param_grid, scoring='f1_macro', cv=3, n_jobs=-1, verbose=1)
    grid.fit(X_train_text, y_train)

    print(f"Best parameters: {grid.best_params_}")
    print(f"Best crossval F1_macro: {grid.best_score_:.4f}")

    # Evaluate the tuned pipeline on the dev split.
    y_pred_dev = grid.predict(X_dev_text)
    f1_dev = f1_score(y_dev, y_pred_dev, average='macro')
    print("Dev F1 Macro:", f1_dev)
    print("Classification report trên dev:")
    print(classification_report(y_dev, y_pred_dev, digits=4))
    return grid

In [None]:
# Fixed seed: the classifiers below expose `random_state` because some of
# their solvers shuffle the data, so seeding keeps reruns comparable.
RANDOM_STATE = 42

# Tune every (classifier, vectorizer) pair; results are keyed as
# '<model>_<vectorizer>' (e.g. 'svm_tfidf') and hold the fitted search objects.
best_models = {}
for vec_name, vectorizer in vectorizer_options.items():
    # Fresh estimator instances for each vectorizer; the dict order (SVM
    # first, then Logistic Regression) fixes the order of the runs.
    estimators = {
        'svm': LinearSVC(random_state=RANDOM_STATE),
        'logistic': LogisticRegression(random_state=RANDOM_STATE),
    }
    for model_name, model in estimators.items():
        best_models[f'{model_name}_{vec_name}'] = finetune_and_report(
            vec_name, vectorizer, model_name, model, model_param_grid[model_name]
        )


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters: {'model__C': 1, 'model__loss': 'squared_hinge', 'model__max_iter': 1000, 'model__penalty': 'l2', 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 3)}
Best crossval F1_macro: 0.7224
Dev F1 Macro: 0.7012589413447783
Classification report trên dev:
              precision    recall  f1-score   support

           0     0.8986    0.9305    0.9143       705
           1     0.4815    0.1781    0.2600        73
           2     0.9177    0.9416    0.9295       805

    accuracy                         0.9015      1583
   macro avg     0.7659    0.6834    0.7013      1583
weighted avg     0.8891    0.9015    0.8918      1583


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters: {'model__C': 10, 'model__max_iter': 500, 'model__penalty': 'l2', 'model__solver': 'saga', 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 7)}
Best crossval F1_macro: 0.7275
Dev F1 



Best parameters: {'model__C': 1, 'model__loss': 'hinge', 'model__max_iter': 1000, 'model__penalty': 'l2', 'vectorizer__max_features': 7000, 'vectorizer__ngram_range': (1, 3)}
Best crossval F1_macro: 0.7349
Dev F1 Macro: 0.7619387777190232
Classification report trên dev:
              precision    recall  f1-score   support

           0     0.9030    0.9106    0.9068       705
           1     0.5472    0.3973    0.4603        73
           2     0.9109    0.9267    0.9187       805

    accuracy                         0.8951      1583
   macro avg     0.7870    0.7449    0.7619      1583
weighted avg     0.8906    0.8951    0.8923      1583


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters: {'model__C': 1, 'model__max_iter': 200, 'model__penalty': 'l2', 'model__solver': 'lbfgs', 'vectorizer__max_features': 5000, 'vectorizer__ngram_range': (1, 7)}
Best crossval F1_macro: 0.7327
Dev F1 Macro: 0.7447042600874383
Classification report trên dev:
             

In [None]:
# Score each fitted search object from `best_models` on the test split.
for name, model in best_models.items():
    y_pred_test = model.predict(X_test_text)
    f1 = f1_score(y_true=y_test, y_pred=y_pred_test, average='macro')

    header = f"\n== {name.upper()} trên TEST =="
    print(header)
    print("F1 Macro:", f1)

    print("Classification report trên test:")
    test_report = classification_report(y_true=y_test, y_pred=y_pred_test, digits=4)
    print(test_report)


== SVM_TFIDF trên TEST ==
F1 Macro: 0.6887274458239324
Classification report trên test:
              precision    recall  f1-score   support

           0     0.8711    0.9397    0.9041      1409
           1     0.6047    0.1557    0.2476       167
           2     0.9108    0.9182    0.9145      1590

    accuracy                         0.8876      3166
   macro avg     0.7955    0.6712    0.6887      3166
weighted avg     0.8770    0.8876    0.8747      3166


== LOGISTIC_TFIDF trên TEST ==
F1 Macro: 0.7069679886442107
Classification report trên test:
              precision    recall  f1-score   support

           0     0.8703    0.9290    0.8987      1409
           1     0.5538    0.2156    0.3103       167
           2     0.9098    0.9138    0.9118      1590

    accuracy                         0.8838      3166
   macro avg     0.7780    0.6861    0.7070      3166
weighted avg     0.8735    0.8838    0.8743      3166


== SVM_BOW trên TEST ==
F1 Macro: 0.7268556678284024
C