In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Load the train / dev / test splits from their CSV files.
df_train, df_dev, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/df_train_clean.csv',
        '/content/df_dev_clean.csv',
        '/content/df_test_clean.csv',
    )
)

In [None]:
# Names of the text column and the label column; adjust them if yours differ.
col_text, col_label = 'content_clean', 'Emotion'

In [None]:
# For each split, pull out the text column (model input) and the label column (target).
X_train_text, y_train = df_train[col_text], df_train[col_label]
X_dev_text, y_dev = df_dev[col_text], df_dev[col_label]
X_test_text, y_test = df_test[col_text], df_test[col_label]

In [None]:
# --- Vectorizers to experiment with: TF-IDF and Bag of Words ---
# Each entry maps a short label to a freshly constructed vectorizer with default settings.
vectorizer_options = {
    label: vectorizer_cls()
    for label, vectorizer_cls in (
        ('tfidf', TfidfVectorizer),
        ('bow', CountVectorizer),
    )
}

# --- n-gram ranges and feature counts to try (feel free to extend) ---
vectorizer_param_grid = dict(
    max_features=[3000, 5000, 7000],
    # n-grams of length 1 to 3, and of length 1 to 7
    ngram_range=[(1, 3), (1, 7)],
)

In [None]:
# --- Hyper-parameter grids for tuning the SVM and Logistic Regression models ---
# Outer keys select the model; inner keys are the model's parameter names,
# each mapped to the list of candidate values.
model_param_grid = dict(
    svm=dict(
        C=[0.1, 1, 10],
        penalty=['l2'],
        loss=['hinge', 'squared_hinge'],
        max_iter=[1000, 2000],
    ),
    logistic=dict(
        C=[0.1, 1, 10],
        penalty=['l2'],
        solver=['lbfgs', 'saga'],
        max_iter=[200, 500],
    ),
)

In [None]:
def finetune_and_report(vec_name, vectorizer, model_name, model, model_params):
    """Grid-search a vectorizer + classifier pipeline and report its dev-set score.

    The search space is the notebook-level ``vectorizer_param_grid`` combined
    with ``model_params``. The search is fitted on the notebook-level
    ``X_train_text`` / ``y_train`` and then evaluated on ``X_dev_text`` /
    ``y_dev``; those variables must be defined before this function is called.

    Parameters
    ----------
    vec_name : str
        Label for the vectorizer; used only in the printed header.
    vectorizer : estimator
        Text vectorizer placed as the first pipeline step.
    model_name : str
        Label for the classifier; used only in the printed header.
    model : estimator
        Classifier placed as the final pipeline step.
    model_params : dict
        Maps classifier parameter names to the list of values to try.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    print(f"\n========== {model_name.upper()} với {vec_name.upper()} ==========")

    pipe = Pipeline([
        ('vectorizer', vectorizer),
        ('model', model)
    ])

    # Merge the vectorizer grid and the model grid. Each key is prefixed with
    # its pipeline step name so GridSearchCV routes it to the right step.
    param_grid = {**{f'vectorizer__{k}': v for k, v in vectorizer_param_grid.items()},
                  **{f'model__{k}': v for k, v in model_params.items()}}

    grid = GridSearchCV(pipe, param_grid, scoring='f1_macro', cv=3, n_jobs=-1, verbose=1)
    grid.fit(X_train_text, y_train)

    print(f"Best parameters: {grid.best_params_}")
    print(f"Best crossval F1_macro: {grid.best_score_:.4f}")

    # Evaluate on the dev set
    y_pred_dev = grid.predict(X_dev_text)
    f1_dev = f1_score(y_dev, y_pred_dev, average='macro')
    print("Dev F1 Macro:", f1_dev)
    print("Classification report trên dev:")
    print(classification_report(y_dev, y_pred_dev, digits=4))
    return grid

In [None]:
# Fixed seed for the estimators. The grids include LinearSVC with loss='hinge'
# (solved in the dual) and LogisticRegression with solver='saga'; both shuffle
# the data, so without a seed the results can differ between runs.
RANDOM_STATE = 42

# Tune every (vectorizer, classifier) combination and keep each fitted search,
# keyed as '<model>_<vectorizer>'.
best_models = {}
for vec_name, vectorizer in vectorizer_options.items():
    # SVM
    best_models[f'svm_{vec_name}'] = finetune_and_report(
        vec_name, vectorizer, 'svm',
        LinearSVC(random_state=RANDOM_STATE), model_param_grid['svm']
    )
    # Logistic Regression
    best_models[f'logistic_{vec_name}'] = finetune_and_report(
        vec_name, vectorizer, 'logistic',
        LogisticRegression(random_state=RANDOM_STATE), model_param_grid['logistic']
    )


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best parameters: {'model__C': 1, 'model__loss': 'hinge', 'model__max_iter': 1000, 'model__penalty': 'l2', 'vectorizer__max_features': 7000, 'vectorizer__ngram_range': (1, 3)}
Best crossval F1_macro: 0.4849
Dev F1 Macro: 0.4859396836427457
Classification report trên dev:
              precision    recall  f1-score   support

       Anger     0.4857    0.3469    0.4048        49
     Disgust     0.5473    0.6000    0.5724       135
   Enjoyment     0.6295    0.7383    0.6796       214
        Fear     0.5862    0.5484    0.5667        31
       Other     0.4024    0.2340    0.2960       141
     Sadness     0.4309    0.6163    0.5072        86
    Surprise     0.5000    0.3000    0.3750        30

    accuracy                         0.5364       686
   macro avg     0.5117    0.4834    0.4859       686
weighted avg     0.5239    0.5364    0.5200       686


Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best p

In [None]:
# Score every tuned model on the test split and print its macro F1 and per-class report.
for name in best_models:
    model = best_models[name]
    y_pred_test = model.predict(X_test_text)

    # Compute both metrics first, then print them together.
    f1 = f1_score(y_test, y_pred_test, average='macro')
    report_test = classification_report(y_test, y_pred_test, digits=4)

    print(f"\n== {name.upper()} trên TEST ==")
    print("F1 Macro:", f1)
    print("Classification report trên test:")
    print(report_test)


== SVM_TFIDF trên TEST ==
F1 Macro: 0.5236405564426421
Classification report trên test:
              precision    recall  f1-score   support

       Anger     0.3939    0.3250    0.3562        40
     Disgust     0.4734    0.6061    0.5316       132
   Enjoyment     0.5901    0.6788    0.6313       193
        Fear     0.7073    0.6304    0.6667        46
       Other     0.5417    0.4031    0.4622       129
     Sadness     0.6071    0.5862    0.5965       116
    Surprise     0.6000    0.3243    0.4211        37

    accuracy                         0.5556       693
   macro avg     0.5591    0.5077    0.5236       693
weighted avg     0.5587    0.5556    0.5503       693


== LOGISTIC_TFIDF trên TEST ==
F1 Macro: 0.538473293613438
Classification report trên test:
              precision    recall  f1-score   support

       Anger     0.4516    0.3500    0.3944        40
     Disgust     0.5062    0.6136    0.5548       132
   Enjoyment     0.5890    0.6684    0.6262       193
    