In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the pre-split train / validation / test CSVs, in that order.
data_train, data_val, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/dataset/data_train.csv',
        '/kaggle/input/dataset/data_val.csv',
        '/kaggle/input/dataset/data_test.csv',
    )
)

In [3]:
# Preview the training frame: one integer-coded column per aspect followed by
# the pre-processed review text in `processed_text`.
data_train

Unnamed: 0,appearance,delivery,fabric_quality,others,packaging,price,service,size_fit,processed_text
0,0,0,0,0,0,0,0,1,áo nhỏ hơn so với mô_tả
1,3,0,0,0,0,0,0,0,áo đẹp như mẫu luôn nha mọi người
2,0,3,0,2,0,0,0,0,rất hài_lòng sản_phẩm tốt shop giao đúng mẫu n...
3,0,0,0,0,0,0,0,3,nâng ngực ôm gọn bầu ngực mặc ok lắm
4,3,0,3,0,0,0,0,0,quần đẹp vải mát cho kiểm hàng kĩ rất hài_lòng...
...,...,...,...,...,...,...,...,...,...
3626,3,0,3,0,0,0,0,0,co_dãn tuyệt_đối vải mỏng ôm sát không bị bí n...
3627,0,1,0,0,0,0,0,0,đặt màu trắng ma giao màu đen chán
3628,3,0,3,0,0,0,0,0,chất đẹp form đẹp mặc thoải_mái vote cho shop
3629,0,0,1,0,0,0,0,1,chất_lượng vải rất tệ size lớn bé lẫn_lộn shop...


In [4]:
# Drop training rows that contain any missing value. The result is reassigned
# (no in-place mutation) and the index is reset so row labels stay contiguous
# (0..n-1); masks and labels derived from this frame are consumed positionally
# alongside the TF-IDF matrix, so gaps in the index would be a latent
# misalignment hazard.
data_train = data_train.dropna(axis=0).reset_index(drop=True)

## Embedding by TF-IDF

In [5]:
# Fit the TF-IDF vocabulary on the training text only, then project all three
# splits through the same fitted vectoriser so validation/test text never
# influences the learned vocabulary or IDF weights.
emb = TfidfVectorizer(
    min_df=5,
    max_df=0.8,
    max_features=3000,
    sublinear_tf=True,
).fit(data_train['processed_text'])

embedding_train, embedding_val, embedding_test = (
    emb.transform(split['processed_text'])
    for split in (data_train, data_val, data_test)
)

In [6]:
def make_aspect_outputs(df):
    """Convert per-aspect sentiment codes into binary presence labels.

    Every column except the last one is treated as an aspect column; the last
    column is skipped (in this notebook it holds the review text). A value
    equal to 0 means the aspect is absent and maps to 0; any other value maps
    to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose leading columns hold sentiment codes and whose final
        column is not a label.

    Returns
    -------
    pandas.DataFrame
        One int64 0/1 column per aspect, keeping the original column names,
        with one row per input row and a fresh 0..n-1 index.
    """
    aspect_values = df.iloc[:, :-1]
    # Vectorised form of the per-cell rule "0 if value == 0 else 1"; this
    # replaces a Python-level double loop of scalar `.iloc` lookups.
    # The index is reset because the result is used positionally next to the
    # embedding matrices, independent of the input frame's row labels.
    return aspect_values.ne(0).astype('int64').reset_index(drop=True)

In [7]:
# Binary aspect-presence labels for each split (train, validation, test).
label_train, label_val, label_test = map(
    make_aspect_outputs, (data_train, data_val, data_test)
)

label_train.head()

Unnamed: 0,appearance,delivery,fabric_quality,others,packaging,price,service,size_fit
0,0,0,0,0,0,0,0,1
1,1,0,0,0,0,0,0,0
2,0,1,0,1,0,0,0,0
3,0,0,0,0,0,0,0,1
4,1,0,1,0,0,0,0,0


## Aspect detection with Random Forest and hyperparameter tuning

In [8]:
# Every column except the trailing text column is an aspect label.
aspect_columns = data_train.columns[:-1]

# Search space shared by all per-aspect Random Forest detectors.
aspect_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

best_rf_clf = {}
for aspect in aspect_columns:
    # One binary present/absent detector per aspect, tuned with 3-fold
    # cross-validation on macro-F1 over the training split.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=aspect_param_grid,
        cv=3,
        scoring='f1_macro',
        n_jobs=-1,
    )
    grid_search.fit(embedding_train, label_train[aspect])

    # Keep the winning model for the test-set evaluation and for pickling.
    best_rf_clf[aspect] = grid_search.best_estimator_

    # Report how the selected model does on the validation split.
    y_pred = grid_search.best_estimator_.predict(embedding_val)
    print(f"Aspect: {aspect}")
    print("Best Parameters:", grid_search.best_params_)
    print(classification_report(label_val[aspect], y_pred))

Aspect: appearance
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.91      0.92      0.91       300
           1       0.84      0.81      0.83       154

    accuracy                           0.89       454
   macro avg       0.87      0.87      0.87       454
weighted avg       0.88      0.89      0.88       454

Aspect: delivery
Best Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       391
           1       0.95      0.87      0.91        63

    accuracy                           0.98       454
   macro avg       0.96      0.93      0.95       454
weighted avg       0.98      0.98      0.98       454

Aspect: fabric_quality
Best Parameters: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split'

In [9]:
# Persist the dict of per-aspect Random Forest detectors (aspect name -> model).
with open('/kaggle/working/aspect_detection_model.pkl', 'wb') as model_file:
    pickle.dump(best_rf_clf, model_file)

## Sentiment analysis with SVM and hyperparameter tuning

In [10]:
# Train one SVM sentiment classifier per aspect, skipping any aspect whose
# training rows carry fewer than two distinct sentiment labels.
# NOTE(review): `gamma` is not used by kernel='linear', so every linear
# candidate is fitted once per gamma value - consider splitting the grid.
sentiment_param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1],
    'kernel': ['linear', 'rbf'],
}
best_svm_clf = {}
for aspect in aspect_columns:
    # Sentiment is only modelled on rows where the aspect is present (label > 0).
    train_mask = data_train[aspect] > 0
    val_mask = data_val[aspect] > 0

    train_data_aspect = embedding_train[train_mask]
    train_label_aspect = data_train.loc[train_mask, aspect]

    val_data_aspect = embedding_val[val_mask]
    val_label_aspect = data_val.loc[val_mask, aspect]

    # A classifier needs at least two classes to learn from.
    if len(train_label_aspect.unique()) <= 1:
        print(f"Skipping aspect '{aspect}' due to insufficient class diversity in training data.\n")
        continue

    # 5-fold cross-validated search, scored with weighted F1.
    grid_search = GridSearchCV(
        SVC(random_state=42),
        sentiment_param_grid,
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
    )
    grid_search.fit(train_data_aspect, train_label_aspect)

    best_params = grid_search.best_params_
    print(f"Best parameters for Aspect '{aspect}': {best_params}")

    # Keep the selected model and report its validation performance.
    best_svm_clf[aspect] = grid_search.best_estimator_
    y_pred = grid_search.best_estimator_.predict(val_data_aspect)
    print(f"Sentiment for Aspect: {aspect}")
    print(classification_report(val_label_aspect, y_pred))

Best parameters for Aspect 'appearance': {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
Sentiment for Aspect: appearance
              precision    recall  f1-score   support

           1       0.76      0.72      0.74        18
           2       0.00      0.00      0.00         7
           3       0.93      0.97      0.95       129

    accuracy                           0.90       154
   macro avg       0.57      0.56      0.56       154
weighted avg       0.87      0.90      0.88       154

Best parameters for Aspect 'delivery': {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Sentiment for Aspect: delivery
              precision    recall  f1-score   support

           1       0.67      0.50      0.57         4
           2       0.00      0.00      0.00         1
           3       0.95      0.98      0.97        58

    accuracy                           0.94        63
   macro avg       0.54      0.49      0.51        63
weighted avg       0.92      0.94      0.93        63

Bes

In [11]:
# Persist the dict of per-aspect SVM sentiment classifiers (aspect name -> model).
with open('/kaggle/working/sentiment_analysis_model.pkl', 'wb') as model_file:
    pickle.dump(best_svm_clf, model_file)

## Evaluation

In [12]:
# End-to-end evaluation on the held-out test set: aspect detection first,
# then sentiment classification on the rows where an aspect was detected.
aspect_predictions = {}
sentiment_predictions = {}

aspect_f1_score = {}
sentiment_f1_score = {}

# Stage 1: binary aspect detection, scored with macro-F1 per aspect.
for aspect in aspect_columns:
    y_test_aspect = label_test[aspect]
    y_pred_aspect = best_rf_clf[aspect].predict(embedding_test)
    aspect_predictions[aspect] = y_pred_aspect  # Reused as the row filter in stage 2

    aspect_f1_score[aspect] = f1_score(y_test_aspect, y_pred_aspect, average='macro')

# Stage 2: sentiment, scored with weighted F1 per aspect.
# Iterate over the SVMs that were actually trained instead of a hard-coded
# aspect list, so an aspect skipped during training (no entry in
# `best_svm_clf`) can neither raise a KeyError nor be silently mis-counted.
# NOTE(review): rows are selected by the *predicted* aspect mask, so the gold
# labels can include 0 (aspect absent), a class the SVMs never saw during
# training; the validation reports filtered on gold labels instead. Confirm
# that this end-to-end scoring is what the summary is meant to show.
for aspect, sentiment_clf in best_svm_clf.items():
    sentiment_predictions[aspect] = sentiment_clf.predict(embedding_test)
    relevant_indices = aspect_predictions[aspect] > 0
    test_label_aspect = data_test[relevant_indices][aspect]
    y_pred_sentiment_filtered = sentiment_predictions[aspect][relevant_indices]

    if len(test_label_aspect) > 0:
        sentiment_f1_score[aspect] = f1_score(test_label_aspect, y_pred_sentiment_filtered, average='weighted')
    else:
        # No row was predicted to mention this aspect: nothing to score.
        sentiment_f1_score[aspect] = 0

# Unweighted means over the aspects that were scored. They are computed before
# the 'overall' key is inserted so the divisor counts only per-aspect entries.
aspect_overall = (
    sum(aspect_f1_score.values()) / len(aspect_f1_score)
    if aspect_f1_score else float('nan')
)
sentiment_overall = (
    sum(sentiment_f1_score.values()) / len(sentiment_f1_score)
    if sentiment_f1_score else float('nan')
)
aspect_f1_score['overall'] = aspect_overall
sentiment_f1_score['overall'] = sentiment_overall

# Aspects without a sentiment model show up as NaN in the sentiment column.
summary_report = pd.DataFrame({
    'Aspect Detection (F1 Score)': aspect_f1_score,
    'Sentiment Analysis (F1 Score)': sentiment_f1_score
})
summary_report.round(2)

Unnamed: 0,Aspect Detection (F1 Score),Sentiment Analysis (F1 Score)
appearance,0.9,0.77
delivery,0.96,0.95
fabric_quality,0.88,0.66
others,0.84,
packaging,0.97,0.92
price,0.85,0.76
service,0.71,0.76
size_fit,0.8,0.55
overall,0.86,0.76
