In [2]:
import pandas as pd

df = pd.read_csv('Dataset\IMDB-Dataset.csv')

### Split Train/Val and Vectorize

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(df['sentiment'])
X = df['review_processed'].values.tolist()

x_train_tmp, x_val_tmp, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0
)

tfidt = TfidfVectorizer(max_features=20000)
tfidt.fit(x_train_tmp, y_train)

x_train = tfidt.transform(x_train_tmp)
x_val = tfidt.transform(x_val_tmp)

### Training

In [9]:
from sklearn.metrics import f1_score

#### Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier(
    criterion='entropy',
    random_state = 42
)

dt_clf.fit(x_train, y_train)

y_pred = dt_clf.predict(x_val)

In [18]:
print(f'F1 score: {f1_score(y_val, y_pred)}')

F1 score: 0.7203970223325062


#### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    random_state = 42
)

rf_clf.fit(x_train, y_train)

y_pred = rf_clf.predict(x_val)

In [20]:
print(f'F1 score: {f1_score(y_val, y_pred)}')

F1 score: 0.8496784565916399


#### XGBoost

In [21]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier(n_estimators = 500, # số lượng cây
                        learning_rate = 0.05, 
                        use_label_encoder = False,
                        eval_metric = "logloss",
                        early_stopping_rounds = 5, # ngăn overfitting
                        n_jobs = -1) # dùng toàn bộ cpu cores

xgb_clf.fit(x_train, y_train,                    
            eval_set = [(x_val,y_val)],
            verbose = 1)

Parameters: { "use_label_encoder" } are not used.



[0]	validation_0-logloss:0.68169
[1]	validation_0-logloss:0.67137
[2]	validation_0-logloss:0.66202
[3]	validation_0-logloss:0.65341
[4]	validation_0-logloss:0.64548
[5]	validation_0-logloss:0.63816
[6]	validation_0-logloss:0.63143
[7]	validation_0-logloss:0.62509
[8]	validation_0-logloss:0.61920
[9]	validation_0-logloss:0.61365
[10]	validation_0-logloss:0.60839
[11]	validation_0-logloss:0.60341
[12]	validation_0-logloss:0.59864
[13]	validation_0-logloss:0.59417
[14]	validation_0-logloss:0.58995
[15]	validation_0-logloss:0.58587
[16]	validation_0-logloss:0.58209
[17]	validation_0-logloss:0.57859
[18]	validation_0-logloss:0.57526
[19]	validation_0-logloss:0.57191
[20]	validation_0-logloss:0.56865
[21]	validation_0-logloss:0.56567
[22]	validation_0-logloss:0.56253
[23]	validation_0-logloss:0.55983
[24]	validation_0-logloss:0.55711
[25]	validation_0-logloss:0.55439
[26]	validation_0-logloss:0.55196
[27]	validation_0-logloss:0.54929
[28]	validation_0-logloss:0.54700
[29]	validation_0-loglos

In [23]:
print(f'F1 score: {f1_score(xgb_clf.predict(x_val), y_pred)}')

F1 score: 0.9176470588235294
