In [2]:
!pip install spacy xgboost scikit-learn pandas joblib
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m87.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier
import joblib

In [4]:
# Read and show first five
df = pd.read_csv('preprocessed_reviews.csv')

df.dropna(inplace=True)

df.head()

Unnamed: 0,rating_review,review_full,clean_text,sentiment
0,5,"Totally in love with the Auro of the place, re...",totally love auro place beautiful fancy time a...,positive
1,5,I went this bar 8 days regularly with my husba...,go bar 8 day regularly husband fully satisfied...,positive
2,5,We were few friends and was a birthday celebra...,friend birthday celebration food good taste fr...,positive
3,5,Fatjar Cafe and Market is the perfect place fo...,fatjar cafe market perfect place casual lunch ...,positive
4,5,"Hey Guys, if you are craving for pizza and sea...",hey guy crave pizza search visit cafe yes high...,positive


In [5]:
print(df.sentiment.value_counts())

sentiment
positive    122623
neutral      15933
negative      9006
Name: count, dtype: int64


In [6]:
from sklearn.preprocessing import LabelEncoder

# Encode star ratings to integers if not already
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['sentiment'])

# Save for decoding later
joblib.dump(label_encoder, 'label_encoder.pkl')

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(df['label']),
                                     y=df['label'])
class_weight_dict = dict(enumerate(class_weights))

print("Class Weights:", class_weight_dict)


Class Weights: {0: np.float64(5.461618180472278), 1: np.float64(3.087135714136279), 2: np.float64(0.40112648796174727)}


In [7]:
# Initialize TF-IDF
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))

# Fit-transform on full dataset
X_all = tfidf.fit_transform(df['clean_text'])
y_all = df['label']

# Save vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [8]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

accuracy_scores = []
f1_scores = []

for fold, (train_index, test_index) in enumerate(kf.split(X_all)):
    print(f"Fold {fold + 1}")
    X_train, X_test = X_all[train_index], X_all[test_index]
    y_train, y_test = y_all.iloc[train_index], y_all.iloc[test_index]

    # XGBoost with class weights
    model = XGBClassifier(
        objective='multi:softprob',
        num_class=len(np.unique(y_all)),
        eval_metric='mlogloss',
        use_label_encoder=False,
        scale_pos_weight=None,
        seed=42
    )

    model.fit(
        X_train,
        y_train,
        sample_weight=np.array([class_weight_dict[label] for label in y_train])
    )

    y_pred = model.predict(X_test)

    print(classification_report(y_test, y_pred))

    # acc = accuracy_score(y_test, y_pred)
    # f1 = f1_score(y_test, y_pred, average='macro')

    # print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}")
    accuracy_scores.append(accuracy_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred, average='macro'))

print("\nAverage Accuracy:", np.mean(accuracy_scores))
print("Average F1 Score:", np.mean(f1_scores))


Fold 1


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.50      0.72      0.59      1716
           1       0.34      0.57      0.42      3169
           2       0.96      0.84      0.90     24628

    accuracy                           0.81     29513
   macro avg       0.60      0.71      0.64     29513
weighted avg       0.86      0.81      0.83     29513

Fold 2


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.51      0.72      0.59      1806
           1       0.33      0.57      0.42      3162
           2       0.96      0.84      0.89     24545

    accuracy                           0.80     29513
   macro avg       0.60      0.71      0.64     29513
weighted avg       0.86      0.80      0.82     29513

Fold 3


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.53      0.72      0.61      1884
           1       0.33      0.57      0.42      3211
           2       0.96      0.84      0.89     24417

    accuracy                           0.80     29512
   macro avg       0.61      0.71      0.64     29512
weighted avg       0.86      0.80      0.82     29512

Fold 4


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.51      0.73      0.60      1785
           1       0.33      0.59      0.42      3150
           2       0.96      0.83      0.89     24577

    accuracy                           0.80     29512
   macro avg       0.60      0.72      0.64     29512
weighted avg       0.87      0.80      0.82     29512

Fold 5


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       0.52      0.70      0.60      1815
           1       0.34      0.58      0.42      3241
           2       0.95      0.84      0.89     24456

    accuracy                           0.80     29512
   macro avg       0.60      0.71      0.64     29512
weighted avg       0.86      0.80      0.82     29512


Average Accuracy: 0.8020560560045411
Average F1 Score: 0.6377777882094409


In [9]:
# Train on full data
final_model = XGBClassifier(
    objective='multi:softprob',
    num_class=len(np.unique(y_all)),
    eval_metric='mlogloss',
    use_label_encoder=False,
    seed=42
)
final_model.fit(X_all, y_all, sample_weight=[class_weight_dict[label] for label in y_all])

# Save the model
joblib.dump(final_model, 'xgboost_model.pkl')


Parameters: { "use_label_encoder" } are not used.



['xgboost_model.pkl']