In [11]:
from tensorflow.keras.models import load_model
import pickle

# Restore the trained Keras classifier from its HDF5 checkpoint.
# NOTE(review): the variable is called `balanced_model` but the checkpoint file
# name says "imbalanced" — confirm which training run is meant to be evaluated.
balanced_model = load_model('deep_imbalanced_model3.h5')

# Restore the pickled tokenizer (used further down via texts_to_sequences).
# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open('deep_tokenizer_imbalanced.pkl', 'rb') as tokenizer_file:
    tokenizer_bal = pickle.load(tokenizer_file)

# Restore the pickled sequence length (passed as `maxlen` when padding).
with open('maxlen_imbalanced1.pkl', 'rb') as maxlen_file:
    max_len_bal = pickle.load(maxlen_file)



In [12]:
import pandas as pd

import pandas as pd

# Read the held-out review texts and their ratings from disk.
X_test = pd.read_csv("X_test_balanced.csv")
y_test = pd.read_csv("y_test_balanced.csv")

# Shift every rating down by one so class ids are zero-based
# (per the original note: 1–5 → 0–4), matching the argmax indices
# produced at prediction time.
y_test['Rating'] = y_test['Rating'] - 1

# Pull out the single text column and the single label column.
X_balanced_test, y_balanced_test = X_test['review_text'], y_test['Rating']

In [13]:
import spacy
# Load the small English spaCy pipeline with its "parser" and "ner" components
# disabled; the preprocessing function below only reads lemma_, is_stop and
# is_alpha from each token.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def spacy_preprocess_pipe(texts, batch_size=1000, pipeline=None):
    """Lemmatize documents and drop stop words and non-alphabetic tokens.

    Parameters
    ----------
    texts : iterable
        Raw documents. ``None`` and float NaN entries (what pandas produces for
        empty CSV cells) are replaced by an empty string before being handed to
        the pipeline, so one missing review no longer aborts the whole run.
        All other values are passed through unchanged.
    batch_size : int, default 1000
        Forwarded to ``pipeline.pipe``.
    pipeline : object, optional
        Anything exposing ``pipe(texts, batch_size=...)`` that yields iterables
        of tokens with ``lemma_``, ``is_stop`` and ``is_alpha`` attributes.
        Defaults to the module-level ``nlp`` object.

    Returns
    -------
    list of str
        One space-joined string of kept lemmas per input document, in input
        order. Documents with no kept tokens yield ``""``.
    """
    if pipeline is None:
        # Resolved at call time so the notebook-level `nlp` stays the default.
        pipeline = nlp

    def _fill_missing(values):
        # NaN is the only float that is not equal to itself.
        for value in values:
            is_nan = isinstance(value, float) and value != value
            yield "" if value is None or is_nan else value

    return [
        " ".join(
            token.lemma_
            for token in doc
            if not token.is_stop and token.is_alpha
        )
        for doc in pipeline.pipe(_fill_missing(texts), batch_size=batch_size)
    ]

# Run the spaCy cleaning over every test review, then wrap the resulting list
# in a Series.
# NOTE(review): the input is `X_balanced_test` while the result is named
# `X_imbalanced_test_cleaned` — confirm the intended naming.
cleaned_reviews = spacy_preprocess_pipe(X_balanced_test)
X_imbalanced_test_cleaned = pd.Series(cleaned_reviews)


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Padding settings: pad and truncate at the end of each sequence, to the
# length loaded from disk.
# NOTE(review): these presumably need to match the settings used at training
# time — confirm against the training notebook.
pad_options = dict(maxlen=max_len_bal, padding='post', truncating='post')

# Map each cleaned review to a list of token ids, then pad to a fixed width.
X_test_seq = tokenizer_bal.texts_to_sequences(X_imbalanced_test_cleaned)
X_test_pad = pad_sequences(X_test_seq, **pad_options)


In [15]:
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score every padded test sequence; the predicted class for a row is the
# index of its largest output along axis 1.
y_pred_probs = balanced_model.predict(X_test_pad)
y_pred = np.argmax(y_pred_probs, axis=1)

# Overall accuracy.
test_accuracy = accuracy_score(y_balanced_test, y_pred)
print("Accuracy:", test_accuracy)

# Per-class precision / recall / F1, rounded to two digits.
report_text = classification_report(y_balanced_test, y_pred, digits=2)
print(report_text)

# Confusion matrix of true labels against predicted labels.
conf_matrix = confusion_matrix(y_balanced_test, y_pred)
print("Confusion Matrix:\n", conf_matrix)


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 127ms/step
Accuracy: 0.54615
              precision    recall  f1-score   support

           0       0.71      0.61      0.65      4000
           1       0.54      0.36      0.43      4000
           2       0.50      0.49      0.50      4000
           3       0.43      0.68      0.53      4000
           4       0.65      0.60      0.62      4000

    accuracy                           0.55     20000
   macro avg       0.57      0.55      0.55     20000
weighted avg       0.57      0.55      0.55     20000

Confusion Matrix:
 [[2424  573  373  455  175]
 [ 571 1432 1238  653  106]
 [ 194  564 1967 1119  156]
 [ 109   65  279 2704  843]
 [ 114   40   87 1363 2396]]
