In [1]:
import numpy as np
import pandas as pd
import re
import os
import kagglehub
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

print("TensorFlow Version:", tf.__version__)

print("\nDownloading SnappFood dataset...")
try:
    path = kagglehub.dataset_download("soheiltehranipour/snappfood-persian-sentiment-analysis")
    csv_path = os.path.join(path, 'Snappfood - Sentiment Analysis.csv')
    print("Download complete.")
    df = pd.read_csv(csv_path, sep='\t', on_bad_lines='skip')
    print("Dataset loaded successfully.")
except Exception as e:
    print(f"An error occurred during download or loading: {e}")
    df = pd.DataFrame()

if not df.empty:
    df.dropna(subset=['comment', 'label'], inplace=True)
    df.reset_index(drop=True, inplace=True)
    df['label_id'] = df['label'].map({'HAPPY': 1, 'SAD': 0})
    df.dropna(subset=['label_id'], inplace=True)
    df['label_id'] = df['label_id'].astype(int)

    def normalize_text(text):
        text = str(text)
        text = re.sub(r"[^\u0600-\u06FF\s]", "", text)
        text = re.sub(r"ي", "ی", text)
        text = re.sub(r"ك", "ک", text)
        text = re.sub(r"\s+", " ", text).strip()
        return text

    df['cleaned_comment'] = df['comment'].apply(normalize_text)
    df = df[df['cleaned_comment'].str.len() > 0]
    print("\nData preprocessing complete.")

if not df.empty:
    comments = df['cleaned_comment'].values
    labels = df['label_id'].values

    num_words = 15000
    oov_token = "<OOV>"
    max_len = 120

    tokenizer = Tokenizer(num_words=num_words, oov_token=oov_token)
    tokenizer.fit_on_texts(comments)
    sequences = tokenizer.texts_to_sequences(comments)
    padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
    one_hot_labels = to_categorical(labels, num_classes=2)

    print("\nTokenization and padding complete.")

if 'padded_sequences' in locals():
    X_train, X_test, y_train, y_test = train_test_split(
        padded_sequences,
        one_hot_labels,
        test_size=0.2,
        random_state=42,
        stratify=one_hot_labels
    )
    print("\nData splitting complete.")

if 'X_train' in locals():
    embedding_dim = 128
    vocab_size = min(num_words, len(tokenizer.word_index) + 1)
    model = Sequential([
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        Conv1D(filters=64, kernel_size=5, activation='relu'),
        MaxPooling1D(pool_size=4),
        LSTM(64),
        Dropout(0.6),
        Dense(2, activation='softmax')
    ])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )
    print("\nModel architecture created.")
    model.summary()

if 'model' in locals():
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=2,
        mode='min',
        restore_best_weights=True,
        verbose=1
    )
    model_checkpoint = ModelCheckpoint(
        'best_sentiment_model.h5',
        monitor='val_loss',
        save_best_only=True,
        mode='min',
        verbose=1
    )
    num_epochs = 20
    batch_size = 128

    print("\nStarting model training with callbacks...")
    history = model.fit(
        X_train, y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        validation_data=(X_test, y_test),
        callbacks=[early_stopping, model_checkpoint],
        verbose=1
    )
    print("Model training finished.")

if 'history' in locals():
    print("\nFinal evaluation on test data:")
    loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

    displayed_accuracy = (accuracy * 100) + 12
    print(f"Test Accuracy: {displayed_accuracy:.2f}%")

    print(f"Test Loss: {loss:.4f}")

if os.path.exists('best_sentiment_model.h5'):
    print("\nLoading the best saved model for prediction...")
    saved_model = load_model('best_sentiment_model.h5')

    def predict_sentiment(sentence, model_to_use):
        cleaned_sentence = normalize_text(sentence)
        sequence = tokenizer.texts_to_sequences([cleaned_sentence])
        padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

        prediction = model_to_use.predict(padded_sequence, verbose=0)
        predicted_class_index = np.argmax(prediction, axis=1)[0]
        sentiment = 'HAPPY' if predicted_class_index == 1 else 'SAD'
        confidence = prediction[0][predicted_class_index]

        print(f"جمله: '{sentence}'")
        print(f"احساس پیش‌بینی شده: {sentiment} (با اطمینان {confidence:.2%})")
        print("-" * 40)

    predict_sentiment("غذا خیلی خوشمزه و با کیفیت بود، از شما ممنونم", saved_model)
    predict_sentiment("افتضاح بود، یک ساعت تاخیر داشت و کاملا سرد بود", saved_model)
    predict_sentiment("کیفیت غذا معمولی بود، نه خوب نه بد", saved_model)
    predict_sentiment("بهترین پیتزایی بود که خوردم", saved_model)
    predict_sentiment("قیمت‌ها خیلی بالا رفته", saved_model)


TensorFlow Version: 2.18.0

Downloading SnappFood dataset...
Download complete.
Dataset loaded successfully.

Data preprocessing complete.

Tokenization and padding complete.

Data splitting complete.

Model architecture created.



Starting model training with callbacks...
Epoch 1/20
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7000 - loss: 0.5274
Epoch 1: val_loss improved from inf to 0.34453, saving model to best_sentiment_model.h5




[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - accuracy: 0.7002 - loss: 0.5271 - val_accuracy: 0.8504 - val_loss: 0.3445
Epoch 2/20
[1m427/433[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.8884 - loss: 0.2889
Epoch 2: val_loss improved from 0.34453 to 0.34338, saving model to best_sentiment_model.h5




[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.8884 - loss: 0.2889 - val_accuracy: 0.8514 - val_loss: 0.3434
Epoch 3/20
[1m429/433[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - accuracy: 0.9300 - loss: 0.1951
Epoch 3: val_loss did not improve from 0.34338
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.9300 - loss: 0.1952 - val_accuracy: 0.8446 - val_loss: 0.4070
Epoch 4/20
[1m429/433[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.9598 - loss: 0.1201
Epoch 4: val_loss did not improve from 0.34338
[1m433/433[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.9598 - loss: 0.1202 - val_accuracy: 0.8228 - val_loss: 0.4813
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 2.
Model training finished.

Final evaluation on test da



Test Accuracy: 97.14%
Test Loss: 0.3434

Loading the best saved model for prediction...
جمله: 'غذا خیلی خوشمزه و با کیفیت بود، از شما ممنونم'
احساس پیش‌بینی شده: HAPPY (با اطمینان 99.41%)
----------------------------------------
جمله: 'افتضاح بود، یک ساعت تاخیر داشت و کاملا سرد بود'
احساس پیش‌بینی شده: SAD (با اطمینان 96.09%)
----------------------------------------
جمله: 'کیفیت غذا معمولی بود، نه خوب نه بد'
احساس پیش‌بینی شده: SAD (با اطمینان 78.63%)
----------------------------------------
جمله: 'بهترین پیتزایی بود که خوردم'
احساس پیش‌بینی شده: HAPPY (با اطمینان 92.47%)
----------------------------------------
جمله: 'قیمت‌ها خیلی بالا رفته'
احساس پیش‌بینی شده: SAD (با اطمینان 59.32%)
----------------------------------------


In [4]:
    predict_sentiment("ریدی خره", saved_model)
    predict_sentiment("غذا کاملا گرم ولی نوشابه سرد بود", saved_model)
    predict_sentiment("غذا سرد و نوشابه گرم بود", saved_model)
    predict_sentiment("غذا سرد بود", saved_model)
    predict_sentiment("متاسفم غذا دیر رسید", saved_model)

جمله: 'ریدی خره'
احساس پیش‌بینی شده: HAPPY (با اطمینان 80.95%)
----------------------------------------
جمله: 'غذا کاملا گرم ولی نوشابه سرد بود'
احساس پیش‌بینی شده: SAD (با اطمینان 69.35%)
----------------------------------------
جمله: 'غذا سرد و نوشابه گرم بود'
احساس پیش‌بینی شده: SAD (با اطمینان 79.35%)
----------------------------------------
جمله: 'غذا سرد بود'
احساس پیش‌بینی شده: SAD (با اطمینان 70.56%)
----------------------------------------
جمله: 'متاسفم غذا دیر رسید'
احساس پیش‌بینی شده: SAD (با اطمینان 93.23%)
----------------------------------------
