In [3]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import emoji
import ast
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModel
import torch

# Load and clean data
def wrangle_reviews(df):
    """Return a cleaned copy of a reviews DataFrame.

    Steps, in order:
      1. Drop the "Unnamed: 0" column if it is present.
      2. Drop rows whose 'review_description' duplicates an earlier row.
      3. Drop rows whose 'review_description' is missing.
      4. Cast 'review_description' to str and replace every ASCII/Arabic
         punctuation character with a single space.
      5. Drop rows whose text is empty, or becomes empty once emojis are
         removed, and reset the index.

    The input frame is NOT modified; callers must use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'review_description' column.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index.
    """
    # Raw string: the set contains a literal backslash before ']'. In a
    # non-raw literal "\]" is an invalid escape sequence and triggers a
    # SyntaxWarning; the raw form yields the identical characters.
    punctuation = r"""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~؛"""
    # Build the character class once instead of once per row.
    punct_re = re.compile('[%s]' % re.escape(punctuation))

    def _is_only_emoji(text):
        """True when nothing but whitespace/emojis is left in ``text``."""
        text = str(text).strip()
        return not text or emoji.replace_emoji(text, '').strip() == ''

    # Non-inplace chain so the caller's frame is left untouched.
    deduped = (
        df.drop(columns=["Unnamed: 0"], errors='ignore')
        .drop_duplicates(subset=['review_description'], keep='first')
        .dropna(subset=['review_description'])
    )

    cleaned_text = deduped['review_description'].astype(str).map(
        lambda text: punct_re.sub(' ', text)
    )
    cleaned = deduped.assign(review_description=cleaned_text)

    # astype(bool) keeps the mask boolean even when the frame is empty.
    keep_mask = ~cleaned_text.map(_is_only_emoji).astype(bool)
    return cleaned[keep_mask].reset_index(drop=True)

# Load data
df_raw = pd.read_csv("../../data/embedded_dataset.csv")
df = wrangle_reviews(df_raw)

# Convert string embeddings to list
# (each cell is parsed as a Python literal, so it must hold a list literal)
df['embedding-using-cls-MARBERT'] = df['embedding-using-cls-MARBERT'].apply(ast.literal_eval)

# Prepare training data
X = np.array(df['embedding-using-cls-MARBERT'].tolist())
y = df['rating'].map({-1: 0, 1: 1})  # Binary classification

# Series.map leaves NaN for any rating that is not a key of the dict above.
# Fail here with a clear message instead of an obscure error inside fit().
if y.isna().any():
    unexpected_ratings = df.loc[y.isna(), 'rating'].unique().tolist()
    raise ValueError(
        "Expected 'rating' to contain only -1 or 1; found unmapped values: %r"
        % (unexpected_ratings,)
    )

# Train/test split (fixed random_state so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out 20%
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Save model
with open("trained_model.sav", "wb") as f:
    pickle.dump(model, f)

# -----------------------------------------------
# MARBERT: tokenizer + model used to embed new text
# -----------------------------------------------

# Both objects are loaded a single time from the same local directory and
# are reused afterwards as the module-level names `tokenizer` / `marbert`.
MARBERT_DIR = "local_marbert/"
tokenizer = AutoTokenizer.from_pretrained(MARBERT_DIR)
marbert = AutoModel.from_pretrained(MARBERT_DIR)



  lambda x: re.sub(r'[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~؛"""), ' ', x)


              precision    recall  f1-score   support

           0       0.83      0.81      0.82      2632
           1       0.87      0.89      0.88      3837

    accuracy                           0.86      6469
   macro avg       0.85      0.85      0.85      6469
weighted avg       0.86      0.86      0.86      6469

Prediction Negative


In [5]:
def embed_review(text):
    """Embed ``text`` with MARBERT and return its CLS-token vector.

    Uses the module-level ``tokenizer`` and ``marbert`` objects. The text is
    tokenized (truncated to at most 128 tokens), run through the model with
    gradient tracking disabled, and the hidden state at sequence position 0
    (the CLS token) is returned as a NumPy array with size-1 dimensions
    squeezed away.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        hidden_states = marbert(**encoded).last_hidden_state

    # Position 0 along the sequence axis holds the CLS token.
    first_token = hidden_states[:, 0, :]
    return first_token.squeeze().numpy()

# Load model
# NOTE: pickle.load can execute arbitrary code while unpickling; only load a
# file that was written by the training cell of this notebook.
# The `with` block closes the file handle once the model has been read.
with open("trained_model.sav", "rb") as model_file:
    model = pickle.load(model_file)

# New input
new_review = "خوش اكل"
new_embedding = embed_review(new_review).reshape(1, -1)  # Reshape for sklearn
prediction = model.predict(new_embedding)

# Output prediction (label 1 is reported as positive, anything else as negative)
print("Prediction", "Positive" if prediction[0] == 1 else "Negative")


Prediction Negative
