<a href="https://colab.research.google.com/github/janaghoniem/Social-Media-Sentiment-Analysis/blob/main/arabic_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string

import nltk
import pandas as pd
import plotly.express as px
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# camel_tools is not preinstalled on Colab (install it with `%pip install camel-tools`).
# It is imported last so that a missing package does not stop the imports above from loading.
from camel_tools.tokenizers.word import simple_word_tokenize

ModuleNotFoundError: No module named 'camel_tools'

In [None]:
# NLTK ships without data files. word_tokenize() (used by clean_text) needs the
# Punkt sentence-tokenizer models; newer NLTK releases look them up under the
# name 'punkt_tab', older ones under 'punkt', so both are fetched.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
df = pd.read_csv('arabic_reviews.csv')
# Remove 'Mixed' class.
# Labels are stripped before the comparison because the encoding cell strips
# them too; a padded ' Mixed' would otherwise survive this filter.
# .copy() makes the result an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning.
df = df[df['label'].str.strip() != 'Mixed'].copy()

In [None]:
# Render the filtered frame for a quick visual check
df

In [None]:
# Number of rows per sentiment label
df['label'].value_counts()

In [None]:
# Bar chart of the label counts, to eyeball class imbalance
px.histogram(df, x="label") #check for imbalance

In [None]:
# Count rows that exactly repeat an earlier row (nothing is dropped here)
df.duplicated().sum() #check duplicates

In [None]:
# Missing values per column
df.isnull().sum()

In [None]:
# Distinct label values that remain after the 'Mixed' rows were filtered out
print(df['label'].unique())

# PREPROCESSING

In [None]:
# Hand-picked Arabic stop words. The NLTK Arabic stop-word list is not used;
# this short set replaces it.
stop_words = {
    'في', 'من', 'إلى', 'على', 'أن',
    'لا', 'ما', 'هذا', 'هذه', 'ذلك',
    'كان', 'يكون', 'هو', 'هي', 'مع',
    'بين', 'عن', 'و', 'أو', 'إذ', 'إذا',
}

# Lookup table from dialect words to a Modern Standard Arabic replacement,
# grouped by dialect. dialect_to_msa() looks up one token at a time.
# NOTE(review): the key "شكو ماكو" contains a space, so a per-token lookup can
# presumably never hit it - confirm how simple_word_tokenize splits the text.
DIALECT_MSA_MAP = {
    # Egyptian
    "مش": "ليس",
    "دلوقتي": "الآن",
    "إزاي": "كيف",
    "كده": "هكذا",

    # Levantine
    "شو": "ماذا",
    "ليش": "لماذا",
    "كتير": "كثيراً",
    "مافي": "لا يوجد",

    # Gulf
    "وايد": "كثيراً",
    "زين": "جيد",
    "شسالفة": "ما القصة",
    "ايوا": "نعم",

    # Moroccan / Maghrebi
    "بزاف": "كثيراً",
    "واش": "هل",
    "شنو": "ماذا",
    "دابا": "الآن",

    # Iraqi
    "هسة": "الآن",
    "ماكو": "لا يوجد",
    "شكو ماكو": "ما الأخبار؟",
    "شلونك": "كيف حالك",
}

In [None]:
# Arabic-specific punctuation and symbols, concatenated with ASCII punctuation
# into a single string of characters.
# NOTE(review): punctuations_list is not referenced by any cell visible in this
# notebook - confirm whether it is still needed.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

In [None]:
def dialect_to_msa(text):
    """Swap dialect tokens for their DIALECT_MSA_MAP replacement.

    The text is split with simple_word_tokenize; tokens without an entry in
    the map are kept unchanged. Tokens are re-joined with single spaces.
    """
    converted = []
    for token in simple_word_tokenize(text):
        converted.append(DIALECT_MSA_MAP.get(token, token))
    return ' '.join(converted)


In [None]:
def remove_gibberish(text):
    """Drop every character outside the allowed set.

    Kept: the Arabic Unicode ranges listed in the pattern, whitespace, and the
    marks . , ! ? and the Arabic question mark. Everything else (Latin letters,
    ASCII digits, other symbols) is deleted.
    """
    disallowed = r'[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s.,!?؟]'
    cleaned = re.sub(disallowed, '', text)
    return cleaned


In [None]:
# Arabic combining marks plus tatweel, written as code points so the pattern
# stays readable and does not depend on how an editor renders combining marks:
#   U+0610-U+061A  honorific / Quranic signs
#   U+064B-U+0652  tanween, fatha, damma, kasra, shadda, sukun
#   U+0656-U+065F  additional vowel signs
#   U+0670         superscript (dagger) alef
#   U+06D6-U+06ED  Quranic annotation marks (combining ones only)
#   U+0640         tatweel (kashida)
# U+0653-U+0655 (combining maddah / hamza) are deliberately left alone because
# they change the identity of the base letter.
_ARABIC_DIACRITICS_RE = re.compile(
    r"[\u0610-\u061A\u064B-\u0652\u0656-\u065F\u0670"
    r"\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0640]"
)


def remove_diacritics(text):
    """Strip Arabic diacritics (harakat) and tatweel from ``text``.

    Args:
        text: Value to clean; it is converted with ``str()`` first, so
            non-string input is accepted.

    Returns:
        The text with every character matched by ``_ARABIC_DIACRITICS_RE``
        removed.
    """
    return _ARABIC_DIACRITICS_RE.sub('', str(text))


In [None]:
# One or more emoji code points, each optionally followed by the variation
# selector (U+FE0F) or zero-width joiner (U+200D) that glue emoji sequences
# together. Joiners are only removed when they trail an emoji, so a ZWJ used
# inside ordinary text is left untouched.
_EMOJI_RE = re.compile(
    "(?:"
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs (incl. skin-tone modifiers)
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\u2600-\u26FF"          # miscellaneous symbols
    "\u2700-\u27BF"          # dingbats
    "]"
    "[\uFE0F\u200D]*"
    ")+"
)


def remove_emoji(text):
    """Delete emoji from ``text``.

    Args:
        text: String to clean.

    Returns:
        ``text`` with every run matched by ``_EMOJI_RE`` removed; all other
        characters are returned unchanged.
    """
    return _EMOJI_RE.sub('', text)


In [None]:
def normalize_elongation(text):
    """Shorten any run of a repeated character to exactly two occurrences.

    Runs of length two are left as they are; single characters are untouched.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1\1', text)


In [None]:
def normalize_letters(text):
    """Unify letter variants: alef forms to bare alef, alef maqsura to yeh,
    and hamza-on-waw / hamza-on-yeh to a standalone hamza.

    The substitutions are applied in the order listed.
    """
    substitutions = (
        ("[إأٱآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


In [None]:
def clean_punctuation(text):
    """Remove symbols while keeping word characters, whitespace and four marks.

    Kept marks: Arabic comma, full stop, Arabic question mark, exclamation
    mark. Runs of two or more dots collapse to one dot, whitespace runs
    collapse to a single space, and the result is stripped at both ends.
    """
    preserved = '،.؟!'

    # Anything that is neither a word character, whitespace, nor preserved goes
    without_symbols = re.sub(rf'[^\w\s{preserved}]', '', text)

    # "..." and longer become a single "."
    single_dots = re.sub(r'\.{2,}', '.', without_symbols)

    return re.sub(r'\s+', ' ', single_dots).strip()

In [None]:
# pipeline

# Stop words pushed through the same letter normalization as the review text.
# Tokens are compared AFTER normalize_letters() has rewritten hamza / alef
# maqsura forms, so entries such as 'إلى', 'على' or 'أن' would never match in
# their original spelling.
# Re-run this cell if `stop_words` is edited.
_NORMALIZED_STOP_WORDS = frozenset(
    normalize_letters(remove_diacritics(word)) for word in stop_words
)


def clean_text(text):
    """Run one review through the full cleaning pipeline.

    Steps, in order: drop ASCII punctuation, drop non-Arabic characters, drop
    emoji, shorten elongated letters, strip diacritics, normalize letter
    variants, tidy remaining punctuation and whitespace, tokenize with
    word_tokenize, and remove stop words.

    Args:
        text: Raw review text. Non-string values (e.g. NaN from a missing
            cell) are treated as an empty review.

    Returns:
        The cleaned tokens joined by single spaces; '' for non-string input.
    """
    if not isinstance(text, str):
        return ''

    text = "".join(char for char in text if char not in string.punctuation)
    # text = dialect_to_msa(text)
    text = remove_gibberish(text)
    text = remove_emoji(text)
    text = normalize_elongation(text)
    text = remove_diacritics(text)
    text = normalize_letters(text)
    text = clean_punctuation(text)
    tokens = word_tokenize(text)
    return ' '.join(token for token in tokens if token not in _NORMALIZED_STOP_WORDS)


In [None]:
# Clean every review; the raw 'text' column is kept alongside the new one
df['cleanedtext'] = df['text'].apply(clean_text)

In [None]:
# Inspect the frame with the new 'cleanedtext' column
df

In [None]:
# encode label
label_mapping = {'Negative': -1, 'Positive': 1}

# Only encode while the column still holds text. Re-running the cell after the
# labels are numeric would otherwise fail on the .str accessor.
if not pd.api.types.is_numeric_dtype(df['label']):
    df['label'] = df['label'].str.strip().map(label_mapping)

# .map() turns every label missing from label_mapping into NaN; fail here
# rather than let a NaN class reach the models.
unmapped_count = int(df['label'].isna().sum())
if unmapped_count:
    raise ValueError(
        f"{unmapped_count} row(s) have a label outside {sorted(label_mapping)}"
    )

df

In [None]:
#TF-IDF vector
# Default-configured TF-IDF over the cleaned text; y holds the encoded labels
# in the same row order.
# NOTE(review): vectorizer and x_tfidf are rebound in the feed-forward section below.
vectorizer = TfidfVectorizer()
x_tfidf = vectorizer.fit_transform(df['cleanedtext'])
y = df['label']

In [None]:
# Inspect the frame after label encoding
df

In [None]:
# Text dump of the TF-IDF matrix returned by fit_transform
print(x_tfidf)

# Feed-Forward Neural Network

In [None]:
# Vectorization
# The neural-network cells refer to the data as `arabic_df`, a name that no
# earlier cell defines. Bind it to the cleaned, label-encoded frame so the
# notebook runs from a fresh kernel.
arabic_df = df

vectorizer = TfidfVectorizer(max_features=5000)  # or 10,000 if your data is big
x_tfidf = vectorizer.fit_transform(arabic_df['cleanedtext'])

In [None]:
# Labels
# LabelEncoder turns the -1 / 1 labels into integer class indices;
# to_categorical one-hot encodes those indices for the softmax output layer.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(arabic_df['label'])
y_cat = to_categorical(y)

In [None]:
# Train/Test Split
# 80/20 split with a fixed seed. NOTE(review): the split is not stratified by label.
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y_cat, test_size=0.2, random_state=42)

In [None]:
# Convert both sparse splits to dense arrays (required by Keras)
x_train_dense, x_test_dense = x_train.toarray(), x_test.toarray()

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

# Stop after 3 epochs without a better validation loss and restore the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Two L2-regularized hidden layers, each followed by dropout, then a softmax
# with one unit per one-hot label column
model = Sequential()
model.add(Input(shape=(x_train_dense.shape[1],)))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(y_cat.shape[1], activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# 10% of the training split is held out for validation, which drives early stopping
history = model.fit(
    x_train_dense,
    y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop],
)

Epoch 1/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 17ms/step - accuracy: 0.7707 - loss: 0.6043 - val_accuracy: 0.8311 - val_loss: 0.5172
Epoch 2/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 14ms/step - accuracy: 0.8409 - loss: 0.5133 - val_accuracy: 0.8382 - val_loss: 0.5049
Epoch 3/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - accuracy: 0.8401 - loss: 0.5084 - val_accuracy: 0.8391 - val_loss: 0.5083
Epoch 4/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 15ms/step - accuracy: 0.8466 - loss: 0.5029 - val_accuracy: 0.8335 - val_loss: 0.5090
Epoch 5/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 14ms/step - accuracy: 0.8508 - loss: 0.4959 - val_accuracy: 0.8416 - val_loss: 0.5056


In [None]:
from sklearn.metrics import accuracy_score, f1_score

# Predict on test set
# argmax(axis=1) picks the highest-probability class per row for the
# predictions and recovers the class index from the one-hot targets.
y_test_pred = model.predict(x_test_dense)
y_test_pred_classes = y_test_pred.argmax(axis=1)
y_test_true = y_test.argmax(axis=1)

# Accuracy and F1
acc = accuracy_score(y_test_true, y_test_pred_classes)
f1 = f1_score(y_test_true, y_test_pred_classes, average='weighted')  # or 'macro'

print(f"Test Accuracy: {acc:.4f}")
print(f"Weighted F1 Score: {f1:.4f}")

[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step
Test Accuracy: 0.8332
Weighted F1 Score: 0.8332


In [None]:
def predict_sentiment(text):
    """Classify one raw review with the TF-IDF feed-forward network and print it.

    The review goes through clean_text() before vectorization because
    `vectorizer` was fitted on the cleaned column; raw text that still carries
    diacritics or hamza forms would otherwise miss the learned vocabulary.

    Args:
        text: Raw review string.

    Prints the original text followed by the predicted class name.
    """
    features = vectorizer.transform([clean_text(text)]).toarray()
    probabilities = model.predict(features)
    class_idx = probabilities.argmax()

    # label_encoder converts the argmax index back to the stored label (-1 or 1),
    # which is then mapped to a readable name
    index_to_label = {-1: "negative", 1: "positive"}
    original_label = label_encoder.inverse_transform([class_idx])[0]
    label_name = index_to_label[original_label]
    print(text)
    print(f"Predicted Class: {label_name}")

In [None]:
# Smoke-test the classifier on three hand-written reviews
for sample_review in ("المنتج سيء للغاية", "أحببت هذا الفيلم كثيرًا", "رائع"):
    predict_sentiment(sample_review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step
المنتج سيء للغاية
Predicted Class: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
أحببت هذا الفيلم كثيرًا
Predicted Class: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
رائع
Predicted Class: positive


# LSTM/GRU Model

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GRU, Bidirectional
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score

# Shared by the tokenizer / padding and the Embedding layer so the two cannot drift apart
VOCAB_SIZE = 10000
MAX_LEN = 100  # You can adjust the padded sequence length

# Step 1: Tokenization
tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(arabic_df['cleanedtext'])

sequences = tokenizer.texts_to_sequences(arabic_df['cleanedtext'])
padded = pad_sequences(sequences, maxlen=MAX_LEN)

# Step 2: Labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(arabic_df['label'])  # assumes labels are -1, 1
y_cat = to_categorical(y)

# Step 3: Train/Test split
x_train, x_test, y_train, y_test = train_test_split(padded, y_cat, test_size=0.2, random_state=42)

# Step 4: Model
# `input_length` is no longer passed to Embedding: recent Keras releases
# deprecate it, and the layer infers the sequence length from its input.
model = Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=128),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(32)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.4),
    Dense(y_cat.shape[1], activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Step 5: Training
# Stop after 3 epochs without a better validation loss and restore the best weights
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    x_train, y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop]
)

# Step 6: Evaluation
y_pred = model.predict(x_test).argmax(axis=1)
y_true = y_test.argmax(axis=1)

print("Test Accuracy:", accuracy_score(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred, average='weighted'))

Epoch 1/20




[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m386s[0m 251ms/step - accuracy: 0.7373 - loss: 0.4948 - val_accuracy: 0.8526 - val_loss: 0.3315
Epoch 2/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 249ms/step - accuracy: 0.8904 - loss: 0.2734 - val_accuracy: 0.8474 - val_loss: 0.3466
Epoch 3/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m382s[0m 248ms/step - accuracy: 0.9214 - loss: 0.1988 - val_accuracy: 0.8412 - val_loss: 0.4231
Epoch 4/20
[1m1500/1500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 247ms/step - accuracy: 0.9437 - loss: 0.1550 - val_accuracy: 0.8416 - val_loss: 0.4418
[1m417/417[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 64ms/step
Test Accuracy: 0.8499212480312007
F1 Score: 0.8497950272741241


In [None]:
def predict_sentiment(text):
    """Classify one raw review with the sequence (BiLSTM) model and print it.

    The model in this section was trained on padded token-id sequences, so the
    review is cleaned with clean_text(), converted with the fitted `tokenizer`
    and padded to the training length. Feeding it TF-IDF vectors, as before,
    gives the model input it was never trained on.

    Args:
        text: Raw review string.

    Prints the original text followed by the predicted class name.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(text)])
    # Reuse the width of the training matrix so inference always matches training
    model_input = pad_sequences(token_ids, maxlen=padded.shape[1])
    pred = model.predict(model_input)
    class_idx = pred.argmax()

    # label_encoder converts the argmax index back to the stored label (-1 or 1),
    # which is then mapped to a readable name
    index_to_label = {-1: "negative", 1: "positive"}
    original_label = label_encoder.inverse_transform([class_idx])[0]
    label_name = index_to_label[original_label]
    print(text)
    print(f"Predicted Class: {label_name}")

In [None]:
# Smoke-test the classifier on three hand-written reviews
for sample_review in ("المنتج سيء للغاية", "أحببت هذا الفيلم كثيرًا", "رائع"):
    predict_sentiment(sample_review)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
المنتج سيء للغاية
Predicted Class: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
أحببت هذا الفيلم كثيرًا
Predicted Class: negative
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
رائع
Predicted Class: negative


# MODEL TRAINING

In [None]:
# Hold out 10% of the TF-IDF rows for testing (fixed seed)
x_train, x_test, y_train, y_test = train_test_split(x_tfidf, y, test_size=0.1, random_state=42)

# Train classifier; fit() returns the estimator, so construction and training chain
model = LogisticRegression(max_iter=1000).fit(x_train, y_train)

# Predict and evaluate
y_pred = model.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

# ARABERT TEST