### Load the Dataset

In [114]:
import pandas as pd
from keras.models import Sequential,load_model
from keras.layers import Dense, Dropout, Input, BatchNormalization
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import ModelCheckpoint
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.isri import ISRIStemmer
import gensim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [115]:
df=pd.read_csv(f'Dataset.tsv', sep='\t')  
df 

Unnamed: 0,label,text
0,Positive,ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...
1,Positive,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...
2,Positive,هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...
3,Positive,خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...
4,Positive,ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...
...,...,...
99994,Negative,معرفش ليه كنت عاوزة أكملها وهي مش عاجباني من ا...
99995,Negative,لا يستحق ان يكون في بوكنق لانه سيئ . لا شي. لا...
99996,Negative,كتاب ضعيف جدا ولم استمتع به. فى كل قصه سرد لحا...
99997,Negative,مملة جدا. محمد حسن علوان فنان بالكلمات، والوصف...


#### Arabic Words Preprocessing

In [116]:
# Arabic stopwords
nltk.download('stopwords')
arabic_stopwords = set(stopwords.words('arabic'))
stemmer = ISRIStemmer()

def preprocess_arabic_text(text):
    text = re.sub(r'[\u064B-\u0652]', '', text)  
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)  
    text = re.sub(r'\d+', '', text)  
    
    tokens = text.split()
    tokens = [word for word in tokens if word not in arabic_stopwords]
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

# Test
sample_text = "أحب البرمجة والتعلم في هذا العصر الحديث."
cleaned_text = preprocess_arabic_text(sample_text)
print(cleaned_text)



احب رمج علم عصر حدث


[nltk_data] Downloading package stopwords to /home/seif/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Loading Aravec Model

In [117]:
t_model = gensim.models.Word2Vec.load('models/full_grams_cbow_300_twitter.mdl')

In [118]:
def preprocess_embedd_arabic_text(text):
    text = re.sub(r'[\u064B-\u0652]', '', text)  
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text) 
    text = re.sub(r'\d+', '', text)  

    tokens = text.split()
    filtered_sentence=[]
    for word in tokens:
        if word not in arabic_stopwords:
            stemmed_word = stemmer.stem(word)
            filtered_sentence.append(stemmed_word)
    if len(filtered_sentence) == 0:
        return np.zeros(t_model.vector_size)
    return t_model.wv.get_mean_vector(filtered_sentence)

In [119]:
df['text_vector'] = df['text'].apply(preprocess_embedd_arabic_text)

In [120]:
df = df[df['text_vector'].apply(lambda x: not np.array_equal(x, np.zeros(t_model.vector_size)))]
df

Unnamed: 0,label,text,text_vector
0,Positive,ممتاز نوعا ما . النظافة والموقع والتجهيز والشا...,"[0.03075951, -0.023665147, -0.0020882313, 0.00..."
1,Positive,أحد أسباب نجاح الإمارات أن كل شخص في هذه الدول...,"[0.032468032, -0.0050334274, 0.013271037, 0.00..."
2,Positive,هادفة .. وقوية. تنقلك من صخب شوارع القاهرة الى...,"[0.03640272, -0.0052244263, 0.0027546007, -0.0..."
3,Positive,خلصنا .. مبدئيا اللي مستني ابهار زي الفيل الاز...,"[0.039948247, -0.0121723525, 0.00025296805, 0...."
4,Positive,ياسات جلوريا جزء لا يتجزأ من دبي . فندق متكامل...,"[0.0050339643, -0.0018979385, 0.0238677, 0.033..."
...,...,...,...
99994,Negative,معرفش ليه كنت عاوزة أكملها وهي مش عاجباني من ا...,"[0.021896569, -0.028480032, 0.016720144, -0.00..."
99995,Negative,لا يستحق ان يكون في بوكنق لانه سيئ . لا شي. لا...,"[0.057995524, -0.018431652, 0.01127016, -0.000..."
99996,Negative,كتاب ضعيف جدا ولم استمتع به. فى كل قصه سرد لحا...,"[0.020529604, 0.014629706, -0.011728142, 0.011..."
99997,Negative,مملة جدا. محمد حسن علوان فنان بالكلمات، والوصف...,"[0.03499831, 0.0057043405, 0.007058305, 0.0093..."


In [121]:
df['sentiment_id'] = df['label'].map({'Negative': 0, 'Positive': 1, 'Mixed': 2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment_id'] = df['label'].map({'Negative': 0, 'Positive': 1, 'Mixed': 2})


In [122]:
df['sentiment_id']

0        1
1        1
2        1
3        1
4        1
        ..
99994    0
99995    0
99996    0
99997    0
99998    0
Name: sentiment_id, Length: 99836, dtype: int64

## Model Training

In [123]:
X_train, X_test, y_train, y_test = train_test_split(df['text_vector'], df['sentiment_id'], test_size=0.1)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(df['label'])
model = Sequential()
model.add(Input(shape=(t_model.vector_size,)))  # Input layer
model.add(BatchNormalization())
model.add(Dense(256, activation='relu'))  
model.add(Dropout(0.2))  # Dropout layer
model.add(Dense(128, activation='relu'))  # Hidden layer
model.add(Dropout(0.5))  # Dropout layer
model.add(Dense(64, activation='relu'))  # Hidden layer
model.add(Dense(len(np.unique(labels_encoded)), activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
checkpoint = ModelCheckpoint('best_model.keras', monitor='val_accuracy', save_best_only=True, mode='max', verbose=1)


In [124]:
model.summary()

In [125]:
X_train = np.stack(X_train)
X_test = np.stack(X_test)

In [126]:

model.fit(X_train, y_train, epochs=15, batch_size=60, validation_split=0.1, callbacks=[checkpoint])


Epoch 1/15
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5084 - loss: 0.9682
Epoch 1: val_accuracy improved from -inf to 0.59404, saving model to best_model.keras
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.5084 - loss: 0.9682 - val_accuracy: 0.5940 - val_loss: 0.8539
Epoch 2/15
[1m1341/1348[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - accuracy: 0.5826 - loss: 0.8684
Epoch 2: val_accuracy improved from 0.59404 to 0.60160, saving model to best_model.keras
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.5826 - loss: 0.8684 - val_accuracy: 0.6016 - val_loss: 0.8428
Epoch 3/15
[1m1325/1348[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - accuracy: 0.5927 - loss: 0.8476
Epoch 3: val_accuracy improved from 0.60160 to 0.61128, saving model to best_model.keras
[1m1348/1348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s

<keras.src.callbacks.history.History at 0x7ba296b1bb90>

In [127]:
model = load_model('best_model.keras')
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  

print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))

[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 817us/step - accuracy: 0.6049 - loss: 0.8310
Test Loss: 0.8310883641242981, Test Accuracy: 0.6073718070983887
[1m312/312[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 869us/step
              precision    recall  f1-score   support

       Mixed       0.63      0.68      0.66      3356
    Negative       0.64      0.67      0.65      3290
    Positive       0.54      0.47      0.50      3338

    accuracy                           0.61      9984
   macro avg       0.60      0.61      0.60      9984
weighted avg       0.60      0.61      0.60      9984

