# Experiment 1 

# TF-IDF + FeedForward Neural Network

In [3]:
import pandas as pd
import re
import random
import numpy as np

In [4]:
# Load the raw conversation corpus from the local CSV export.
csv_path = 'C:\\Users\\ASUS\\Desktop\\arabic-empathetic-conversations.csv'
df = pd.read_csv(csv_path)

In [5]:
# Count the rows in which every column is null and report the total.
fully_empty_rows = df[df.isnull().all(axis=1)]
print("عدد الصفوف الفارغة:", len(fully_empty_rows))

عدد الصفوف الفارغة: 0


In [6]:
# The emotion label is not used by this experiment, so drop the column.
df = df.drop(columns='emotion')

In [7]:
# Use Question/Answer as the column names for the rest of the notebook.
qa_column_names = {'context': 'Question', 'response': 'Answer'}
df = df.rename(columns=qa_column_names)

df.head()

Unnamed: 0,Question,Answer
0,أتذكر أنني ذهبت لمشاهدة الألعاب النارية مع أعز...,هل كان هذا صديقًا كنت تحبه أم مجرد أفضل صديق؟
1,كان هذا أفضل صديق. اشتقت لها.,اين ذهبت؟
2,لم نعد نتحدث.,هل كان هذا شيء حدث بسبب جدال؟
3,أشعر وكأنني ضرب على جدار فارغ عندما أرى الظلام,أجل؟ أنا حقا لا أرى كيف
4,ألا تشعر بذلك .. إنه لأمر عجيب,أصطدم في الواقع بجدران فارغة في كثير من الأحيا...


In [8]:
import re
def clean_text(text):
  """
  Remove dates, punctuation and digits from ``text``.

  Args:
    text: The value to clean. Non-string values are converted with ``str()``
      first, so a stray number or NaN does not raise ``TypeError``.

  Returns:
    The cleaned string. Whitespace is left untouched, so removed tokens may
    leave consecutive spaces behind.
  """
  text = str(text)
  # Dates must be matched first: once '/' and the digits are stripped by the
  # steps below, a d/m/y pattern can no longer be recognised.
  text = re.sub(r'\b\d{1,2}/\d{1,2}/\d{2,4}\b', '', text)
  # Drop everything that is neither a word character nor whitespace.
  text = re.sub(r'[^\w\s]', '', text)
  # Drop the remaining digit runs.
  text = re.sub(r'\d+', '', text)
  return text



# Clean both text columns; values are cast to str first so clean_text always
# receives a string.
for column in ('Question', 'Answer'):
    df[column] = df[column].astype(str).map(clean_text)

df.head()


Unnamed: 0,Question,Answer
0,أتذكر أنني ذهبت لمشاهدة الألعاب النارية مع أعز...,هل كان هذا صديقا كنت تحبه أم مجرد أفضل صديق
1,كان هذا أفضل صديق اشتقت لها,اين ذهبت
2,لم نعد نتحدث,هل كان هذا شيء حدث بسبب جدال
3,أشعر وكأنني ضرب على جدار فارغ عندما أرى الظلام,أجل أنا حقا لا أرى كيف
4,ألا تشعر بذلك إنه لأمر عجيب,أصطدم في الواقع بجدران فارغة في كثير من الأحيا...


In [9]:
def remove_tashkeel(text):
    """
    Strip Arabic diacritic marks in the range U+064B-U+065F from ``text``.

    Non-string input is converted with ``str()`` before the marks are removed,
    so the function always returns a string.
    """
    value = text if isinstance(text, str) else str(text)
    return re.sub(r'[\u064B-\u065F]', '', value)

# DataFrame.applymap is deprecated in recent pandas releases. Mapping each
# column with Series.map applies the function element-wise in exactly the same
# way and works on old and new pandas versions alike.
df[['Question', 'Answer']] = df[['Question', 'Answer']].apply(
    lambda column: column.map(remove_tashkeel)
)


  df[['Question' , 'Answer']] = df[['Question' , 'Answer']].applymap(remove_tashkeel)


In [10]:
!pip install nltk --quiet
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _arabic_stop_words():
    """Return NLTK's Arabic stop-word list as a frozenset, built once and cached."""
    return frozenset(stopwords.words('arabic'))


def remove_stopwords(text):
    """
    Remove Arabic stop words from a whitespace-separated string.

    Args:
        text: The value to filter. Non-string values are returned unchanged.

    Returns:
        The remaining words joined by single spaces, or ``text`` itself when it
        is not a string.
    """
    if not isinstance(text, str):
        return text
    # The stop-word set is loop-invariant: it is loaded once and reused instead
    # of being rebuilt from the corpus for every cell of the DataFrame.
    stop_words = _arabic_stop_words()
    return " ".join(word for word in text.split() if word not in stop_words)


In [12]:
# DataFrame.applymap is deprecated in recent pandas releases. Series.map on
# each column is the element-wise equivalent and works on every pandas version.
df[['Question', 'Answer']] = df[['Question', 'Answer']].apply(
    lambda column: column.map(remove_stopwords)
)


def remove_punctuation(text):
    """
    Delete every character that is neither a word character nor whitespace.

    Values that are not strings are handed back untouched.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'[^\w\s]', '', text)


  df[['Question', 'Answer']] = df[['Question', 'Answer']].applymap(remove_stopwords)


In [13]:
# DataFrame.applymap is deprecated in recent pandas releases. Series.map on
# each column is the element-wise equivalent and works on every pandas version.
df[['Question', 'Answer']] = df[['Question', 'Answer']].apply(
    lambda column: column.map(remove_punctuation)
)

  df[['Question', 'Answer']] = df[['Question', 'Answer']].applymap(remove_punctuation)


In [14]:
def _sample_negatives(all_responses, correct_response, num_negatives):
    """Pick ``num_negatives`` answers from distinct rows that differ from ``correct_response``."""
    total = len(all_responses)
    picked = []
    tried = set()
    # Rejection sampling over row positions: cheap when most answers differ
    # from the correct one, and avoids copying the whole answer list per row.
    for _ in range(20 * num_negatives + 20):
        if len(picked) == num_negatives:
            break
        idx = random.randrange(total)
        if idx in tried:
            continue
        tried.add(idx)
        if all_responses[idx] != correct_response:
            picked.append(idx)
    if len(picked) == num_negatives:
        return [all_responses[i] for i in picked]
    # Rare fallback (few usable candidates): exact filtering. random.sample
    # raises ValueError when there are fewer candidates than requested.
    candidates = [r for r in all_responses if r != correct_response]
    return random.sample(candidates, num_negatives)


def generate_response_candidates(df, num_negatives=2):
    """
    Build a labelled (question, answer) dataset for response ranking.

    For every row of ``df`` the row's own answer is emitted with label 1,
    followed by ``num_negatives`` answers sampled from other rows with label 0.
    A sampled negative is never equal to the row's own answer.

    Args:
        df: DataFrame with 'Question' and 'Answer' columns.
        num_negatives: Number of negative answers to attach to each question.

    Returns:
        DataFrame with columns 'Question', 'Answer' and 'label'.

    Raises:
        ValueError: If fewer than ``num_negatives`` differing answers exist.
    """
    all_responses = df['Answer'].tolist()
    data = []

    for question, correct_response in zip(df['Question'].tolist(), all_responses):
        data.append((question, correct_response, 1))
        for neg in _sample_negatives(all_responses, correct_response, num_negatives):
            data.append((question, neg, 0))

    return pd.DataFrame(data, columns=['Question', 'Answer', 'label'])

# Seed Python's RNG so the sampled negatives (and every result derived from
# them) are the same on each Restart & Run All.
random.seed(42)
pairs_df = generate_response_candidates(df, num_negatives=2)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One shared vocabulary for questions and answers, capped at 3000 terms.
vectorizer = TfidfVectorizer(max_features=3000)
vectorizer.fit(pairs_df['Question'].tolist() + pairs_df['Answer'].tolist())


def get_vector(text):
    """Return the dense TF-IDF vector (1-D array) of a single string."""
    return vectorizer.transform([text]).toarray()[0]


# Vectorise each column with a single batched transform instead of one
# transform call per row inside iterrows(); the values are identical.
q_matrix = vectorizer.transform(pairs_df['Question'].tolist())
r_matrix = vectorizer.transform(pairs_df['Answer'].tolist())

# Each sample is the question vector followed by the answer vector.
X = np.hstack([q_matrix.toarray(), r_matrix.toarray()])
y = pairs_df['label'].to_numpy()


In [16]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Hold out 20% of the pairs; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feed-forward binary classifier over the concatenated TF-IDF vectors.
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
# Output layer: one sigmoid unit giving P(answer matches question). Without it
# the network ends in a 128-unit ReLU layer, which does not match the scalar
# 0/1 labels that binary_crossentropy is trained against.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])

history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Experiment 2

# Encoder-Decoder & LSTM

In [31]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


questions = df['Question'].tolist()
answers = df['Answer'].tolist()


# Teacher forcing: the decoder input starts with <start>, the target ends with
# <end>, so the target is the input shifted left by one token.
answers_input = ['<start> ' + a for a in answers]
answers_target = [a + ' <end>' for a in answers]

# The default Tokenizer `filters` strip '<' and '>', which would store the
# markers as 'start'/'end' and make later lookups of '<start>'/'<end>' miss.
# This is the default filter set with those two characters removed.
tokenizer = Tokenizer(oov_token="<OOV>", filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(questions + answers_input + answers_target)
# +1 because id 0 is reserved for padding.
vocab_size = len(tokenizer.word_index) + 1


q_seq = tokenizer.texts_to_sequences(questions)
a_input_seq = tokenizer.texts_to_sequences(answers_input)
a_target_seq = tokenizer.texts_to_sequences(answers_target)


# Pad (at the end) or truncate every sequence to a fixed length.
max_len = 20
q_seq = pad_sequences(q_seq, maxlen=max_len, padding='post')
a_input_seq = pad_sequences(a_input_seq, maxlen=max_len, padding='post')
a_target_seq = pad_sequences(a_target_seq, maxlen=max_len, padding='post')

In [22]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense


# Encoder: embed the question and keep only the final LSTM states.
encoder_inputs = Input(shape=(max_len,))
encoder_embedding_layer = Embedding(vocab_size, 256, mask_zero=True)
enc_emb = encoder_embedding_layer(encoder_inputs)
encoder_lstm_layer = LSTM(256, return_state=True)
_, h, c = encoder_lstm_layer(enc_emb)
encoder_states = [h, c]


# Decoder: starts from the encoder states and emits a distribution over the
# vocabulary at every time step.
decoder_inputs = Input(shape=(max_len,))
decoder_embedding_layer = Embedding(vocab_size, 256, mask_zero=True)
dec_emb = decoder_embedding_layer(decoder_inputs)
decoder_lstm_layer = LSTM(256, return_sequences=True)
dec_lstm = decoder_lstm_layer(dec_emb, initial_state=encoder_states)
vocab_projection = Dense(vocab_size, activation='softmax')
dec_dense = vocab_projection(dec_lstm)


In [24]:
# Wire the two inputs to the per-step softmax output and train with teacher
# forcing; the targets are integer ids, hence the sparse loss.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=dec_dense)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

history = model.fit(
    x=[q_seq, a_input_seq],
    y=a_target_seq,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
def generate_response(input_text):
    """
    Greedily decode a reply to ``input_text`` with the trained seq2seq model.

    The decoder input starts with the start marker; at each step the full model
    is run and the most likely next token is appended, until the end marker or
    the padding id is predicted or ``max_len`` is reached.

    Args:
        input_text: The question, as a plain string.

    Returns:
        The generated words joined by single spaces (possibly empty).

    Raises:
        ValueError: If the tokenizer vocabulary has no start/end marker.
    """
    word_index = tokenizer.word_index

    # The default Tokenizer filters strip '<' and '>', so the markers may be
    # stored as 'start'/'end'. Look both spellings up rather than falling back
    # to ids 1 and 2, which belong to the OOV token and an unrelated word.
    start_id = word_index.get('<start>', word_index.get('start'))
    end_id = word_index.get('<end>', word_index.get('end'))
    if start_id is None or end_id is None:
        raise ValueError("start/end markers are missing from the tokenizer vocabulary")

    seq = tokenizer.texts_to_sequences([input_text])
    seq = pad_sequences(seq, maxlen=max_len, padding='post')

    target_seq = np.zeros((1, max_len))
    target_seq[0, 0] = start_id

    response = []

    for i in range(1, max_len):
        preds = model.predict([seq, target_seq], verbose=0)
        # The output at step i-1 is the prediction for position i.
        pred_id = int(np.argmax(preds[0, i - 1]))

        if pred_id == 0 or pred_id == end_id:
            break

        response.append(tokenizer.index_word.get(pred_id, ''))
        target_seq[0, i] = pred_id

    return ' '.join(response)


In [30]:
# A few hand-written prompts to eyeball the quality of the generated replies.
test_questions = [
    "أشعر بالحزن",
    "أنا قلق بشأن مستقبلي",
    "لقد نجحت في الامتحان",
    "صديقي تخلى عني"
]

for q in test_questions:
    print(f"\n سؤال: {q}")
    reply = generate_response(q)
    print(f" رد النموذج: {reply}")


 سؤال: أشعر بالحزن
 رد النموذج: حدث

 سؤال: أنا قلق بشأن مستقبلي
 رد النموذج: 

 سؤال: لقد نجحت في الامتحان
 رد النموذج: يجب تكون فخورا جدا

 سؤال: صديقي تخلى عني
 رد النموذج: بتحسين الروك مرة أخرى


# Experiment 3

#  Word2Vec & Attention Mechanism & LSTM 

In [32]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate, TimeDistributed, dot, Activation

# Size of the LSTM hidden state used by both encoder and decoder.
latent_dim = 256


# Encoder: returns the output at every time step (needed by the attention
# below) plus the final hidden and cell states.
# NOTE(review): max_len and vocab_size are read from earlier cells, while the
# tokenizer cell that follows this one redefines both — confirm that this model
# is built with the values matching the data it is trained on.
encoder_inputs = Input(shape=(max_len,))
encoder_embedding = Embedding(vocab_size, 256)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)


encoder_states = [state_h, state_c]


# Decoder: initialised with the encoder's final states.
# Neither Embedding sets mask_zero, so padding positions are treated as
# ordinary tokens by the LSTMs, the attention and the loss.
decoder_inputs = Input(shape=(max_len,))
decoder_embedding = Embedding(vocab_size, 256)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

# Dot-product attention: score every decoder step against every encoder step,
# normalise the scores with softmax over the last axis, then take the weighted
# sum of encoder outputs as the context for each decoder step.
attention = dot([decoder_outputs, encoder_outputs], axes=[2, 2]) 
attention = Activation('softmax')(attention)
context = dot([attention, encoder_outputs], axes=[2,1]) 


# Feed the decoder state together with its attention context to the output head.
decoder_combined_context = Concatenate(axis=-1)([context, decoder_outputs])


# Per-step projection followed by a softmax over the vocabulary.
output = TimeDistributed(Dense(256, activation="relu"))(decoder_combined_context)
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(output)


# Targets are integer token ids, hence the sparse categorical loss.
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_5 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 input_6 (InputLayer)           [(None, 20)]         0           []                               
                                                                                                  
 embedding_4 (Embedding)        (None, 20, 256)      10795776    ['input_5[0][0]']                
                                                                                                  
 embedding_5 (Embedding)        (None, 20, 256)      10795776    ['input_6[0][0]']                
                                                                                            

In [34]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


max_len = 20


tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['Question'].tolist() + df['Answer'].tolist())


q_seq = tokenizer.texts_to_sequences(df['Question'].tolist())
a_seq = tokenizer.texts_to_sequences(df['Answer'].tolist())


# The tokenizer is fitted on the raw questions and answers only, so it holds no
# dedicated start/end markers. Looking up ordinary words (or falling back to
# ids 1 and 2) would make the markers collide with real vocabulary, so two
# fresh ids are reserved right after the last word id instead.
sos_token = len(tokenizer.word_index) + 1
eos_token = sos_token + 1
# Id 0 is padding; the vocabulary now also covers the two reserved markers.
# NOTE(review): any model trained on these sequences must be built with this
# vocab_size (i.e. after this cell has run).
vocab_size = eos_token + 1

# Teacher forcing: the input starts with SOS, the target ends with EOS.
decoder_input_seq = [[sos_token] + seq for seq in a_seq]
decoder_target_seq = [seq + [eos_token] for seq in a_seq]


encoder_input_data = pad_sequences(q_seq, maxlen=max_len, padding='post')
decoder_input_data = pad_sequences(decoder_input_seq, maxlen=max_len, padding='post')
decoder_target_data = pad_sequences(decoder_target_seq, maxlen=max_len, padding='post')


In [35]:
# Train with teacher forcing; the targets get a trailing axis of size 1.
model.fit(
    x=[encoder_input_data, decoder_input_data],
    y=decoder_target_data[..., np.newaxis],
    batch_size=64,
    epochs=5,
    validation_split=0.1,
)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1b40dec6740>

# Experiment 4

# Encoder-Decoder & Embedding & Pad_Sequences & LSTM

In [42]:
import pandas as pd
import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense, Concatenate, TimeDistributed, dot, Activation
from sklearn.model_selection import train_test_split


In [43]:
# Reload the raw corpus so this experiment starts from unprocessed text.
csv_path = 'C:\\Users\\ASUS\\Desktop\\arabic-empathetic-conversations.csv'
df2 = pd.read_csv(csv_path)

In [44]:
# Use Question/Answer as the column names; the emotion column is kept.
qa_column_names = {"context": "Question", "response": "Answer"}
df2 = df2.rename(columns=qa_column_names)

In [45]:
def clean_text(text):
    """
    Keep only characters from the Arabic Unicode block and whitespace, then drop digits.

    The input is converted with ``str()`` first, so any value is accepted.
    Characters outside U+0600-U+06FF (Latin letters, ASCII digits and
    punctuation) are removed; digits that survive that step are removed too.
    """
    arabic_only = re.sub(r'[^؀-ۿ\s]', '', str(text))
    return re.sub(r'\d+', '', arabic_only)


In [46]:

# Arabic text columns: keep Arabic characters only, then strip diacritics.
df2['Question'] = df2['Question'].apply(clean_text).apply(remove_tashkeel)
df2['Answer'] = df2['Answer'].apply(clean_text).apply(remove_tashkeel)
# The emotion labels shown by head() are Latin-script words ("sentimental",
# "afraid"). clean_text removes everything outside the Arabic block, which
# would reduce every label to an empty string, so the labels are only
# normalised (missing -> '', trimmed, lower-cased).
df2['Emotion'] = df2['emotion'].fillna('').astype(str).str.strip().str.lower()

In [48]:
# Build the encoder text for each row: an Arabic "emotion:" label followed by
# the Emotion value, an Arabic comma, then a "question:" label and the Question.
df2['Input'] = "العاطفة: " + df2['Emotion'] + "، السؤال: " + df2['Question']

In [49]:
input_texts = df2['Input'].tolist()
target_texts = df2['Answer'].tolist()

# Start/end markers are ordinary Arabic words added around each answer.
# NOTE(review): if these words also occur inside real answers they share a
# token id with the markers — confirm this is acceptable.
sos_token = 'بداية'
eos_token = 'نهاية'
# Teacher forcing: the input is prefixed with SOS, the output suffixed with EOS.
target_texts_input = [' '.join((sos_token, t)) for t in target_texts]
target_texts_output = [' '.join((t, eos_token)) for t in target_texts]


In [50]:
# Fixed sequence length used for padding and truncation.
max_len = 20

# One shared vocabulary for encoder inputs and both decoder text variants.
tokenizer = Tokenizer()
fit_corpus = [*input_texts, *target_texts_input, *target_texts_output]
tokenizer.fit_on_texts(fit_corpus)

# Id 0 is reserved for padding, hence the extra slot.
vocab_size = 1 + len(tokenizer.word_index)


In [51]:
def _to_padded_ids(texts):
    """Convert texts to id sequences padded at the end to ``max_len``."""
    id_sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(id_sequences, maxlen=max_len, padding='post')


encoder_input_seq = _to_padded_ids(input_texts)
decoder_input_seq = _to_padded_ids(target_texts_input)
decoder_target_seq = _to_padded_ids(target_texts_output)


In [52]:
# Hidden-state size shared by the encoder and decoder LSTMs.
latent_dim = 256

# Encoder: per-step outputs (used by the attention later) plus final states.
encoder_inputs = Input(shape=(max_len,))
encoder_embedding_layer = Embedding(vocab_size, 256)
enc_emb = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

encoder_states = [state_h, state_c]

# Decoder: initialised with the encoder's final hidden and cell states.
decoder_inputs = Input(shape=(max_len,))
decoder_embedding_layer = Embedding(vocab_size, 256)
dec_emb = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)


In [53]:
# Dot-product attention: score each decoder step against each encoder step,
# softmax-normalise, then build a context vector per decoder step.
attention_scores = dot([decoder_outputs, encoder_outputs], axes=[2, 2])
attention = Activation('softmax')(attention_scores)
context = dot([attention, encoder_outputs], axes=[2, 1])

decoder_combined_context = Concatenate(axis=-1)([context, decoder_outputs])

# Per-step projection followed by a softmax over the vocabulary.
hidden_projection = TimeDistributed(Dense(256, activation="relu"))
output = hidden_projection(decoder_combined_context)
decoder_dense = TimeDistributed(Dense(vocab_size, activation='softmax'))
decoder_outputs = decoder_dense(output)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)


In [54]:
# Add a trailing axis of size 1 to the integer targets.
decoder_target_data = decoder_target_seq[..., np.newaxis]

model.fit(
    x=[encoder_input_seq, decoder_input_seq],
    y=decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b418cd51e0>

#  Experiment 5

# TF-IDF tokenizer & Sequential & Dense

In [2]:
import pandas as pd
import numpy as np
import re
import random

In [3]:
# Reload the raw corpus so this experiment starts from unprocessed text.
csv_path = 'C:\\Users\\ASUS\\Desktop\\arabic-empathetic-conversations.csv'
df = pd.read_csv(csv_path)

In [91]:
# Drop rows with a missing question or answer, then rename the two columns.
df = (
    df
    .dropna(subset=['context', 'response'])
    .rename(columns={'context': 'Question', 'response': 'Answer'})
)

In [93]:
def clean_text(text):
    """
    Strip Arabic diacritics, digits and punctuation from ``text``, in that order.

    ``text`` must be a string. Whitespace is preserved, so removed tokens may
    leave extra spaces behind.
    """
    removal_patterns = (
        r'[\u064B-\u065F]',  # diacritic marks
        r'\d+',              # digit runs
        r'[^\w\s]',          # anything that is not a word character or whitespace
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

# Clean both text columns; values are cast to str first because clean_text
# only accepts strings.
for column in ('Question', 'Answer'):
    df[column] = df[column].astype(str).map(clean_text)

In [94]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# Build the Arabic stop-word set once; remove_stopwords reads it on each call.
stop_words = set(stopwords.words('arabic'))


def remove_stopwords(text):
    """Return ``text`` with every word found in ``stop_words`` removed."""
    kept_words = []
    for word in text.split():
        if word in stop_words:
            continue
        kept_words.append(word)
    return " ".join(kept_words)


for column in ('Question', 'Answer'):
    df[column] = df[column].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [97]:
def _sample_negatives(all_responses, correct_response, num_negatives):
    """Pick ``num_negatives`` answers from distinct rows that differ from ``correct_response``."""
    total = len(all_responses)
    picked = []
    tried = set()
    # Rejection sampling over row positions: cheap when most answers differ
    # from the correct one, and avoids copying the whole answer list per row.
    for _ in range(20 * num_negatives + 20):
        if len(picked) == num_negatives:
            break
        idx = random.randrange(total)
        if idx in tried:
            continue
        tried.add(idx)
        if all_responses[idx] != correct_response:
            picked.append(idx)
    if len(picked) == num_negatives:
        return [all_responses[i] for i in picked]
    # Rare fallback (few usable candidates): exact filtering. random.sample
    # raises ValueError when there are fewer candidates than requested.
    candidates = [r for r in all_responses if r != correct_response]
    return random.sample(candidates, num_negatives)


def generate_response_candidates(df, num_negatives=2):
    """
    Build a labelled (question, answer) dataset for response ranking.

    For every row of ``df`` the row's own answer is emitted with label 1,
    followed by ``num_negatives`` answers sampled from other rows with label 0.
    The sampling happens inside the row loop, so every question gets its own
    negatives, not just the last one. An empty frame yields an empty result.

    Args:
        df: DataFrame with 'Question' and 'Answer' columns.
        num_negatives: Number of negative answers to attach to each question.

    Returns:
        DataFrame with columns 'Question', 'Answer' and 'label'.

    Raises:
        ValueError: If fewer than ``num_negatives`` differing answers exist.
    """
    all_responses = df['Answer'].tolist()
    data = []

    for question, correct in zip(df['Question'].tolist(), all_responses):
        data.append((question, correct, 1))
        for neg in _sample_negatives(all_responses, correct, num_negatives):
            data.append((question, neg, 0))

    return pd.DataFrame(data, columns=['Question', 'Answer', 'label'])


In [98]:
# Seed Python's RNG so the sampled negatives (and every result derived from
# them) are the same on each Restart & Run All.
random.seed(42)
pairs_df = generate_response_candidates(df)

In [108]:
from sklearn.feature_extraction.text import TfidfVectorizer

question_texts = pairs_df['Question'].tolist()
answer_texts = pairs_df['Answer'].tolist()

# One shared vocabulary for questions and answers, capped at 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)
vectorizer.fit(question_texts + answer_texts)

# Batched transforms of each column.
q_vecs = vectorizer.transform(question_texts)
a_vecs = vectorizer.transform(answer_texts)

In [109]:
from scipy.sparse import hstack

# Each sample is the question vector followed by the answer vector; the stack
# is densified only at the end.
X_sparse = hstack((q_vecs, a_vecs))
X = X_sparse.toarray()
y = pairs_df['label'].to_numpy(copy=True)

In [110]:
# Hold out 20% of the pairs; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [119]:
# Feed-forward binary classifier over the concatenated TF-IDF vectors.
model = Sequential([
    Dense(512, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.4),
    Dense(256, activation='relu'),
    Dropout(0.2),
    # Output layer: one sigmoid unit giving P(answer matches question). Without
    # it the network ends in a 256-unit ReLU layer, which does not match the
    # scalar 0/1 labels that binary_crossentropy is trained against.
    Dense(1, activation='sigmoid'),
])

model.compile(optimizer=Adam(1e-4), loss='binary_crossentropy', metrics=['accuracy'])

In [120]:
# Train for a few epochs, holding out 10% of the training split for validation.
model.fit(
    x=X_train,
    y=y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1b15d184d90>

# Experiment 6

# Encoder-Decoder & Tokenizer & LSTM & Dense

In [28]:
# Reload the raw corpus so this experiment starts from unprocessed text.
csv_path = 'C:\\Users\\ASUS\\Desktop\\arabic-empathetic-conversations.csv'
df3 = pd.read_csv(csv_path)

In [29]:
# Preview the first rows of the raw frame (emotion, context, response columns).
df3.head()

Unnamed: 0,emotion,context,response
0,sentimental,أتذكر أنني ذهبت لمشاهدة الألعاب النارية مع أعز...,هل كان هذا صديقًا كنت تحبه أم مجرد أفضل صديق؟
1,sentimental,كان هذا أفضل صديق. اشتقت لها.,اين ذهبت؟
2,sentimental,لم نعد نتحدث.,هل كان هذا شيء حدث بسبب جدال؟
3,afraid,أشعر وكأنني ضرب على جدار فارغ عندما أرى الظلام,أجل؟ أنا حقا لا أرى كيف
4,afraid,ألا تشعر بذلك .. إنه لأمر عجيب,أصطدم في الواقع بجدران فارغة في كثير من الأحيا...


In [30]:
# Keep only the rows that have an emotion, a context and a response.
required_columns = ["emotion", "context", "response"]
df3 = df3.dropna(subset=required_columns)

In [31]:
# This experiment's data lives in df3. The earlier frame `df` had its
# context/response columns renamed by previous experiments, so reading those
# columns from it would fail or use stale data.
# Encoder text: "<emotion> : <context>".
df3['input_text'] = df3['emotion'] + " : " + df3['context']
# Decoder text: the response wrapped in start/end markers.
df3['target_text'] = "<sos> " + df3['response'] + " <eos>"

input_texts = df3['input_text'].tolist()
target_texts = df3['target_text'].tolist()

In [32]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np


In [33]:
# Encoder-side vocabulary, built from the "<emotion> : <context>" texts only.
input_tokenizer = Tokenizer(oov_token="<OOV>")
input_tokenizer.fit_on_texts(texts=input_texts)

# No maxlen is given, so sequences are padded (at the end) to the longest one.
input_sequences = input_tokenizer.texts_to_sequences(texts=input_texts)
input_padded = pad_sequences(sequences=input_sequences, padding='post')

In [34]:
# Decoder-side vocabulary, built from the marker-wrapped responses only.
target_tokenizer = Tokenizer(oov_token="<OOV>")
target_tokenizer.fit_on_texts(texts=target_texts)

# No maxlen is given, so sequences are padded (at the end) to the longest one.
target_sequences = target_tokenizer.texts_to_sequences(texts=target_texts)
target_padded = pad_sequences(sequences=target_sequences, padding='post')

In [35]:
# Vocabulary sizes for the two embedding layers; +1 leaves room for id 0,
# which pad_sequences uses as the padding value.
num_encoder_tokens = len(input_tokenizer.word_index) + 1
num_decoder_tokens = len(target_tokenizer.word_index) + 1

In [36]:
# Padded sequence lengths, read from the second dimension of each padded array.
max_encoder_seq_length = input_padded.shape[1]
max_decoder_seq_length = target_padded.shape[1]

In [37]:
# Teacher forcing: the decoder input is the target sequence without its last
# position, and the decoder target is the same sequence without its first
# position, i.e. shifted left by one step.
encoder_input_data = input_padded
decoder_input_data = target_padded[:, :-1]
decoder_target_data = target_padded[:, 1:]

In [38]:
# Split the three aligned arrays with one call so rows stay matched; the fixed
# random_state keeps the split repeatable.
(
    X_train, X_test,
    y_train_in, y_test_in,
    y_train_out, y_test_out,
) = train_test_split(
    encoder_input_data,
    decoder_input_data,
    decoder_target_data,
    test_size=0.2,
    random_state=42,
)

In [39]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense

# Size of both the embeddings and the LSTM hidden state.
latent_dim = 256

# Encoder: variable-length input; only the final states are passed on.
encoder_inputs = Input(shape=(None,))
encoder_embedding_layer = Embedding(num_encoder_tokens, latent_dim)
encoder_embedding = encoder_embedding_layer(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

In [40]:
# Decoder: starts from the encoder states and emits a distribution over the
# target vocabulary at every step.
decoder_inputs = Input(shape=(None,))
decoder_embedding_layer = Embedding(num_decoder_tokens, latent_dim)
decoder_embedding = decoder_embedding_layer(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [41]:
# Targets are integer token ids, hence the sparse categorical loss.
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_outputs)

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [42]:
# Add a trailing axis of size 1 to the integer targets.
# Re-running this cell adds another axis each time, so run it once per split.
y_train_out = y_train_out[..., np.newaxis]
y_test_out = y_test_out[..., np.newaxis]

model.fit(
    x=[X_train, y_train_in],
    y=y_train_out,
    validation_data=([X_test, y_test_in], y_test_out),
    batch_size=64,
    epochs=1,
)



<keras.callbacks.History at 0x243521012d0>