# Libraries 

In [2]:
import nltk
#to open csv file
import csv
import pandas as pd
import numpy as np
#sentences & words tokenization
from nltk.tokenize import sent_tokenize, word_tokenize
#regular expression 
import re
#for stopwords
from nltk.corpus import stopwords
import string
# from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.stem import ISRIStemmer
from nltk.stem import WordNetLemmatizer

In [3]:
#for preprocessing
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hends\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hends\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hends\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# csv path

In [6]:
dataset_path = r'C:\Users\hends\Documents\ANLP\dataset\arabic_english.csv'
dataset = pd.read_csv(dataset_path, encoding="utf-8")
dataset.head()

Unnamed: 0,arabic,english
0,متى أنشئت هذه الجامعة؟,When was this university founded?
1,أراها نادراً,I see it rarely.
2,يعزف على البيانو بشكل جيد جداً,He plays the piano very well.
3,مع كل احترامي.,With all due respect.
4,نظف أسنانك,Brush your teeth clean.


# preprocessing function

### arabic preprocessing

In [7]:
def preprocess_text(text):
    # Apply lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove Arabic stopwords
    stop_words = set(stopwords.words('arabic'))
    tokens = [token for token in tokens if token not in stop_words]
    # Remove Arabic punctuation and other non-alphanumeric characters
    tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens]
    # Remove empty tokens
    tokens = [token for token in tokens if token]
    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Apply stemming
    stemmer = ISRIStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
    return tokens

In [8]:
dataset['arabic_preprocessed'] = dataset['arabic'].apply(preprocess_text)

In [32]:
dataset['arabic_preprocessed']

0                       [شئت, جمع]
1                       [ارا, ندر]
2        [عزف, ينو, شكل, جيد, جدا]
3                            [حرم]
4                       [نظف, سنن]
                   ...            
34884    [ذهب, انت, يسر, ونأ, يمي]
34885         [يجب, قرء, لنص, بتأ]
34886                   [عند, شكل]
34887                   [عند, شكل]
34888                   [عند, شكل]
Name: arabic_preprocessed, Length: 34889, dtype: object

### english preprocessing

In [11]:
def preprocess_english_text(text):
    # Apply lowercase
    text = text.lower()
    # Tokenize text
    tokens = word_tokenize(text)
    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    # Remove English punctuation and other non-alphanumeric characters
    tokens = [re.sub(r'[^a-zA-Z]', '', token) for token in tokens]
    # Remove empty tokens
    tokens = [token for token in tokens if token]
    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

In [12]:
dataset['english_preprocessed'] = dataset['english'].apply(preprocess_english_text)

In [31]:
dataset['english_preprocessed']

0                  [university, founded]
1                          [see, rarely]
2                    [play, piano, well]
3                         [due, respect]
4                  [brush, teeth, clean]
                      ...               
34884              [go, left, go, right]
34885    [must, read, textbook, closely]
34886                          [problem]
34887                 [ve, got, problem]
34888                 [ve, got, problem]
Name: english_preprocessed, Length: 34889, dtype: object

# Feature Extraction

### word embedding

In [16]:
from gensim.models import Word2Vec

# Assuming you have lists of tokenized Arabic and English sentences
arabic_corpus = [sentence for sentence in dataset['arabic_preprocessed']]
english_corpus = [sentence for sentence in dataset['english_preprocessed']]

In [17]:
# Train Arabic Word2Vec model
arabic_model = Word2Vec(sentences=arabic_corpus, vector_size=300, window=5, min_count=5, workers=4)

# Train English Word2Vec model
english_model = Word2Vec(sentences=english_corpus, vector_size=300, window=5, min_count=5, workers=4)

In [25]:
arabic_model.save('arabic_word2vec.model')
english_model.save('english_word2vec.model')

In [26]:
from gensim.models import Word2Vec

arabic_model = Word2Vec.load('arabic_word2vec.model')
english_model = Word2Vec.load('english_word2vec.model')

In [27]:
import numpy as np

def text_to_embeddings(text, model):
    embeddings = []
    for word in text:
        if word in model.wv:
            embeddings.append(model.wv[word])
        else:
            # Handle out-of-vocabulary words
            embeddings.append(np.zeros(model.vector_size))
    return np.array(embeddings)

In [28]:
dataset['arabic_embeddings'] = dataset['arabic_preprocessed'].apply(lambda x: text_to_embeddings(x, arabic_model))
dataset['english_embeddings'] = dataset['english_preprocessed'].apply(lambda x: text_to_embeddings(x, english_model))

In [29]:
dataset['arabic_embeddings']

0        [[0.011475698, 0.10967453, 0.02558354, 0.02932...
1        [[0.0051676948, 0.061253097, 0.017778201, 0.01...
2        [[0.031647027, 0.21455647, 0.054307714, 0.0547...
3        [[0.028915202, 0.23491731, 0.056054126, 0.0598...
4        [[0.02441435, 0.23771553, 0.051473312, 0.06183...
                               ...                        
34884    [[0.02755453996360302, 0.2768106162548065, 0.0...
34885    [[0.03508031368255615, 0.2866920530796051, 0.0...
34886    [[0.037276633, 0.30006623, 0.06619843, 0.07606...
34887    [[0.037276633, 0.30006623, 0.06619843, 0.07606...
34888    [[0.037276633, 0.30006623, 0.06619843, 0.07606...
Name: arabic_embeddings, Length: 34889, dtype: object

In [30]:
dataset['english_embeddings']

0        [[0.022330971, 0.21512741, -0.010423325, 0.108...
1        [[0.015965493, 0.23092939, -0.0027182873, 0.11...
2        [[0.026843045, 0.23262744, -0.016636169, 0.112...
3        [[0.009102348, 0.08759432, -0.0013995869, 0.04...
4        [[0.009990926, 0.09358135, -0.0077565066, 0.04...
                               ...                        
34884    [[0.01925363, 0.22731914, -0.014380923, 0.1263...
34885    [[0.030479565, 0.26141086, -0.00827911, 0.1375...
34886    [[0.020079298, 0.24023326, -0.007571408, 0.120...
34887    [[0.028377939, 0.24865027, -0.020709172, 0.137...
34888    [[0.028377939, 0.24865027, -0.020709172, 0.137...
Name: english_embeddings, Length: 34889, dtype: object