<a href="https://colab.research.google.com/github/khawla-T/NeuralNetwork/blob/main/sudidialect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://github.com/iwan-rg/Saudi-Dialect-Irony-Dataset/tree/main


In [130]:
from spellchecker import SpellChecker
from tqdm import tqdm
import re
import pyarabic.araby as araby
import pandas as pd

In [None]:
pip install pyspellchecker

In [None]:
pip install SpellChecker

In [None]:
from arabert import ArabertPreprocessor
from arabert.aragpt2.grover.modeling_gpt2 import GPT2LMHeadModel

In [None]:
pip install arabert

In [None]:
pip install transformers

Preprocessing

Before training the model, the data is preprocessed by performing the following steps:

drop all word or letters, which are not Arabic (like tags,..)
remove repetitive letters and word which have one letter
apply arabert preprocessing


**Collecting Data **

In [233]:
fields=['Tweet_ID','Tweets_withDecodedemojis','Final_Annotation']
train_dataset = pd.read_csv('SaudiIrony.csv',usecols=fields)
train_Saudi= train_dataset['Tweets_withDecodedemojis']

In [None]:
train_Saudi

In [234]:
train_d_pos = pd.DataFrame(train_Saudi, columns=[ 'Tweets_withDecodedemojis'])
train_d_pos.rename(columns = {'Tweets_withDecodedemojis':'tweet'}, inplace = True)
train_d_pos['dialect']='SA'
train_d_pos['tweet'] = train_d_pos['tweet'].astype(str)
train_d_pos.head(5)

Unnamed: 0,tweet,dialect
0,نعم من علامات الجمال تلك الطيبه التي لاترى بل ...,SA
1,المعرفه الجديده والمهارات الجديده واعتماد طرائ...,SA
2,لايشيخ,SA
3,لايشيخ وكورونا بتزيده مناعه يعني كورونا العن ت...,SA
4,لايشيخ و ليه المفروض اشتري بدل,SA


In [235]:
#tunisain
fields=['texts','data_labels']
train_dataset_tuii = pd.read_csv('tun.csv',usecols=fields)
train_dataset_tu=train_dataset_tuii['texts']

In [None]:
train_dataset_tu

In [236]:
#algirian
fields=['id','text']
train_dataset_al = pd.read_csv('datasetAlger.csv',usecols=fields)
train_d_al=train_dataset_al['text']

In [None]:
train_d_al[7]

'مبهمه وغامضه لم تشدني ابدا ولم اشعر بالتشويق فيها انهيتها ولا زلت انتظر المزيد '

In [237]:
#egypt
fields=['review']
train_dataset_eg = pd.read_csv('40000-Egyptian-tweets.csv',usecols=fields)
train_d_eg=train_dataset_eg['review']

In [238]:
dd= pd.concat([train_d_al[1:6603], train_d_eg[1:6603]])
dd= pd.concat([dd, train_dataset_tu[1:6603]])
len(dd)

19806

In [239]:
train_d_negative = pd.DataFrame(dd, columns=[ 'tweet'])

In [240]:
train_d_negative['dialect']='NS'

In [171]:
train_d_negative.head(5)

Unnamed: 0,tweet,dialect
1,من اسوا ما قرات ولا اجد حبكه او مغزي قمه الملل...,NS
2,احلي تخلف,NS
3,الله يرحم والديك الشيخ حفيظ علي هذا الكلام اكب...,NS
4,زرت فرع الخبر المطعم شكله مستهلك واللحم ماله طعم,NS
5,ان اللون الرمادي ليس له محل من الاعراب يا ابيض...,NS


In [None]:
train_d_negative.info()

In [241]:
train_d_negative['tweet'] = train_d_negative['tweet'].astype(str)

**Start Pre-processing**

In [82]:
""""the idea of this process is to remove all strange letters from arabic,
and drop duplicate letters in words like "هههههههه" or "لااااااااااا"
"""
def preprocessing_V0_1_0(data):
    for i in tqdm(range(len(data))):
        #get just arabic text
        data['tweet'].iloc[i]=re.sub(r'[u0600-u06FF]+', '', data['tweet'].iloc[i]).strip()
        data['tweet'].iloc[i]=re.sub(r'[a-z]+', '', data['tweet'].iloc[i]).strip()
        #remove duplicate letter
        data['tweet'].iloc[i]=re.sub(r'(.)\1+', r'\1', data['tweet'].iloc[i]).strip()
        #removing symbols
        data['tweet'].iloc[i]=' '.join(w for w in re.split(r"\W", data['tweet'].iloc[i]) if w)
        #remove letters
        data['tweet'].iloc[i]=' '.join(w for w in araby.tokenize(data['tweet'].iloc[i]) if len(w)>1)
    return data

In [133]:
""""the idea of this process is to check if the word in english or frensh (The most popular foreign languages in the region)
if it is, we drop it,for other words we use aransia to translate it to arabic letters,
and drop duplicate letters in words like "هههههههه" or "لااااااااااا"
"""
check_frensh= SpellChecker(language='fr')
check_English=SpellChecker()
def check(word):
    if(re.search(r'[a-zA-Z]',word)!=None):
        if word == check_English.correction(word) or word == check_frensh.correction(word):
            return False

    return True

def preprocessing_V0_1_1(data):
    for i in tqdm(range(len(data))):
        #remove links
        data['tweet'].iloc[i] = re.sub(r'http\S+', '',  data['tweet'].iloc[i])
        #remove users nam
        data['tweet'].iloc[i]=' '.join(w for w in re.split(r"@\w*",data['tweet'].iloc[i]) if w)

        #removing symbols
        data['tweet'].iloc[i]=' '.join(w for w in re.split(r"\W", data['tweet'].iloc[i]) if w)


        #remove English word and frensh word
        if re.search(r'[a-zA-Z]',data['tweet'].iloc[i])!=None:
            data['tweet'].iloc[i]=' '.join(w for w in data['tweet'].iloc[i].split() if check(w))
            #use aranisia
            data['tweet'].iloc[i]=transliterate(data['tweet'].iloc[i], source='ma', target='ar' , universal=True)
        #remove duplicate letter
        data['tweet'].iloc[i]=re.sub(r'(.)\1+', r'\1', data['tweet'].iloc[i]).strip()
        #remove letters
        data['tweet'].iloc[i]=' '.join(w for w in araby.tokenize(data['tweet'].iloc[i]) if len(w)>1)
    return data

In [242]:

data__=preprocessing_V0_1_0(train_d_negative.copy())

100%|██████████| 19806/19806 [00:40<00:00, 490.78it/s]


In [243]:
data__=preprocessing_V0_1_1(data__.copy())

100%|██████████| 19806/19806 [00:40<00:00, 487.20it/s]


In [244]:
# pre-process the Saudi dialect
data_pos=preprocessing_V0_1_0(train_d_pos.copy())

100%|██████████| 19804/19804 [00:38<00:00, 508.99it/s]


In [245]:
data_pos=preprocessing_V0_1_1(data_pos.copy())

100%|██████████| 19804/19804 [00:39<00:00, 501.19it/s]


1- frequent words removing

In [246]:
def freq_words_removal(text, lst_words):
    lst_text = text.split()
    if lst_words is not None:
        lst_text = [word for word in lst_text if word not in lst_words]
    text = " ".join(lst_text)
    return text
wrds = ['مع','لا','على','من','ما','في','الي','هو','انا','أنا','اله']
data_pos["tweet"] = data_pos["tweet"].apply(lambda x: freq_words_removal(x, wrds))

In [247]:
data__["tweet"] = data__["tweet"].apply(lambda x: freq_words_removal(x, wrds))

In [180]:
len(data_pos)

19804

In [181]:
len(data__)

19806

Combine the two datasets of Saudi and non-Saudi dialect

In [248]:
data_set=data_pos.append(data__)
len(data_set)

  data_set=data_pos.append(data__)


39610

Shuffling

In [None]:
#Shuffle the dataset
#df = df.reindex(np.random.permutation(df.index))
#df['LABEL'] = 11 # SA is lables as 11
#dataset=dataset[dataset['dialect'].isnull()==False]

In [None]:
#One-hot encode the lab
#df.loc[df['country'] == 'SA', 'LABEL'] = 0
# I have only one class!!
#from keras.utils.np_utils import to_categorical
#labels = to_categorical(df['LABEL'], num_classes=18)

In [274]:
dataset=data_set.rename(columns={'tweet':'text'})

In [265]:
from arabert.preprocess import ArabertPreprocessor
model_name="bert-base-arabert"
arabert_prep = ArabertPreprocessor(model_name=model_name)



In [266]:
dataset["text"]=dataset["text"].apply(lambda x:arabert_prep.preprocess(x))

In [None]:
#test_data["text"]=test_data["text"].apply(lambda x:arabert_prep.preprocess(x))

## **create a classification dataset to load the data**

In [267]:
map_label={
    'NS':0,
    'SA':1,
}
label_map={
    0:'NS',
    1:'SA',
}


In [268]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("lafifi-24/arbert_arabic_dialect_identification")
model = AutoModelForSequenceClassification.from_pretrained("lafifi-24/arbert_arabic_dialect_identification")

Encoding(num_tokens=68, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [269]:
from torch.utils.data import  Dataset

In [270]:
num_labels = 2
max_length = 150

In [271]:

class ClassificationDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
      super(ClassificationDataset).__init__()

      self.text = text
      self.target = target
      self.tokenizer_name = model
      self.tokenizer = tokenizer
      self.max_len = max_len
      self.label_map = label_map


    def __len__(self):
      return len(self.text)

    def __getitem__(self,item):
      text = str(self.text[item])
      text = " ".join(text.split())

      inputs = self.tokenizer(
          text,
          max_length=self.max_len,
          padding='max_length',
          truncation=True
        )
      return InputFeatures(**inputs,label= self.target[item])

## **Creating datasets**

In [272]:
train_dataset = ClassificationDataset(
    dataset['text'].to_list(),
    dataset['dialect'].to_list(),
    model,
    max_length,
    map_label
)


In [207]:
train_dataset

<__main__.ClassificationDataset at 0x78bf92290ee0>

In [277]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

TypeError: ignored

### **Kernas**

In [250]:
import numpy as np
sd= dataset
sd['lable']= np.where(sd.dialect=='SA', 1, 0)

In [262]:
sd['text'] = sd['text'].astype(str)
sd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39610 entries, 0 to 6602
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     39610 non-null  object
 1   dialect  39610 non-null  object
 2   lable    39610 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [261]:
dataset2=sd
dataset2['text'] = dataset2['text'].astype(str)
dataset2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39610 entries, 0 to 6602
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   text     39610 non-null  object
 1   dialect  39610 non-null  object
 2   lable    39610 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.2+ MB


In [252]:
X= dataset2.iloc[:,0:2]
y= dataset2['lable']

Split the dataset and compelete the preprossing

In [253]:
#dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25

In [278]:
X_train = X_train.apply(lambda x: str(x[0], encoding='utf-8'))
X_test = X_test.apply(lambda x:  str(x[0], encoding='utf-8'))

TypeError: ignored

In [230]:
import tensorflow as tf

In [1]:
from transformers import AutoTokenizer

tokenized_data = tokenizer(X_train['text'].values.tolist(), return_tensors="np", padding=True)
# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
tokenized_data = dict(tokenized_data)

labels = np.array(dataset["label"])  # Label is already an array of 0 and 1

NameError: ignored

In [231]:
#data_toknized
#model

new_model = tf.keras.Sequential(model.layers[:-1])
new_model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

AttributeError: ignored

In [None]:
# Testing
txt = ["فيديوات لي كيطلعو ليك فاش كتوصل لباج 987 فالموقع الازرق"]
seq = tokenizer.texts_to_sequences(txt)
padded = pad_sequences(seq, maxlen=max_len)
pred = model.predict(padded)
labels = ['SA','QA','KW','AE','OM','JO','PL','BH','LY','EG','SD','IQ','LB','SY','TN','DZ','MA','YE']
print(pred, labels[np.argmax(pred)])

In [None]:
#model_A = tf.keras.models.load_model("my_model_A")
#model_B_on_A = tf.keras.Sequential(model_A.layers[:-1])
#model_B_on_A.add(tf.keras.layers.Dense(1, activation="sigmoid"))