In [None]:
!pip install pyarabic
!pip install scikit-learn-intelex
!pip install nltk
!pip install fasttext

In [None]:
import re ,string

import fasttext
import nltk
import numpy as np
import pandas as pd
import pyarabic.araby as araby
from gensim.models import FastText
from gensim.models.phrases import Phrases, Phraser
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# NOTE(review): patch_sklearn is imported but not called in the cells shown here;
# per the sklearnex docs it must be called before the sklearn imports to take
# effect — confirm whether the acceleration is actually intended.
from sklearnex import patch_sklearn

nltk.download('stopwords')
from nltk.corpus import stopwords

In [None]:
# Load the dialect dataset; the 'utf-8-sig' codec also accepts a file that
# starts with a byte-order mark.
csv_path = '/content/transformed.csv'
df = pd.read_csv(csv_path, encoding='utf-8-sig')
df.head(100)

Unnamed: 0,id,dialect,transformed,fineText
0,1175358310087892992,IQ,0,لكن بالنهايه ينتفض يغير
1,1175416117793349632,IQ,0,يعنى هذا محسوب على البشر حيونه ووحشيه وتطلبون...
2,1175450108898565888,IQ,0,مبين من كلامه خليجى
3,1175471073770573824,IQ,0,يسلملى مرورك وروحك الحلوه
4,1175496913145217024,IQ,0,وين هل الغيبه اخ محمد
...,...,...,...,...
95,1168486929270300672,IQ,0,حلوه بالعراق انجمد البانيه للشتاء والباگله لل...
96,1168582662715510784,IQ,0,سنه يسولف برفع الصبات واللى اعتبره اللوگيه من...
97,1168585361787895808,IQ,0,هذا كله حتى يبقى اقتصاد ايران متوازن والعراق ...
98,1168585980309311488,IQ,0,كبسوله كل ست ساعات


**The model cannot run with 400k rows, so we need to take a sample of the dataset. We shuffle the data first to guarantee that the sample contains all 18 unique dialects, instead of just taking the first 60k rows.**

In [None]:
# Shuffle every row reproducibly, then keep the first 60,000 shuffled rows.
shuffled = df.sample(frac=1, random_state=42)
# .copy() gives selected_rows its own data, so the column assignments in the
# cleaning cells below no longer raise SettingWithCopyWarning.
selected_rows = shuffled.head(60000).copy()
print(selected_rows['dialect'].unique())

['SA' 'KW' 'OM' 'SY' 'PL' 'LB' 'SD' 'EG' 'MA' 'LY' 'IQ' 'AE' 'BH' 'QA'
 'JO' 'YE' 'TN' 'DZ']


In [None]:
# List the distinct values of the integer 'transformed' column in the sample.
selected_rows['transformed'].unique()

array([ 8, 13, 14,  4,  3, 12, 15, 11,  7,  1,  0, 16, 17,  2,  6,  9,  5,
       10])

In [None]:
# Report whether the text column of the sample has any missing values.
column_name = 'fineText'
has_null_values = selected_rows[column_name].isnull().any()

print(
    f"The column '{column_name}' contains null values."
    if has_null_values
    else f"The column '{column_name}' does not contain null values."
)

The column 'fineText' contains null values.


In [None]:
# Show the dtype of every column in the sample.
selected_rows.dtypes

id              int64
dialect        object
transformed     int64
fineText       object
dtype: object

**type cast 'fineText' as string**

In [None]:
# Missing texts become empty strings, then every value is coerced to str.
fine_text_filled = selected_rows['fineText'].fillna('')
selected_rows['fineText'] = fine_text_filled.astype(str)

In [None]:
# Print the set of Python types found in 'fineText' to confirm the cast worked.
print(selected_rows['fineText'].apply(type).unique())

[<class 'str'>]


# ***Step 1: Preprocessing***

**Remove Arabic stop words from the text**

In [None]:
# NLTK's Arabic stop-word list as a set; process_text tests tokens against it.
stop_words = set(stopwords.words('arabic'))

In [None]:
def process_text(text):
    """Return ``text`` with every token found in ``stop_words`` removed.

    The input is split on whitespace and the surviving tokens are re-joined
    with single spaces. Anything that is not a ``str`` yields ``''``.
    """
    if not isinstance(text, str):
        return ''
    kept_tokens = filter(lambda token: token not in stop_words, text.split())
    return ' '.join(kept_tokens)

# Remove the stop words from every text of the sample.
selected_rows['fineText'] = selected_rows['fineText'].map(process_text)

**Some preprocessing steps have removed most fineText values, so I need to check after each cleaning step that the values are still there.**

In [None]:
# Inspect the first 100 rows to confirm fineText still has content after stop-word removal.
selected_rows.head(100)

Unnamed: 0,id,dialect,transformed,fineText
217576,1161331866680664064,SA,8,نجاح حج العام منتدى الوطن السعودى اللى مالهم ا...
344491,649669594408349696,KW,13,بنفس هاليوم قبلسنه تم ايقاف سياره شباب اعمارهم...
380119,628467008678400000,OM,14,احسن الظن فيهم يمكن التبن حاجه حلوه عندهم
133458,1018050664906329984,SY,4,هادا رضا امك عليكى بعتلك ناس حنييتهم بتشبه حني...
98921,1171931771346259968,PL,3,تامر عاشور اكيد الالبوم الجاى مش حيعمله كئيب
...,...,...,...,...
291489,960587892908228608,EG,11,فى السعيد كمان عادى دول بيلعبو فى نادى مصرى مش...
388205,878288196789911552,OM,14,زمان كنا نساعد ابوى فالحلوى
410938,883712080259493888,AE,16,زحل حر عندنا مطر
360858,1161926977651015680,KW,13,اللى دخلوهم مادخلوهم بالخش وكل شى مكشوف ومعروف...


**Remove Arabic Diacritization (tashkeel) like fatha, damma, kasra, shaddah, etc:**

In [None]:
# Strip Arabic diacritics (tashkeel) from each text; values are coerced to str first.
fine_text_as_str = selected_rows['fineText'].map(str)
selected_rows['fineText'] = fine_text_as_str.map(araby.strip_tashkeel)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_rows['fineText'] = selected_rows['fineText'].apply(lambda x: araby.strip_tashkeel(str(x)))


In [None]:
# Inspect the first 100 rows to confirm fineText still has content after diacritic removal.
selected_rows.head(100)

Unnamed: 0,id,dialect,transformed,fineText
217576,1161331866680664064,SA,8,نجاح حج العام منتدى الوطن السعودى اللى مالهم ا...
344491,649669594408349696,KW,13,بنفس هاليوم قبلسنه تم ايقاف سياره شباب اعمارهم...
380119,628467008678400000,OM,14,احسن الظن فيهم يمكن التبن حاجه حلوه عندهم
133458,1018050664906329984,SY,4,هادا رضا امك عليكى بعتلك ناس حنييتهم بتشبه حني...
98921,1171931771346259968,PL,3,تامر عاشور اكيد الالبوم الجاى مش حيعمله كئيب
...,...,...,...,...
291489,960587892908228608,EG,11,فى السعيد كمان عادى دول بيلعبو فى نادى مصرى مش...
388205,878288196789911552,OM,14,زمان كنا نساعد ابوى فالحلوى
410938,883712080259493888,AE,16,زحل حر عندنا مطر
360858,1161926977651015680,KW,13,اللى دخلوهم مادخلوهم بالخش وكل شى مكشوف ومعروف...


**Remove escape codes like \n, \t, \\, etc from text**

In [None]:
def _remove_escape_codes(text):
    """Strip escape codes from ``text`` without gluing neighbouring words together.

    Whitespace escapes, whether written literally (a backslash followed by
    n, r or t) or present as real control characters, become a space. Any
    remaining backslash is dropped. Runs of whitespace are then collapsed to a
    single space and the ends are trimmed, so the text stays on one line.
    """
    # A whitespace escape separates words, so replace it with a space rather
    # than with nothing (which would fuse the two words around it).
    text = re.sub(r"\\[nrt]|[\n\r\t]", " ", text)
    text = text.replace("\\", "")
    return " ".join(text.split())


selected_rows['fineText'] = selected_rows['fineText'].apply(_remove_escape_codes)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_rows['fineText'] = selected_rows['fineText'].apply(lambda x :re.sub(r"(\\n|\\r|\\t|\\)", "", x).strip())


In [None]:
# Inspect the first 1000 rows to confirm fineText still has content after escape-code removal.
selected_rows.head(1000)

Unnamed: 0,id,dialect,transformed,fineText
217576,1161331866680664064,SA,8,نجاح حج العام منتدى الوطن السعودى اللى مالهم ا...
344491,649669594408349696,KW,13,بنفس هاليوم قبلسنه تم ايقاف سياره شباب اعمارهم...
380119,628467008678400000,OM,14,احسن الظن فيهم يمكن التبن حاجه حلوه عندهم
133458,1018050664906329984,SY,4,هادا رضا امك عليكى بعتلك ناس حنييتهم بتشبه حني...
98921,1171931771346259968,PL,3,تامر عاشور اكيد الالبوم الجاى مش حيعمله كئيب
...,...,...,...,...
242834,946146404954722432,DZ,10,انتى جبتى طاريه احب قاعده التعميم
201566,1118352179998412800,SA,8,صح الله لسانك محمد الحيسونى ولسانك يابوسعد ابد...
186679,1089546761910259584,MA,7,بغا يقول العزى انا راه باقى راجل
111631,1105873512756117632,PL,3,سيبكم التصدى الرهيب انا اللى مش فاهمه المدافع ...


**Remove URL from text**

In [None]:
# Delete every token that starts with 'http' (the match runs to the next whitespace).
url_pattern = re.compile(r'http\S+', flags=re.MULTILINE)
selected_rows['fineText'] = selected_rows['fineText'].map(lambda text: url_pattern.sub('', text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_rows['fineText'] = selected_rows['fineText'].apply(lambda x : re.sub(r'http\S+', '', x, flags=re.MULTILINE))


In [None]:
# Inspect the first 1000 rows to confirm fineText still has content after URL removal.
selected_rows.head(1000)

Unnamed: 0,id,dialect,transformed,fineText
217576,1161331866680664064,SA,8,نجاح حج العام منتدى الوطن السعودى اللى مالهم ا...
344491,649669594408349696,KW,13,بنفس هاليوم قبلسنه تم ايقاف سياره شباب اعمارهم...
380119,628467008678400000,OM,14,احسن الظن فيهم يمكن التبن حاجه حلوه عندهم
133458,1018050664906329984,SY,4,هادا رضا امك عليكى بعتلك ناس حنييتهم بتشبه حني...
98921,1171931771346259968,PL,3,تامر عاشور اكيد الالبوم الجاى مش حيعمله كئيب
...,...,...,...,...
242834,946146404954722432,DZ,10,انتى جبتى طاريه احب قاعده التعميم
201566,1118352179998412800,SA,8,صح الله لسانك محمد الحيسونى ولسانك يابوسعد ابد...
186679,1089546761910259584,MA,7,بغا يقول العزى انا راه باقى راجل
111631,1105873512756117632,PL,3,سيبكم التصدى الرهيب انا اللى مش فاهمه المدافع ...


**Remove username “@handle“ from text**

In [None]:
# Delete '@handle' mentions together with any whitespace that follows them.
mention_pattern = re.compile(r'@\w+\s*')
selected_rows['fineText'] = selected_rows['fineText'].map(lambda text: mention_pattern.sub('', text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_rows['fineText'] = selected_rows['fineText'].apply(lambda x: re.sub(r'@\w+\s*', '', x))


In [None]:
# Inspect the first 1000 rows to confirm fineText still has content after mention removal.
selected_rows.head(1000)

Unnamed: 0,id,dialect,transformed,fineText
217576,1161331866680664064,SA,8,نجاح حج العام منتدى الوطن السعودى اللى مالهم ا...
344491,649669594408349696,KW,13,بنفس هاليوم قبلسنه تم ايقاف سياره شباب اعمارهم...
380119,628467008678400000,OM,14,احسن الظن فيهم يمكن التبن حاجه حلوه عندهم
133458,1018050664906329984,SY,4,هادا رضا امك عليكى بعتلك ناس حنييتهم بتشبه حني...
98921,1171931771346259968,PL,3,تامر عاشور اكيد الالبوم الجاى مش حيعمله كئيب
...,...,...,...,...
242834,946146404954722432,DZ,10,انتى جبتى طاريه احب قاعده التعميم
201566,1118352179998412800,SA,8,صح الله لسانك محمد الحيسونى ولسانك يابوسعد ابد...
186679,1089546761910259584,MA,7,بغا يقول العزى انا راه باقى راجل
111631,1105873512756117632,PL,3,سيبكم التصدى الرهيب انا اللى مش فاهمه المدافع ...


In [None]:
# Number of rows in the sample once all cleaning steps have run.
print(len(selected_rows))

60000


# ***Step 2: The usage of Word Embedding***

Write the contents of fineText to a text file so that fastText can be trained on it later.

In [None]:
# Text column that becomes the fastText training corpus.
column_data = selected_rows['fineText']

corpus_file2 = 'corpus2.txt'
# No index and no header are written; the file holds only the text column.
# NOTE(review): to_csv still applies CSV quoting, so a row that is empty or
# contains a double quote may be written with quote characters — TODO confirm
# the corpus holds no such artefacts before training on it.
column_data.to_csv(corpus_file2, sep='\n', index=False, header=False)


Number of lines in the corpus file: 60000


Make sure the number of lines in the text file equals the length of the fineText column.

In [None]:
# Count the lines of the corpus file. The file is opened as UTF-8 explicitly,
# because the platform default encoding may be unable to decode the Arabic text.
with open(corpus_file2, 'r', encoding='utf-8') as file:
    line_count = sum(1 for _ in file)

print("Number of lines in the corpus file:", line_count)

Print the first 5 lines of the text file to make sure everything was created correctly.

In [None]:
num_lines = 5

# Echo the first `num_lines` lines of the corpus, minus their trailing whitespace.
with open(corpus_file2, 'r', encoding='utf-8') as file:
    for _, line in zip(range(num_lines), file):
        print(line.rstrip())

نجاح حج العام منتدى الوطن السعودى اللى مالهم الدسايس حيله الجيج بخير وانا كلنا خدامى مملكتنا دوله كبرى ماهيب دويله والله اللى عزها مناره الاسلامى الحشود انديرها والحرب نوفى كيله ساعد حنون ساعد رامى المهدى تركى
بنفس هاليوم قبلسنه تم ايقاف سياره شباب اعمارهم تتجاوز سنه والتهمه سيارتهم مشغلينها بدون مفتاح جطل كلبچوهم ايدهم للخلف
احسن الظن فيهم يمكن التبن حاجه حلوه عندهم
هادا رضا امك عليكى بعتلك ناس حنييتهم بتشبه حنيه الام
تامر عاشور اكيد الالبوم الجاى مش حيعمله كئيب


In [None]:
# (rows, columns) of the cleaned sample.
selected_rows.shape

(60000, 4)

In [None]:
# Train unsupervised skip-gram fastText embeddings on the corpus file.
model = fasttext.train_unsupervised(corpus_file2, model='skipgram')

# Sanity check: look up the vector of one sample word and print it with its length.
word = 'جملة'
word_vector = model.get_word_vector(word)

print("Word:", word)
print("Vector:", word_vector,len(word_vector))


Word: جملة
Vector: [-0.00818765  0.14655817  0.06234251 -0.06318384  0.04172984  0.03267336
  0.05662162 -0.08366159 -0.09369873 -0.09181898 -0.09385093 -0.06093824
  0.11954756  0.00275488 -0.00660922 -0.00780669 -0.14874661 -0.06267768
 -0.03666773 -0.05918446  0.00707431  0.05795287  0.10962953 -0.02374876
 -0.14362943 -0.01924419  0.13287777 -0.05398592 -0.06460397  0.15765195
 -0.18087599 -0.12637825 -0.07433919 -0.00859973 -0.082987    0.03997335
  0.04868435  0.00324054 -0.10422565 -0.0537242   0.19913082  0.06278264
  0.06615251 -0.10892338  0.05009441 -0.0397015  -0.08962584 -0.0890315
 -0.1013047  -0.09811395 -0.14118259  0.00762574 -0.02815591  0.05063012
  0.02804846 -0.08543026  0.08607525  0.0461076   0.04626128 -0.04070195
 -0.03446043 -0.02363844  0.0678039  -0.01978531  0.0822804   0.04690181
 -0.08576982  0.11157534  0.01444859  0.0483564  -0.08112321  0.09047667
 -0.12833352  0.05618366  0.09884424  0.02999594  0.18001898  0.01426992
  0.17897652 -0.17675942 -0.18329

In [None]:
# Re-read the corpus as one list of tokens per line. The file is opened as
# UTF-8 explicitly, because the platform default encoding may be unable to
# decode the Arabic text. str.split() with no argument already ignores
# leading and trailing whitespace, including the newline.
with open(corpus_file2, 'r', encoding='utf-8') as file:
    sentences = [line.split() for line in file]

In [None]:
# Number of tokenised sentences, and the tokens of the first one.
print(len(sentences),sentences[0])

60000 ['نجاح', 'حج', 'العام', 'منتدى', 'الوطن', 'السعودى', 'اللى', 'مالهم', 'الدسايس', 'حيله', 'الجيج', 'بخير', 'وانا', 'كلنا', 'خدامى', 'مملكتنا', 'دوله', 'كبرى', 'ماهيب', 'دويله', 'والله', 'اللى', 'عزها', 'مناره', 'الاسلامى', 'الحشود', 'انديرها', 'والحرب', 'نوفى', 'كيله', 'ساعد', 'حنون', 'ساعد', 'رامى', 'المهدى', 'تركى']


In [None]:

def _embed_word(token):
    """Look up ``token`` in the fastText model, using a zero vector on KeyError."""
    try:
        return model[token]
    except KeyError:
        return np.zeros(model.dim)


word_vectors = []
labels = []
for tokens, label in zip(sentences, selected_rows['transformed']):
    embedded = [_embed_word(token) for token in tokens]
    if not embedded:
        # A sentence without tokens is dropped together with its label.
        continue
    word_vectors.append(embedded)
    labels.append(label)

# Length of the longest kept sentence (0 when nothing was kept).
max_sentence_length = max((len(embedded) for embedded in word_vectors), default=0)


**Pad the sentences to have the same length**

In [None]:
# Right-pad each sentence with zero vectors up to the longest sentence, then
# stack the sentences and the labels into arrays.
pad_vector = np.zeros(model.dim)
padded_sentences = [
    vectors + [pad_vector] * (max_sentence_length - len(vectors))
    for vectors in word_vectors
]

word_vectors = np.array(padded_sentences)
labels = np.array(labels)

In [None]:
# Number of array dimensions of the padded embeddings.
word_vectors.ndim

3

In [None]:
# Number of sentences, and the first component of the first word vector of the first sentence.
print(len(word_vectors),word_vectors[0,0,0])

60000 0.1791331022977829


In [None]:
# Reuse the labels that were collected together with word_vectors. Re-reading the
# whole 'transformed' column here would bring back the labels of any sentence that
# was skipped for having no tokens, leaving labels longer than word_vectors.
labels = np.asarray(labels)
assert len(labels) == len(word_vectors), "labels and word_vectors MUST BE THE EXACT SAME LENGTH"
print(len(labels),labels[0])

60000 8


In [None]:
# Hold out 30% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(word_vectors, labels, test_size=0.3, random_state=42)

In [None]:
# Flatten each (words, dim) sentence matrix into one feature row. A single
# reshape yields the same row-major layout as concatenating the word vectors of
# each sentence one sentence at a time, without a Python-level loop, and it
# leaves already-flattened arrays unchanged if the cell is run again.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

In [None]:
# Standardise the features with statistics learned from the training split
# only, then apply the same transformation to the test split.
# (StandardScaler is imported in the notebook's import cell.)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Fit a support-vector classifier on the scaled training features;
# probability estimates are switched off.
svm_model = SVC(probability=False)
svm_model.fit(X_train, y_train)

In [None]:
# Predict the held-out split and report the fraction of correct predictions.
y_pred = svm_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

If the SVM doesn't work with word vectors, train it on one-hot encoded features instead:

In [None]:
# import pandas as pd
# from sklearn.preprocessing import OneHotEncoder

# # Select the columns for one-hot encoding
# columns_to_encode = ['transformed', 'fineText']

# # Extract the columns for encoding
# data_to_encode = selected_rows[columns_to_encode]

# # Create an instance of the OneHotEncoder
# encoder = OneHotEncoder(sparse=False)

# # Fit and transform the data for encoding
# encoded_data = encoder.fit_transform(data_to_encode)

# # Retrieve the feature names from the encoder
# feature_names = encoder.get_feature_names_out(columns_to_encode)

# # Convert the encoded data into a DataFrame with the feature names
# encoded_df = pd.DataFrame(encoded_data, columns=feature_names)

# # Concatenate the encoded DataFrame with the original DataFrame
# df_encoded = pd.concat([selected_rows, encoded_df], axis=1)

# df_encoded.head(10)
