## Pre-Implementation: Data Cleaning
*    I. Categorizing Records According to Phrase Length
*    II. Finding Total Number of Unique Words

In [1]:
import pandas as pd
import numpy as np

### A. JUMLA-QSL-22

In [2]:
# Load the JUMLA-QSL-22 records from the corrected workbook (path is relative to the working directory).
df = pd.read_excel("Dataset Excel Sheets/Full Dataset Corrected.xlsx")

In [3]:
df  # preview the loaded records

Unnamed: 0,CODE,INTENT_ENGLISH,INTENT_ARABIC
0,59,GREETING,مرحبا
1,60,CORONA_TEST,فيروس كورونا فحص
2,61,GREETING,مرحبا
3,62,DOCTOR_REQUEST,اريد طبيب رؤية
4,63,NEGATION,موعد لا مافي
...,...,...,...
896,960,CERTIFICATION_REQUEST,اريد شهادة تقرير
897,961,ANALYSIS_REQUEST,مرحبا اريد ضغط فحص
898,962,ANALYSIS_REQUEST,السلام عليكم اريد فحص الغدة
899,963,ANALYSIS_REQUEST,السلام عليكم اريد الغدة فحص


### I. Categorizing Records According to Phrase Length

In [4]:
# Collapse duplicate phrases in df['INTENT_ARABIC'] into one row per unique phrase,
# recording every CODE that used the phrase, its word count, and how often it occurs.
# Expects `df` (loaded above) to provide the 'INTENT_ARABIC' and 'CODE' columns.

# Map each trimmed phrase to the list of codes that use it. A dict preserves
# first-seen order and makes the duplicate check a constant-time lookup instead of
# scanning a list (`in` / `.index`) for every row.
codes_by_phrase = {}

# Pair the two columns positionally instead of looking CODE up with the enumerate()
# counter, which is only correct while df still has its default 0..n-1 index.
for record, recordCode in zip(df['INTENT_ARABIC'], df['CODE']):
    if pd.isna(record):
        continue    # blank Excel cells load as NaN; there is no phrase to categorize
    record = record.strip()     # trim leading and trailing whitespace
    codes_by_phrase.setdefault(record, []).append(recordCode)

# Unpack into the parallel lists that make up the output columns.
unique_records = list(codes_by_phrase)
code_lists = list(codes_by_phrase.values())
phrase_lengths = [len(phrase.split()) for phrase in unique_records]    # words per phrase
record_counts = [len(codes) for codes in code_lists]                   # occurrences per phrase

# Create a new DataFrame from the collected data
result_df = pd.DataFrame({
    'INTENT_ARABIC': unique_records,
    'CODE': code_lists,
    'PHRASE_LENGTH': phrase_lengths,
    'COUNT': record_counts
})

# Export the resulting DataFrame to an Excel file
result_df.to_excel('Dataset Excel Sheets/Categorized Records.xlsx', index=False)

### II. Finding Total Number of Unique Words

In [5]:
# Re-load the categorized records from the workbook path that the export step above writes to.
df2 = pd.read_excel("Dataset Excel Sheets/Categorized Records.xlsx")

In [6]:
df2            # one row per unique phrase, so the row count is the total number of unique records

Unnamed: 0,INTENT_ARABIC,CODE,PHRASE_LENGTH,COUNT
0,مرحبا,"[59, 61, 68, 76, 83, 93, 99, 106, 108, 110, 11...",1,47
1,فيروس كورونا فحص,[60],3,1
2,اريد طبيب رؤية,[62],3,1
3,موعد لا مافي,[63],3,1
4,اريد موافق,[64],2,1
...,...,...,...,...
339,اﻷشارة سريعة مش فاهم,[875],4,1
340,السلام عليكم اريد سكر دم و ضغط فحص,[881],8,1
341,لا اريد الساعة السادسة,[946],4,1
342,نعم مناسب شكرا,[951],3,1


In [7]:
# Collect every distinct whitespace-separated word that appears in the unique phrases.
unique_words = set()

# Iterate through each unique record; blank cells load from Excel as NaN and are
# dropped because they have no words to split.
for record in df2['INTENT_ARABIC'].dropna():
    unique_words.update(record.split())

# Sort the words so the exported sheet has the same row order on every run;
# iterating a set of strings directly can yield a different order per session.
unique_words_list = sorted(unique_words)

# Create a DataFrame with the list of unique words
unique_words_df = pd.DataFrame({'UNIQUE_WORDS': unique_words_list})     # 159 words

# Export the DataFrame to an Excel file
unique_words_df.to_excel('Dataset Excel Sheets/Unique Words.xlsx', index=False)

### B. ArabSign

In [2]:
# Load the ArabSign sentences.
# NOTE(review): this rebinds `df`, replacing the JUMLA-QSL-22 frame loaded in Section A.
df = pd.read_excel("Dataset Excel Sheets/ArabSign Records.xlsx")

In [3]:
df  # preview the loaded ArabSign records

Unnamed: 0,SentenceID,Sentence
0,1,اسم الله
1,2,الحمد الله
2,3,جميع الصم العرب السامع
3,4,السلام عليكم رحمة الله بركة
4,5,اليوم اقدم انتم برنامج اخر
5,6,موضوع دراسة لغة الاشارة العربية
6,7,كلمات اليوم متفرقة في الدين
7,8,ايضا كلمات عادية
8,9,لا شرك الله
9,10,الله اكبر


### I. Categorizing Records According to Phrase Length

In [6]:
# Collapse duplicate sentences in df['Sentence'] into one row per unique sentence,
# recording every SentenceID that used it and its word count.
# Expects `df` (loaded above) to provide the 'Sentence' and 'SentenceID' columns.

# Map each trimmed sentence to the list of IDs that use it. A dict preserves
# first-seen order and makes the duplicate check a constant-time lookup instead of
# scanning a list (`in` / `.index`) for every row.
ids_by_sentence = {}

# Pair the two columns positionally instead of looking SentenceID up with the
# enumerate() counter, which is only correct while df has its default 0..n-1 index.
for record, recordCode in zip(df['Sentence'], df['SentenceID']):
    if pd.isna(record):
        continue    # blank Excel cells load as NaN; there is no sentence to categorize
    record = record.strip()     # trim leading and trailing whitespace
    ids_by_sentence.setdefault(record, []).append(recordCode)

# Unpack into the parallel lists that make up the output columns.
unique_records = list(ids_by_sentence)
code_lists = list(ids_by_sentence.values())
phrase_lengths = [len(sentence.split()) for sentence in unique_records]    # words per sentence

# Create a new DataFrame from the collected data
result_df = pd.DataFrame({
    'Sentence': unique_records,
    'ID': code_lists,
    'PHRASE_LENGTH': phrase_lengths,
})

# Export the resulting DataFrame to an Excel file
# NOTE(review): this is the same path the JUMLA-QSL-22 section exports to, so running
# this cell replaces those results on disk — consider a dataset-specific file name
# (and update the read in the next section to match).
result_df.to_excel('Dataset Excel Sheets/Categorized Records.xlsx', index=False)

### II. Finding Total Number of Unique Words

In [8]:
# Re-load the categorized sentences from the workbook path that the export step above writes to.
# NOTE(review): the JUMLA-QSL-22 section uses this same path, so the contents depend on which export ran last.
df2 = pd.read_excel("Dataset Excel Sheets/Categorized Records.xlsx")

In [10]:
df2  # preview the categorized ArabSign sentences

Unnamed: 0,Sentence,ID,PHRASE_LENGTH
0,اسم الله,[1],2
1,الحمد الله,[2],2
2,الله اكبر,[10],2
3,الله كريم,[13],2
4,الله رزق,[14],2
5,الله غني,[17],2
6,شكرا انتم,[22],2
7,انا سفر,[27],2
8,اين السعودية,[28],2
9,هنا نكتفي,[34],2


In [11]:
# Collect every distinct whitespace-separated word that appears in the unique sentences.
unique_words = set()

# Iterate through each unique record; blank cells load from Excel as NaN and are
# dropped because they have no words to split.
for record in df2['Sentence'].dropna():
    unique_words.update(record.split())

# Sort the words so the exported sheet has the same row order on every run;
# iterating a set of strings directly can yield a different order per session.
unique_words_list = sorted(unique_words)

# Create a DataFrame with the list of unique words
# NOTE(review): a "159 words" figure was noted here, identical to the JUMLA-QSL-22
# cell's comment — confirm the actual ArabSign count with len(unique_words_list).
unique_words_df = pd.DataFrame({'UNIQUE_WORDS': unique_words_list})

# Export the DataFrame to an Excel file
# NOTE(review): same output path as the JUMLA-QSL-22 unique-words export, so this
# overwrites that sheet.
unique_words_df.to_excel('Dataset Excel Sheets/Unique Words.xlsx', index=False)