## Pre-Implementation: Data Cleaning
*    I. Categorizing Records According to Phrase Length
*    II. Finding Total Number of Unique Words (Documentation Purposes)

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the full corrected dataset from the Excel workbook.
# The path is relative, so it resolves against the notebook's working directory.
df = pd.read_excel("Dataset Excel Sheets/Full Dataset Corrected.xlsx")

In [3]:
# Display the loaded dataset to check its columns and row count.
df

Unnamed: 0,CODE,INTENT_ENGLISH,INTENT_ARABIC
0,59,GREETING,مرحبا
1,60,CORONA_TEST,فيروس كورونا فحص
2,61,GREETING,مرحبا
3,62,DOCTOR_REQUEST,اريد طبيب رؤية
4,63,NEGATION,موعد لا مافي
...,...,...,...
896,960,CERTIFICATION_REQUEST,اريد شهادة تقرير
897,961,ANALYSIS_REQUEST,مرحبا اريد ضغط فحص
898,962,ANALYSIS_REQUEST,السلام عليكم اريد فحص الغدة
899,963,ANALYSIS_REQUEST,السلام عليكم اريد الغدة فحص


### I. Categorizing Records According to Phrase Length

In [4]:
# Collapse duplicate phrases in df['INTENT_ARABIC'] into one row per unique
# phrase, recording for each phrase:
#   - CODE:          every CODE value of the rows that carry the phrase
#   - PHRASE_LENGTH: number of whitespace-separated words in the phrase
#   - COUNT:         how many rows carry the phrase
# The result is exported to 'Dataset Excel Sheets/Categorized Records.xlsx'.
# (pandas is already imported as `pd` at the top of the notebook.)

# Maps each stripped phrase to the list of CODEs seen for it. A dict keeps
# first-appearance order and gives constant-time membership checks, unlike
# scanning a list with `in` / `.index()` for every row.
codes_by_record = {}

# Pair each phrase with its code by walking both columns in parallel.
# This does not depend on df's index labels (df.at[position, ...] would break
# on a filtered or re-indexed frame), and .tolist() yields plain Python
# scalars, so the exported CODE cells read "[59, 61]" rather than NumPy
# scalar reprs.
for record, recordCode in zip(df['INTENT_ARABIC'].tolist(), df['CODE'].tolist()):
    if pd.isna(record):
        # A missing phrase has no words to categorize; skip it rather than
        # failing on .strip().
        continue

    record = record.strip()     # trim leading and trailing whitespace
    codes_by_record.setdefault(record, []).append(recordCode)

# Parallel lists, one entry per unique phrase, in order of first appearance
unique_records = list(codes_by_record)
code_lists = list(codes_by_record.values())
phrase_lengths = [len(record.split()) for record in unique_records]
record_counts = [len(codes) for codes in code_lists]

# Create a new DataFrame from the collected data
result_df = pd.DataFrame({
    'INTENT_ARABIC': unique_records,
    'CODE': code_lists,
    'PHRASE_LENGTH': phrase_lengths,
    'COUNT': record_counts
})

# Export the resulting DataFrame to an Excel file
result_df.to_excel('Dataset Excel Sheets/Categorized Records.xlsx', index=False)


### II. Finding Total Number of Unique Words (Documentation Purposes)

In [5]:
# Read back the categorized-records workbook exported by the previous section.
# NOTE(review): CODE held Python lists when it was written; it presumably comes
# back as text (e.g. "[59, 61]") rather than lists — confirm before reusing it.
df2 = pd.read_excel("Dataset Excel Sheets/Categorized Records.xlsx")

In [6]:
df2            # one row per unique phrase; the row count is the total number of unique records

Unnamed: 0,INTENT_ARABIC,CODE,PHRASE_LENGTH,COUNT
0,مرحبا,"[59, 61, 68, 76, 83, 93, 99, 106, 108, 110, 11...",1,47
1,فيروس كورونا فحص,[60],3,1
2,اريد طبيب رؤية,[62],3,1
3,موعد لا مافي,[63],3,1
4,اريد موافق,[64],2,1
...,...,...,...,...
339,اﻷشارة سريعة مش فاهم,[875],4,1
340,السلام عليكم اريد سكر دم و ضغط فحص,[881],8,1
341,لا اريد الساعة السادسة,[946],4,1
342,نعم مناسب شكرا,[951],3,1


In [7]:
# Collect every distinct whitespace-separated word that occurs in the unique
# phrases of df2['INTENT_ARABIC'] and export them to
# 'Dataset Excel Sheets/Unique Words.xlsx'.

# A set de-duplicates the words as they are added
unique_words = set()

# Iterate through each unique record. Blank cells come back from Excel as NaN
# (a float with no .split()), so they are dropped first.
for record in df2['INTENT_ARABIC'].dropna():
    unique_words.update(record.split())

# Sort the words so the exported sheet has the same order on every run;
# a set's iteration order for strings can change between interpreter sessions.
unique_words_list = sorted(unique_words)

# Create a DataFrame with the list of unique words
unique_words_df = pd.DataFrame({'UNIQUE_WORDS': unique_words_list})     # 159 words (count noted by the original author)

# Export the DataFrame to an Excel file
unique_words_df.to_excel('Dataset Excel Sheets/Unique Words.xlsx', index=False)