# **Data Cleanup and preparation:**
Because this dataset contains cells written in Arabic, we first detect those cells and translate them before we start encoding.
The detected text is translated to English with `GoogleTranslator` from the `deep_translator` package.

In [4]:
import pandas as pd
from deep_translator import GoogleTranslator
import re

# Read the raw survey export into a DataFrame
df = pd.read_csv(filepath_or_buffer='2024_PersonalityTraits_SurveyData.csv')

# Function to check for Arabic text
def contains_arabic(text):
    """Return True when ``str(text)`` contains at least one Arabic-script character.

    Besides the basic Arabic block (U+0600-U+06FF) the check also covers
    Arabic Supplement (U+0750-U+077F), Arabic Extended-A (U+08A0-U+08FF) and
    the Arabic Presentation Forms A/B (U+FB50-U+FDFF, U+FE70-U+FEFC), so text
    stored as pre-shaped glyphs or ligatures is detected as well.

    Parameters
    ----------
    text : any
        Cell value; it is converted with ``str()`` before matching, so NaN,
        None and numbers are accepted and simply yield False.

    Returns
    -------
    bool
        True if an Arabic character is found, otherwise False. False is also
        returned if the value cannot be converted to a string.
    """
    try:
        # The last range stops at U+FEFC on purpose: U+FEFF is the byte-order
        # mark, which must not be mistaken for Arabic text.
        return bool(re.search(
            r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]',
            str(text),
        ))
    except Exception:
        # Only ordinary errors (e.g. a failing __str__) are treated as
        # "no Arabic"; KeyboardInterrupt/SystemExit still propagate.
        return False

# Function to translate Arabic text to English
def translate_to_english(text):
    """Translate a single cell value from Arabic to English.

    Parameters
    ----------
    text : any
        Cell value. Missing values (NaN/None) and non-string values such as
        numbers are returned unchanged without contacting the translator.

    Returns
    -------
    any
        The translator's result for string input, or the original value when
        it is missing, not a string, or the translation call fails.
    """
    if pd.isna(text):  # Check for NaN values
        return text
    if not isinstance(text, str):
        # The translator only accepts strings; passing numbers from mixed
        # columns would just raise and print a spurious error for every cell.
        return text
    try:
        return GoogleTranslator(source='ar', target='en').translate(text)
    except Exception as e:
        # Best effort: report the failure and keep the untranslated value so
        # one bad cell does not abort the whole run.
        print(f"Error translating '{text}': {e}")
        return text

# Identify and translate columns containing Arabic text
for column in df.columns:
    # Boolean mask of the cells in this column that actually contain Arabic
    arabic_mask = df[column].apply(contains_arabic)
    if not arabic_mask.any():
        continue

    # Survey answers repeat a lot, and every translation is a network call:
    # translate each distinct Arabic value once and reuse the result.
    arabic_cells = df.loc[arabic_mask, column]
    translations = {
        value: translate_to_english(value) for value in arabic_cells.unique()
    }

    # Only the Arabic cells are rewritten; non-Arabic answers in the same
    # column are left exactly as they were instead of being sent through an
    # Arabic-to-English translation.
    df.loc[arabic_mask, column] = arabic_cells.map(translations)

# Save the translated DataFrame to a new CSV file
df.to_csv('Translated_PersonalityTraits_SurveyData.csv', index=False)


# **Encoding the data:**


In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the translated dataset
df = pd.read_csv('Translated_PersonalityTraits_SurveyData.csv')

# Create a copy to avoid altering the original directly
df_encoded = df.copy()

# Fitted encoder for every encoded column, kept so the integer codes can be
# mapped back to the original labels later, e.g.
# label_encoders[col].inverse_transform(df_encoded[col])
label_encoders = {}

# Iterate over all columns
for col in df_encoded.columns:
    col_dtype = df_encoded[col].dtype

    # Detect text-like columns by dtype family rather than comparing with
    # 'object': newer pandas versions can load text as a dedicated string
    # dtype (the default in pandas 3), which does not equal 'object' and
    # would otherwise be skipped silently.
    is_text_like = (
        pd.api.types.is_object_dtype(col_dtype)
        or pd.api.types.is_string_dtype(col_dtype)
        or isinstance(col_dtype, pd.CategoricalDtype)
    )
    if not is_text_like:
        continue

    # Cast to object before filling: fillna('NA') on a categorical column
    # raises when 'NA' is not already one of its categories.
    # Missing values become the literal label 'NA' and get their own code.
    labels = df_encoded[col].astype(object).fillna('NA').astype(str)

    # LabelEncoder assigns codes by sorted label order, so the integers carry
    # no ordinal meaning beyond identifying the label.
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(labels)
    label_encoders[col] = le

# Save the encoded DataFrame to a new CSV file
df_encoded.to_csv('IntegerEncoded_PersonalityTraits_SurveyData.csv', index=False)

print("All applicable columns have been label-encoded and saved to 'IntegerEncoded_PersonalityTraits_SurveyData.csv'.")


All applicable columns have been label-encoded and saved to 'IntegerEncoded_PersonalityTraits_SurveyData.csv'.
