In [3]:
!pip install deep_translator

Collecting deep_translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: deep_translator
Successfully installed deep_translator-1.11.4


# **Data Cleanup and preparation:**
Because this dataset contains cells written in Arabic, we first detect the columns that contain Arabic text and translate them before we start encoding.
The translation to English is done with the `GoogleTranslator` class from the `deep_translator` package.

In [9]:
import pandas as pd
from deep_translator import GoogleTranslator
import re

# Read the raw survey export into the working DataFrame
df = pd.read_csv(filepath_or_buffer='2024_PersonalityTraits_SurveyData.csv')

# Arabic-script code points: the base Arabic block plus Arabic Supplement,
# Arabic Extended-A and the two Presentation Forms blocks, so ligatures and
# contextual letter forms are detected too. The last range stops at U+FEFC on
# purpose: U+FEFF is the byte-order mark, not an Arabic letter.
_ARABIC_CHARS = re.compile(
    r'[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFC]'
)


# Function to check for Arabic text
def contains_arabic(text):
    """Return True if ``str(text)`` contains at least one Arabic-script character.

    Args:
        text: Any value; it is converted with ``str()`` before matching, so
            numbers, ``None`` and NaN are accepted and yield ``False``.

    Returns:
        bool: ``True`` when an Arabic-script character is found, else ``False``.
        ``False`` is also returned if the value cannot be converted to a string.
    """
    try:
        return _ARABIC_CHARS.search(str(text)) is not None
    except Exception:
        # Best effort: a value whose str() fails is treated as non-Arabic.
        # (Not a bare ``except`` so KeyboardInterrupt/SystemExit still propagate.)
        return False

# Function to translate Arabic text to English
def translate_to_english(text):
    """Translate an Arabic string to English, leaving every other value untouched.

    Args:
        text: A cell value. Missing values, non-string values and strings
            without any Arabic character are returned unchanged and are never
            sent to the translation service.

    Returns:
        The English translation for Arabic strings; otherwise the input value.
        If the translation call raises, the error is printed and the original
        text is returned so one failing cell does not abort the whole run.
    """
    if pd.isna(text):  # Check for NaN values
        return text
    # Only strings can hold Arabic text; sending numbers or plain English to a
    # translator configured with source='ar' wastes a network call and may
    # alter text that needs no translation.
    if not isinstance(text, str) or not contains_arabic(text):
        return text
    try:
        return GoogleTranslator(source='ar', target='en').translate(text)
    except Exception as e:
        print(f"Error translating '{text}': {e}")
        return text

# Identify and translate cells containing Arabic text.
# Only the cells flagged by the mask are passed to the translator, so the
# non-Arabic answers in a mixed column are left exactly as they were loaded.
for column in df.columns:
    arabic_mask = df[column].apply(contains_arabic)
    if arabic_mask.any():
        df.loc[arabic_mask, column] = df.loc[arabic_mask, column].apply(translate_to_english)

# Save the translated DataFrame to a new CSV file
df.to_csv('Translated_PersonalityTraits_SurveyData.csv', index=False)


# **Dropping the columns that cannot be encoded so we can start building our model, and renaming the remaining columns for ease of use and clarity**



In [18]:
# Columns removed before encoding: the two cigarette-brand questions and the
# "[Comment]" follow-up columns.
cols_to_drop = [
    "What is your favorite or preferred cigarette brand(s) if you were able to access it?",
    "What cigarette brand(s) are you currently using?",
    "What is your current employment status? [Comment]",
    "What is your current marital status? [Comment]",
    "What is your main source of income? [Comment]",
    "What type of income or financial support does your household receive? [Comment]",
]

# DataFrame.drop returns a new frame, so `df` itself is left unmodified.
df1 = df.drop(columns=cols_to_drop)


In [28]:
# Dictionary mapping old column names (the full survey question text) to short
# identifier-style names used in the rest of the notebook.
# NOTE(review): the keys are presumably matched verbatim against the CSV
# headers, so whitespace inside them matters — do not "tidy" the strings.
column_name_map = {
    # Record metadata and the smoking screening question
    "Unnamed: 0": "ID",
    "Sector": "Sector",
    "Last page": "LastPage",
    "Have you smoked at least one full tobacco cigarette (excluding e-cigarettes) once or more in the past 30 days?": "SmokedIn30Days",
    # Personality self-ratings ("I see myself as someone who is ...")
    "I see myself as someone who is extraverted, enthusiastic:": "Extraverted",
    "I see myself as someone who is critical, quarrelsome:": "Critical",
    "I see myself as someone who is dependable, self-disciplined:": "Dependable",
    "I see myself as someone who is anxious, easily upset:": "Anxious",
    "I see myself as someone who is open to new experiences:": "OpenExp",
    "I see myself as someone who is reserved, quiet:": "Reserved",
    "I see myself as someone who is sympathetic, warm:": "Sympathetic",
    "I see myself as someone who is disorganized, careless:": "Disorganized",
    "I see myself as someone who is calm, emotionally stable:": "Calm",
    "I see myself as someone who is conventional, uncreative:": "Conventional",
    # Smoking habits
    "Do you find it difficult to refrain from smoking where it is forbidden (church, library, cinema, plane, etc...)?": "DifficultRefrain",
    "How many cigarettes do you smoke each day?": "CigsPerDay",
    # NOTE(review): the next key ends with a trailing space — presumably it mirrors the raw header; confirm.
    "Do you smoke more frequently during the first hours after waking up than during the rest of the day? ": "SmokeMoreEarly",
    "Do you smoke if you are so ill that you are in bed most of the day?": "SmokeWhileIll",
    "How soon after you wake up do you smoke your first cigarette?": "TimeToFirstCig",
    "Which cigarette would you mostly hate to give up?": "HateToGiveUp",
    "How old were you the first time you smoked a full cigarette (not just a few puffs)?": "FirstCigAge",
    # Smoking behaviour relative to the 2019 economic crisis
    "How would you describe your current smoking behavior compared to your smoking behavior before Lebanon's economic crisis and revolution began in 2019?": "SmokingChangePostCrisis",
    "Are you currently able to afford your favorite or preferred cigarette brand(s)?": "AffordFavoriteBrand",
    # NOTE(review): the next key contains a double space ("an  alternative") — presumably it mirrors the raw header; confirm.
    "Has 2019's revolution or economic crisis caused you to switch away from your favorite or preferred cigarette brand(s) to an  alternative?": "SwitchedBrandPostCrisis",
    # Demographics and social circle
    "Gender:": "Gender",
    "How old are you?": "Age",
    "Which governerate do you live in or spend most of your time in?": "Governorate",
    "What is the highest level of education you have attained?": "EducationLevel",
    "What is your current employment status?": "CurrentEmploymentStatus",
    "What is your current marital status?": "MaritalStatus",
    "Do you have close friends?": "CloseFriends",
    "Of the five closest friends or acquaintances that you spend time with on a regular basis, how many of them are smokers?": "NumSmokerFriends",
    # Income and household
    "What is your main source of income?": "MainIncomeSource",
    "What type of income or financial support does your household receive?": "HouseholdIncomeType",
    "If you receive payment in Lebanese Lira, what is your current estimated monthly household income? (If income is in US Dollars, then refer to the current black market exchange).": "MonthlyIncome",
    "How would you describe your current income sufficiency?": "IncomeSufficiency",
    "Including yourself, how many people currently live in your household?": "HouseholdSize",
    "To what extent were you financially (negatively) affected by the deterioration of the Lebanese economy?": "FinancialImpact",
    # Lifestyle
    "How often do you exercise?": "ExerciseFreq",
    "On average, how many hours per day do you spend on social media for entertainment and social interaction (Facebook, Instagram, YouTube, etc...)?": "SocialMediaHours",
    "How often do you feel stressed?": "StressFreq",
    # Separate from "CurrentEmploymentStatus" above: a distinct source column
    "Employment Status": "EmploymentStatusGeneral"
}

print(len(column_name_map))

# DataFrame.rename silently skips mapping keys that match no column, so a typo
# in one of the long question strings would otherwise go unnoticed. A key is
# reported only when neither the old nor the new name is present, which keeps
# the check quiet when this cell is re-run after the rename already happened.
unmatched_names = [
    old_name
    for old_name, new_name in column_name_map.items()
    if old_name not in df1.columns and new_name not in df1.columns
]
if unmatched_names:
    print(f"Warning: {len(unmatched_names)} mapping key(s) match no column in df1: {unmatched_names}")

# Apply the rename
df1 = df1.rename(columns=column_name_map)

print(df1)

42
      ID   Sector  LastPage SmokedIn30Days                 Extraverted  \
0      5  Private         5            Yes              Agree strongly   
1     11  Private         5            Yes            Agree moderately   
2     14  Private         5            Yes         Disagree moderately   
3     15  Private         5            Yes              Agree strongly   
4     16  Private         5            Yes              Agree a little   
..   ...      ...       ...            ...                         ...   
207  617   Public         5            Yes            Agree moderately   
208  618   Public         5            Yes            Agree moderately   
209  619   Public         5            Yes           Disagree a little   
210  620   Public         5            Yes            Agree moderately   
211  621   Public         5            Yes  Neither agree nor disagree   

                       Critical        Dependable              Anxious  \
0           Disagree moderately   

# **Now we want to use label-encoding to encode our data so we can start with the model:**

In [26]:
# Answer sets shared by several questions. Each column below receives its own
# copy (via list(...)) so no two entries share the same list object.
_YES_NO = ("Yes", "No")

# Seven-point Likert scale used by every personality-trait question.
_LIKERT_SCALE = (
    "Agree strongly",
    "Agree moderately",
    "Agree a little",
    "Neither agree nor disagree",
    "Disagree a little",
    "Disagree moderately",
    "Disagree strongly",
)

# Personality-trait columns, in the order they appear in the mapping.
_TRAIT_COLUMNS = (
    "Extraverted",
    "Critical",
    "Dependable",
    "Anxious",
    "OpenExp",
    "Reserved",
    "Sympathetic",
    "Disorganized",
    "Calm",
    "Conventional",
)

# Accepted answers per renamed column. The numeric columns (FirstCigAge, Age,
# NumSmokerFriends, HouseholdSize) are intentionally absent: they need no
# encoding and are left as-is.
predefined_categories = {
    "Sector": ["Private", "Public", "Other"],
    "SmokedIn30Days": list(_YES_NO),

    # Personality traits (Likert scale)
    **{trait: list(_LIKERT_SCALE) for trait in _TRAIT_COLUMNS},

    # Smoking habits
    "DifficultRefrain": list(_YES_NO),
    "CigsPerDay": [
        "10 or less cigarettes/day",
        "11 to 20 cigarettes",
        "21 to 30 cigarettes",
        "31 cigarettes/day or more",
    ],
    "SmokeMoreEarly": list(_YES_NO),
    "SmokeWhileIll": list(_YES_NO),
    "TimeToFirstCig": [
        "Within 5 minutes",
        "6 to 30 minutes",
        "31 to 60 minutes",
        "After 60 minutes",
    ],
    "HateToGiveUp": ["The first one in the morning", "All others"],

    # Smoking behaviour since the 2019 crisis
    "SmokingChangePostCrisis": [
        "The number of cigarettes I smoke per day has remained the same",
        "The number of cigarettes I smoke per day has increased",
        "The number of cigarettes I smoke per day has decreased",
    ],
    "AffordFavoriteBrand": list(_YES_NO),
    "SwitchedBrandPostCrisis": [
        "No, I am currently using my favorite or preferred cigarette brand(s)",
        "Yes, I am currently using a cheaper alternative",
    ],

    # Demographics
    "Gender": ["Male", "Female", "Other"],
    "Governorate": [
        "Beirut",
        "Mount Lebanon",
        "North Lebanon",
        "South Lebanon",
        "Bekaa",
        "Nabatieh",
        "Akkar",
        "Baalbek-Hermel",
        "Keserwan - Jbeil",
        "Other",  # To handle unexpected values
    ],
    "EducationLevel": [
        "High school degree or equivalent (e.g. GED)",
        "Incomplete bachelor's degree",
        "Bachelor's degree (BA/BS)",
        "Graduate degree (MA/MS)",
        "Post-graduate degree (PhD, MD, or other)",
        "Other",
    ],
    "CurrentEmploymentStatus": [
        "Business owner",
        "Student only",
        "Student with a part-time or full-time job",
        "Employed",
        "Unemployed",
        "Other",
    ],
    "MaritalStatus": [
        "Single",
        "Engaged",
        "In a relationship",
        "Married",
        "Other",
    ],
    "CloseFriends": list(_YES_NO),

    # Income and household
    "MainIncomeSource": [
        "Parents",
        "Own business income",
        "Job",
        "Investment",
        "Other, please specify",
    ],
    "HouseholdIncomeType": [
        "Fully in Lebanese Lira",
        "Fully in US Dollars",
        "Mixed",
        "I don't know",
        "I prefer not to say",
        "Other",
    ],
    "MonthlyIncome": [
        "Between 1 and 4 million L.L",
        "Between 4 and 8 million L.L",
        "Between 8 and 12 million L.L",
        "Between 12 and 16 million L.L",
        "Between 16 and 20 million L.L",
        "More than 20 million L.L",
        "I don't know",
        "Other",
    ],
    "IncomeSufficiency": [
        "Very low income: does not cover basic needs for a month",
        "Low: barely covers basic needs for a month",
        "Medium: covers all basic needs",
        "High: completely covers necessities with a few luxury items",
        "Extremely high: covers a wide range of luxury items",
        "Other",
    ],
    "FinancialImpact": [
        "Not at all",
        "Slightly",
        "Moderately",
        "Very",
        "Extremely",
        "Other",
    ],

    # Lifestyle
    "ExerciseFreq": [
        "Never",
        "Sometimes or a few days every month",
        "Often or at least 3 days every week",
        "Every day or at least 5 times every week",
        "Other",
    ],
    "SocialMediaHours": [
        "Less than 1 hour",
        "Between 1 hour and 2 hours",
        "Between 2 and 3 hours",
        "Between 3 and 4 hours",
        "More than 4 hours",
        "Other",
    ],
    "StressFreq": [
        "Never",
        "Occasionally",
        "Frequently",
        "Constantly",
        "Rarely",
        "Other",
    ],

    "EmploymentStatusGeneral": ["Employed", "Unemployed", "Other"],
}


In [29]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Work on a copy of df1 (the frame after dropping the free-text columns and
# renaming the columns) so df1 itself stays unchanged.
df_encoded = df1.copy()

# Identify categorical columns
categorical_columns = df_encoded.select_dtypes(include=['object', 'category']).columns

# Fitted encoder for every encoded column, kept so the integer codes can be
# mapped back to their labels later (label_encoders[col].classes_ or
# label_encoders[col].inverse_transform(...)).
label_encoders = {}

for col in categorical_columns:
    # Plain object dtype so "Other" can be written into any column
    values = df_encoded[col].astype(object)

    # Get the predefined categories if available; if not, treat every value
    # seen in the column as valid and add "Other"
    if col in predefined_categories:
        valid_categories = predefined_categories[col]
    else:
        valid_categories = values.dropna().unique().tolist() + ["Other"]

    # Missing values and values outside the valid set collapse into "Other"
    cleaned = values.where(values.isin(valid_categories), "Other")

    # Label encode.
    # NOTE(review): LabelEncoder numbers the classes in sorted order of the
    # values actually present, not in the order listed in
    # predefined_categories, so the codes of ordinal scales (e.g. the Likert
    # answers) are not in scale order — confirm this is acceptable for the model.
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(cleaned.astype(str))
    label_encoders[col] = le

# Numeric columns remain unchanged.

df_encoded.to_csv('LabelEncodedData.csv', index=False)
print("Label encoding completed with predefined categories. Non-matching and missing values mapped to 'Other'.")


Label encoding completed with predefined categories. Non-matching and missing values mapped to 'Other'.
