In [None]:

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np # For potential median filling


In [None]:
FILE_NAME = 'Psychologica_data.csv'
TARGET_COLUMN = 'Condition Summary' # Original name of the target column
ENCODED_TARGET_COLUMN = 'emotional_state_encoded'


In [None]:
NEW_COLUMN_NAMES = [
    'mood',
    'anxious_social_scale',
    'anxiety_triggers',
    'sleep_quality',
    'appetite_change',
    'lack_of_interest',
    'enjoyable_activities',
    'physical_anxiety_symptoms',
    'concentration_difficulty',
    'coping_strategies',
    'condition_summary' # Keep the original target column name here for now
]

print("--- Starting Data Preprocessing ---")

--- Starting Data Preprocessing ---


In [None]:
try:
    df = pd.read_csv(FILE_NAME)
    print(f"Dataset '{FILE_NAME}' loaded successfully.")
    print(f"Initial shape: {df.shape}")
    print("\nInitial DataFrame Info:")
    df.info()
except FileNotFoundError:
    print(f"Error: The file '{FILE_NAME}' was not found. Please ensure it's in the same directory as this script.")
    exit() # Exit if the file isn't found

Dataset 'Psychologica_data.csv' loaded successfully.
Initial shape: (9504, 11)

Initial DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9504 entries, 0 to 9503
Data columns (total 11 columns):
 #   Column                                                                                                                               Non-Null Count  Dtype 
---  ------                                                                                                                               --------------  ----- 
 0   Mood: How would you describe your mood over the past two weeks?                                                                      9504 non-null   object
 1   Anxious Social Scale: On a scale of 1-10, how often have you felt anxious in social situations recently?                             9504 non-null   object
 2   Anxiety Triggers: Have you experienced any of the following anxiety triggers in the past month?                                      9504 no

In [None]:
if len(df.columns) == len(NEW_COLUMN_NAMES):
    df.columns = NEW_COLUMN_NAMES
    print("Columns renamed successfully.")
else:
    print(f"Mismatch in column count! CSV has {len(df.columns)}, but NEW_COLUMN_NAMES has {len(NEW_COLUMN_NAMES)}.")
    print("Original column names:", df.columns.tolist())
    exit()



Columns renamed successfully:
['mood', 'anxious_social_scale', 'anxiety_triggers', 'sleep_quality', 'appetite_change', 'lack_of_interest', 'enjoyable_activities', 'physical_anxiety_symptoms', 'concentration_difficulty', 'coping_strategies', 'condition_summary']

First 5 rows with new column names:
              mood anxious_social_scale     anxiety_triggers  \
0        Happiness       Mildly anxious        Family issues   
1     Irritability     Somewhat anxious  Work-related stress   
2        Happiness     Slightly anxious  Work-related stress   
3      Fluctuating     Somewhat anxious    None of the above   
4  Extreme sadness         Very anxious        Family issues   

               sleep_quality     appetite_change lack_of_interest  \
0       Early morning waking  Increased cravings     Occasionally   
1                    Restful    Fluctuates daily           Always   
2       Early morning waking    Fluctuates daily           Always   
3                Interrupted    Fluctua

In [None]:
print(f"\nMissing values before processing:\n{df.isnull().sum()}")



Missing values before processing:
mood                         0
anxious_social_scale         0
anxiety_triggers             0
sleep_quality                0
appetite_change              0
lack_of_interest             0
enjoyable_activities         0
physical_anxiety_symptoms    0
concentration_difficulty     0
coping_strategies            0
condition_summary            0
dtype: int64


In [None]:
print("\n--- Processing 'mood' column ---")
print("Unique values in 'mood':")
print(df['mood'].unique())
mood_mapping = {
    'Extreme sadness': 0,
    'Very sad': 1,
    'Somewhat sad': 2,
    'Irritability': 3,
    'Fluctuating': 4,
    'Neutral': 5, # Added as a common neutral/middle ground
    'Slightly anxious': 6,
    'Somewhat anxious': 7,
    'Mildly anxious': 8,
    'Slightly happy': 9,
    'Happiness': 10,
    'Very happy': 11
}
df['mood_encoded'] = df['mood'].map(mood_mapping)
# Handle potential unmapped values (NaNs)
if df['mood_encoded'].isnull().sum() > 0:
    unmapped_moods = df.loc[df['mood_encoded'].isnull(), 'mood'].unique()
    print(f"Warning: Unmapped 'mood' values found: {unmapped_moods}. Filling with median.")
    df['mood_encoded'] = df['mood_encoded'].fillna(df['mood_encoded'].median())
print("'mood' column encoded.")



--- Processing 'mood' column ---
Unique values in 'mood':
['Happiness' 'Irritability' 'Fluctuating' 'Extreme sadness' 'Stable'
 'Anxiety' 'Mild sadness']
'mood' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxious_social_scale' column ---")
print("Unique values in 'anxious_social_scale':")
print(df['anxious_social_scale'].unique())
anxious_social_mapping = {
    'Not at all anxious': 0,
    'Rarely anxious': 1,
    'Slightly anxious': 2,
    'Somewhat anxious': 3,
    'Often anxious': 4,
    'Very anxious': 5,
    'Extremely anxious': 6
}
df['anxious_social_scale_encoded'] = df['anxious_social_scale'].map(anxious_social_mapping)
if df['anxious_social_scale_encoded'].isnull().sum() > 0:
    unmapped_anxious = df.loc[df['anxious_social_scale_encoded'].isnull(), 'anxious_social_scale'].unique()
    print(f"Warning: Unmapped 'anxious_social_scale' values found: {unmapped_anxious}. Filling with median.")
    df['anxious_social_scale_encoded'] = df['anxious_social_scale_encoded'].fillna(df['anxious_social_scale_encoded'].median())
print("'anxious_social_scale' column encoded.")



--- Processing 'anxious_social_scale' column ---
Unique values in 'anxious_social_scale':
['Mildly anxious' 'Somewhat anxious' 'Slightly anxious' 'Very anxious'
 'Extremely anxious' 'Rarely anxious' 'Fairly anxious'
 'Moderately anxious' 'Constantly anxious' 'Not at all']
 'Constantly anxious' 'Not at all']. Filling with median.
'anxious_social_scale' column encoded.


In [None]:
print("\n--- Processing 'anxiety_triggers' column ---")
print("Unique values in 'anxiety_triggers':")
print(df['anxiety_triggers'].unique())
# This is a categorical column. One-Hot Encoding is a good choice if multiple triggers are not common in one cell.
# If a cell can contain "Work-related stress, Family issues", then you'd need more complex splitting/binary flags.
# For simplicity and given the sample, we'll assume single categories per cell or 'None of the above'.
df = pd.get_dummies(df, columns=['anxiety_triggers'], prefix='trigger')
print("'anxiety_triggers' column one-hot encoded.")


--- Processing 'anxiety_triggers' column ---
Unique values in 'anxiety_triggers':
['Family issues' 'Work-related stress' 'None of the above'
 'Financial concerns' 'Social situations' 'Health concerns']
'anxiety_triggers' column one-hot encoded.


In [None]:
print("\n--- Processing 'sleep_quality' column ---")
print("Unique values in 'sleep_quality':")
print(df['sleep_quality'].unique())
sleep_quality_mapping = {
    'Difficulty staying asleep': 0,
    'Early morning waking': 1,
    'Interrupted': 2,
    'Restless': 3,
    'Normal': 4,
    'Restful': 5,
    'Excellent': 6
}
df['sleep_quality_encoded'] = df['sleep_quality'].map(sleep_quality_mapping)
if df['sleep_quality_encoded'].isnull().sum() > 0:
    unmapped_sleep = df.loc[df['sleep_quality_encoded'].isnull(), 'sleep_quality'].unique()
    print(f"Warning: Unmapped 'sleep_quality' values found: {unmapped_sleep}. Filling with median.")
    df['sleep_quality_encoded'] = df['sleep_quality_encoded'].fillna(df['sleep_quality_encoded'].median())
print("'sleep_quality' column encoded.")


--- Processing 'sleep_quality' column ---
Unique values in 'sleep_quality':
['Early morning waking' 'Restful' 'Interrupted'
 'Difficulty staying asleep' 'None of the above' 'Trouble falling asleep']
'sleep_quality' column encoded.


In [None]:
print("\n--- Processing 'appetite_change' column ---")
print("Unique values in 'appetite_change':")
print(df['appetite_change'].unique())
appetite_change_mapping = {
    'Loss of appetite': 0,
    'Decreased': 1,
    'Fluctuates daily': 2,
    'No significant change': 3,
    'Increased cravings': 4,
    'Increased': 5
}
df['appetite_change_encoded'] = df['appetite_change'].map(appetite_change_mapping)
if df['appetite_change_encoded'].isnull().sum() > 0:
    unmapped_appetite = df.loc[df['appetite_change_encoded'].isnull(), 'appetite_change'].unique()
    print(f"Warning: Unmapped 'appetite_change' values found: {unmapped_appetite}. Filling with median.")
    df['appetite_change_encoded'] = df['appetite_change_encoded'].fillna(df['appetite_change_encoded'].median())
print("'appetite_change' column encoded.")


--- Processing 'appetite_change' column ---
Unique values in 'appetite_change':
['Increased cravings' 'Fluctuates daily' 'Loss of appetite' 'No change']
'appetite_change' column encoded.


In [None]:
print("\n--- Processing 'lack_of_interest' column ---")
print("Unique values in 'lack_of_interest':")
print(df['lack_of_interest'].unique())
lack_of_interest_mapping = {
    'Never': 0,
    'Rarely': 1,
    'Occasionally': 2,
    'Frequently': 3,
    'Always': 4
}
df['lack_of_interest_encoded'] = df['lack_of_interest'].map(lack_of_interest_mapping)
if df['lack_of_interest_encoded'].isnull().sum() > 0:
    unmapped_interest = df.loc[df['lack_of_interest_encoded'].isnull(), 'lack_of_interest'].unique()
    print(f"Warning: Unmapped 'lack_of_interest' values found: {unmapped_interest}. Filling with median.")
    df['lack_of_interest_encoded'] = df['lack_of_interest_encoded'].fillna(df['lack_of_interest_encoded'].median())
print("'lack_of_interest' column encoded.")


--- Processing 'lack_of_interest' column ---
Unique values in 'lack_of_interest':
['Occasionally' 'Always' 'Frequently' 'Rarely' 'Never']
'lack_of_interest' column encoded.


In [None]:
print("\n--- Processing 'enjoyable_activities' column ---")
print("Unique values in 'enjoyable_activities':")
print(df['enjoyable_activities'].unique())
enjoyable_activities_mapping = {
    'Never': 0,
    'Rarely': 1,
    'Once a week': 2,
    'A few times a week': 3,
    'Daily': 4,
    'Always': 5 # Or 'Very Often'
}
df['enjoyable_activities_encoded'] = df['enjoyable_activities'].map(enjoyable_activities_mapping)
if df['enjoyable_activities_encoded'].isnull().sum() > 0:
    unmapped_enjoy = df.loc[df['enjoyable_activities_encoded'].isnull(), 'enjoyable_activities'].unique()
    print(f"Warning: Unmapped 'enjoyable_activities' values found: {unmapped_enjoy}. Filling with median.")
    df['enjoyable_activities_encoded'] = df['enjoyable_activities_encoded'].fillna(df['enjoyable_activities_encoded'].median())
print("'enjoyable_activities' column encoded.")


--- Processing 'enjoyable_activities' column ---
Unique values in 'enjoyable_activities':
['Rarely' 'A few times a week' 'Once a week' 'Never' 'Daily']
'enjoyable_activities' column encoded.


In [None]:
print("\n--- Processing 'physical_anxiety_symptoms' column ---")
print("Unique values in 'physical_anxiety_symptoms':")
print(df['physical_anxiety_symptoms'].unique())
physical_anxiety_mapping = {
    'No': 0,
    'Rarely': 1,
    'Yes, occasionally': 2,
    'Yes, frequently': 3,
    'Always': 4
}
df['physical_anxiety_symptoms_encoded'] = df['physical_anxiety_symptoms'].map(physical_anxiety_mapping)
if df['physical_anxiety_symptoms_encoded'].isnull().sum() > 0:
    unmapped_physical = df.loc[df['physical_anxiety_symptoms_encoded'].isnull(), 'physical_anxiety_symptoms'].unique()
    print(f"Warning: Unmapped 'physical_anxiety_symptoms' values found: {unmapped_physical}. Filling with median.")
    df['physical_anxiety_symptoms_encoded'] = df['physical_anxiety_symptoms_encoded'].fillna(df['physical_anxiety_symptoms_encoded'].median())
print("'physical_anxiety_symptoms' column encoded.")


--- Processing 'physical_anxiety_symptoms' column ---
Unique values in 'physical_anxiety_symptoms':
['Yes, occasionally' 'Yes, frequently' 'Rarely' 'No, not at all']
'physical_anxiety_symptoms' column encoded.


In [None]:
print("\n--- Processing 'concentration_difficulty' column ---")
print("Unique values in 'concentration_difficulty':")
print(df['concentration_difficulty'].unique())
concentration_difficulty_mapping = {
    'Never': 0,
    'Rarely': 1,
    'Occasionally': 2,
    'Frequently': 3,
    'Constantly': 4
}
df['concentration_difficulty_encoded'] = df['concentration_difficulty'].map(concentration_difficulty_mapping)
if df['concentration_difficulty_encoded'].isnull().sum() > 0:
    unmapped_conc = df.loc[df['concentration_difficulty_encoded'].isnull(), 'concentration_difficulty'].unique()
    print(f"Warning: Unmapped 'concentration_difficulty' values found: {unmapped_conc}. Filling with median.")
    df['concentration_difficulty_encoded'] = df['concentration_difficulty_encoded'].fillna(df['concentration_difficulty_encoded'].median())
print("'concentration_difficulty' column encoded.")


--- Processing 'concentration_difficulty' column ---
Unique values in 'concentration_difficulty':
['Occasionally' 'Constantly' 'Never' 'Frequently']
'concentration_difficulty' column encoded.


In [None]:
print("\n--- Processing 'coping_strategies' column ---")
print("Unique values in 'coping_strategies':")
print(df['coping_strategies'].unique())
# This can be tricky if multiple strategies are listed in one cell.
# For simplicity, we'll create a binary flag: 1 if any strategy is mentioned, 0 if "No coping strategies".
df['has_coping_strategies'] = df['coping_strategies'].apply(lambda x: 0 if 'No coping strategies' in str(x) else 1)
print("'coping_strategies' column simplified to binary flag.")



--- Processing 'coping_strategies' column ---
Unique values in 'coping_strategies':
['Physical activity' 'Journaling or writing' 'No coping strategies'
 'Social engagement' 'Mindfulness or meditation']
'coping_strategies' column simplified to binary flag.


In [None]:
print("\n--- Processing Target Column: 'condition_summary' ---")
print("Unique values in 'condition_summary' (before mapping to Happy/Normal/Sad):")
print(df['condition_summary'].unique())


--- Processing Target Column: 'condition_summary' ---
Unique values in 'condition_summary' (before mapping to Happy/Normal/Sad):
['Sleep Disorders: Sleep disturbances potentially linked to mood disorders or anxiety, impacting overall well-being.'
 'Stress-Related Conditions: Stress due to identifiable triggers, often linked to reduced self-care and coping strategy use.'
 'Mood Disorders: Indicators include mood instability, lack of interest in activities, appetite changes, and sleep disturbances.'
 'Cognitive Impairments: Difficulty focusing, potentially related to anxiety, fatigue, or mood disorders.'
 'Eating Disorders: Appetite fluctuations, often tied to mood changes and stress.'
 'Generalized Anxiety Disorder: Characterized by chronic worry, physical anxiety symptoms, and difficulty concentrating.'
 'General Mental Health: Responses do not strongly align with specific conditions but indicate areas for further exploration.'
 'Post-Traumatic Stress Disorder: Anxiety symptoms tied t

In [27]:
def map_condition_to_emotion(summary):
    summary = str(summary).lower() # Convert to string and lowercase for robust matching
    if any(phrase in summary for phrase in ['happiness', 'positive outlook', 'well-being', 'good mental state', 'optimistic']):
        return 'Happy'
    elif any(phrase in summary for phrase in ['extreme sadness', 'very sad', 'depression', 'mood disorders', 'difficulty concentrating', 'loss of interest', 'significant distress', 'overwhelmed', 'hopeless']):
        return 'Sad'
    # 'Normal' or 'Neutral' conditions, or perhaps 'Mildly Anxious'
    elif any(phrase in summary for phrase in ['irritability', 'fluctuating', 'somewhat anxious', 'mildly anxious', 'sleep disorders', 'stress-related conditions', 'physical symptoms', 'mild anxiety', 'unspecified discomfort', 'somewhat anxious', 'unstable', 'average']):
        return 'Normal'
    else:
        # Fallback for any unmapped summaries. This is important to catch all data points.
        # You might want to print these during development to refine your mapping.
        # print(f"Warning: Unmapped summary '{summary}' - defaulting to Normal")
        return 'Normal'

df['emotional_state_label'] = df['condition_summary'].apply(map_condition_to_emotion)

In [28]:
print("\nDistribution of mapped emotional states:")
print(df['emotional_state_label'].value_counts())


Distribution of mapped emotional states:
emotional_state_label
Sad       4293
Happy     3606
Normal    1605
Name: count, dtype: int64


In [60]:
le = LabelEncoder()
df[ENCODED_TARGET_COLUMN] = le.fit_transform(df['emotional_state_label'])

print(f"\nLabelEncoder classes (numerical mapping for '{ENCODED_TARGET_COLUMN}'):")
# This tells you which number corresponds to which label (e.g., 0: 'Happy', 1: 'Normal', 2: 'Sad')
print(list(le.classes_))


LabelEncoder classes (numerical mapping for 'emotional_state_encoded'):
['Happy', 'Normal', 'Sad']


In [62]:
import joblib
joblib.dump(le, 'label_encoder.pkl')
print("LabelEncoder saved to 'label_encoder.pkl'.")

LabelEncoder saved to 'label_encoder.pkl'.


In [73]:
columns_to_drop_final = [
    'mood', 'anxious_social_scale', 'sleep_quality',
    'appetite_change', 'lack_of_interest', 'enjoyable_activities',
    'physical_anxiety_symptoms', 'concentration_difficulty', 'coping_strategies',
    'condition_summary',        
    'emotional_state_label'     
]

In [75]:
df_processed = df.drop(columns=columns_to_drop_final, errors='ignore')

print("\n--- Preprocessing Complete ---")
print(f"Final processed DataFrame shape: {df_processed.shape}")
print("\nFinal Processed DataFrame (first 5 rows - ready for ML):")
print(df_processed.head())
print("\nFinal Processed DataFrame Columns:")
print(df_processed.columns.tolist())



--- Preprocessing Complete ---
Final processed DataFrame shape: (9504, 16)

Final Processed DataFrame (first 5 rows - ready for ML):
   mood_encoded  anxious_social_scale_encoded  trigger_Family issues  \
0          10.0                           3.0                   True   
1           3.0                           3.0                  False   
2          10.0                           2.0                  False   
3           4.0                           3.0                  False   
4           0.0                           5.0                   True   

   trigger_Financial concerns  trigger_Health concerns  \
0                       False                    False   
1                       False                    False   
2                       False                    False   
3                       False                    False   
4                       False                    False   

   trigger_None of the above  trigger_Social situations  \
0                      Fa

In [None]:
# --- END OF PREPROCESSING ---


In [79]:
X = df_processed.drop(ENCODED_TARGET_COLUMN, axis=1) # Features are all columns except the encoded target
y = df_processed[ENCODED_TARGET_COLUMN]             # Target is the encoded emotional state


In [81]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [83]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
print("\n--- Training Machine Learning Model ---")


--- Training Machine Learning Model ---


In [85]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
print("Model trained successfully.")

Model trained successfully.


In [87]:
y_pred = model.predict(X_test)
print(f"\nModel Accuracy on Test Set: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:")
# You'll need to use the LabelEncoder to get the actual class names for the report
target_names = le.classes_ # These are ['Happy', 'Normal', 'Sad'] or similar
print(classification_report(y_test, y_pred, target_names=target_names))


Model Accuracy on Test Set: 0.67

Classification Report:
              precision    recall  f1-score   support

       Happy       0.61      0.78      0.68       721
      Normal       0.61      0.47      0.53       321
         Sad       0.76      0.64      0.70       859

    accuracy                           0.67      1901
   macro avg       0.66      0.63      0.64      1901
weighted avg       0.68      0.67      0.66      1901



In [89]:
joblib.dump(model, 'emotion_model.pkl')
print("\nTrained model saved to 'emotion_model.pkl'.")

print("\n--- ML Model Training Complete ---")
print("You now have a processed DataFrame, a trained model, and a LabelEncoder saved.")
print("The next step is to build your Streamlit (or other UI) application.")


Trained model saved to 'emotion_model.pkl'.

--- ML Model Training Complete ---
You now have a processed DataFrame, a trained model, and a LabelEncoder saved.
The next step is to build your Streamlit (or other UI) application.
