In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the dataset
# Assuming your dataset is in a CSV file named 'depression_data.csv'
df = pd.read_csv('Deepression.csv')

# Display the first few rows of the dataframe
print(df.head())

   Number   Sleep  Appetite  Interest  Fatigue  Worthlessness  Concentration  \
0        1    1.0       1.0       1.0      5.0            5.0            1.0   
1        2    2.0       5.0       5.0      1.0            1.0            5.0   
2        3    5.0       2.0       2.0      2.0            2.0            2.0   
3        4    1.0       1.0       1.0      5.0            5.0            1.0   
4        5    2.0       5.0       5.0      1.0            1.0            5.0   

   Agitation  Suicidal Ideation  Sleep Disturbance  Aggression  Panic Attacks  \
0        5.0                5.0                1.0         5.0            5.0   
1        1.0                1.0                5.0         1.0            1.0   
2        2.0                2.0                2.0         2.0            2.0   
3        5.0                5.0                1.0         5.0            5.0   
4        1.0                1.0                5.0         1.0            1.0   

   Hopelessness  Restlessness  L

In [2]:
# Map answers to numeric values
answer_mapping = {
    1: 'Never', 
    2: 'Always', 
    3: 'Often', 
    4: 'Rarely', 
    5: 'Sometimes', 
    6: 'Not at all'
}
for col in df.columns[:-1]:  # Apply mapping to all columns except the last one (depression state)
    df[col] = df[col].map(answer_mapping)


In [3]:
# Encode categorical features as numeric
le = LabelEncoder()
for col in df.columns[:-1]:
    df[col] = le.fit_transform(df[col])

In [4]:
# Check for missing values
print(df.isnull().sum())

# Handle missing values
# Option 1: Fill missing values with the most frequent value in each column
df.fillna(df.mode().iloc[0], inplace=True)

# Option 2: Drop rows with missing values
# df.dropna(inplace=True)

# Verify that there are no missing values left
print(df.isnull().sum())

Number                 0
Sleep                  0
Appetite               0
Interest               0
Fatigue                0
Worthlessness          0
Concentration          0
Agitation              0
Suicidal Ideation      0
Sleep Disturbance      0
Aggression             0
Panic Attacks          0
Hopelessness           0
Restlessness           0
Low Energy             0
Depression State     273
dtype: int64
Number               0
Sleep                0
Appetite             0
Interest             0
Fatigue              0
Worthlessness        0
Concentration        0
Agitation            0
Suicidal Ideation    0
Sleep Disturbance    0
Aggression           0
Panic Attacks        0
Hopelessness         0
Restlessness         0
Low Energy           0
Depression State     0
dtype: int64


In [5]:
# Split the data into features and target
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Check the distribution of the target variable
print(y.value_counts())


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Depression State
No depression       430
Mild                117
Moderate            109
Severe              108
\tNo depression      15
\tModerate           11
\tMild               11
\tSevere             10
2\tNo depression      1
5\tNo depression      1
Name: count, dtype: int64


In [6]:
# Train a RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
clf.fit(X_train, y_train)


In [7]:
# Predict on the test set
y_pred = clf.predict(X_test)


In [8]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred, zero_division=1))

Accuracy: 0.5828220858895705
                precision    recall  f1-score   support

         	Mild       0.00      0.00      0.00         2
     	Moderate       0.00      0.00      0.00         1
	No depression       0.00      0.00      0.00         3
       	Severe       1.00      0.00      0.00         1
          Mild       0.50      0.17      0.25        18
      Moderate       0.41      0.38      0.39        24
 No depression       0.97      0.75      0.85        97
        Severe       0.43      0.59      0.50        17

      accuracy                           0.58       163
     macro avg       0.41      0.24      0.25       163
  weighted avg       0.75      0.58      0.64       163



In [9]:
# Save the model to a file
joblib.dump(clf, 'depression_model.pkl')

['depression_model.pkl']