In [15]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix

In [16]:
# Read the training split and the test split from the working directory.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [17]:
def process_time_intervals(df, short_sleep_hours=4.5, long_sleep_hours=9.5):
    """Return a copy of ``df`` with the two time-like columns converted to floats.

    ``'Sleep Duration'`` holds free-text values; each one is mapped as follows
    (matching is case-insensitive and the word ``hours`` is ignored):

    * missing value                  -> NaN
    * range such as ``"5-6 hours"``  -> mean of the parsable parts (5.5)
    * text containing ``less than``  -> ``short_sleep_hours``
    * text containing ``more than``  -> ``long_sleep_hours``
    * single number (``"7 hours"``, ``"6.5"``, or an already numeric 6.5)
                                     -> that number
    * anything else                  -> NaN

    ``'Work/Study Hours'`` is converted with ``float()``; values that cannot be
    converted become NaN.

    Because plain numbers (including decimals) are accepted, running the
    function on its own output leaves the data unchanged, so re-executing the
    notebook cell that calls it does not wipe out the parsed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'Sleep Duration'`` and ``'Work/Study Hours'``.
    short_sleep_hours : float, default 4.5
        Stand-in value used for "less than ..." answers.
    long_sleep_hours : float, default 9.5
        Stand-in value used for "more than ..." answers.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.

    Raises
    ------
    KeyError
        If either of the two columns is missing from ``df``.
    """
    df = df.copy()

    def _finite_float(token):
        """Return ``token`` parsed as a finite float, or ``None`` if it is not one."""
        try:
            value = float(token.strip())
        except ValueError:
            return None
        # Reject "nan" / "inf" spellings that float() would otherwise accept.
        return value if np.isfinite(value) else None

    def parse_sleep_duration(x):
        """Map one raw sleep-duration entry to a number of hours (or NaN)."""
        if pd.isna(x):
            return np.nan
        text = str(x).lower()
        cleaned = text.replace('hours', '').strip()
        if '-' in text:
            # Keep whichever sides of the range are numeric and average them.
            parts = [
                value
                for value in (_finite_float(piece) for piece in cleaned.split('-'))
                if value is not None
            ]
            return float(np.mean(parts)) if parts else np.nan
        if 'less than' in text:
            return short_sleep_hours
        if 'more than' in text:
            return long_sleep_hours
        value = _finite_float(cleaned)
        return np.nan if value is None else value

    def parse_work_study(x):
        """Convert one raw work/study-hours entry to float, NaN when impossible."""
        try:
            return float(x)
        except (TypeError, ValueError, OverflowError):
            # Only conversion failures are treated as missing data; anything
            # else (e.g. KeyboardInterrupt) is allowed to propagate.
            return np.nan

    df['Sleep Duration'] = df['Sleep Duration'].apply(parse_sleep_duration)
    df['Work/Study Hours'] = df['Work/Study Hours'].apply(parse_work_study)

    return df

# Convert the time-like columns to numbers in both splits.
train_df, test_df = map(process_time_intervals, (train_df, test_df))

# Target vector, then the predictor frames with the target and the
# 'Name' / 'id' columns removed.
y_train = train_df['Depression']
X_train = train_df.drop(['Depression', 'Name', 'id'], axis=1)
X_test = test_df.drop(['Name', 'id'], axis=1)

In [18]:
# Columns routed through the numeric branch of the preprocessor.
numeric_features = [
    'Age',
    'CGPA',
    'Sleep Duration',
    'Work/Study Hours',
    'Academic Pressure',
    'Work Pressure',
    'Financial Stress',
    'Study Satisfaction',
    'Job Satisfaction',
]

# Columns routed through the categorical branch of the preprocessor.
categorical_features = [
    'Gender',
    'City',
    'Working Professional or Student',
    'Profession',
    'Dietary Habits',
    'Degree',
    'Have you ever had suicidal thoughts ?',
    'Family History of Mental Illness',
]

In [19]:
# Numeric branch: fill missing values with the column median, then scale.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode into a dense array. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Apply each branch to its own list of columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


In [20]:
# End-to-end estimator: preprocessing followed by a random forest.
# class_weight='balanced' reweights the classes; random_state fixes the seed.
model = Pipeline([
    ('preprocessor', preprocessor),
    (
        'classifier',
        RandomForestClassifier(n_estimators=200, class_weight='balanced', random_state=42),
    ),
])


In [21]:
# Stratified 5-fold cross-validation (shuffled, fixed seed), scored with F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(estimator=model, X=X_train, y=y_train, scoring='f1', cv=cv)

# Report the per-fold scores followed by their mean.
print(
    f"F1 scores (CV): {scores}",
    f"Average F1: {np.mean(scores):.4f}",
    sep="\n",
)

F1 scores (CV): [0.73214286 0.81967213 0.68965517 0.73684211 0.8       ]
Average F1: 0.7557


In [22]:
# Fit on the full training set and predict the test set.
model.fit(X_train, y_train)
test_pred = model.predict(X_test)

# Pair every test id with its prediction and write the submission file.
submission = test_df[['id']].assign(Depression=test_pred)
submission.to_csv('submission.csv', index=False)
print("Submission file created.")

Submission file created.
