In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
import os
import joblib

# Create the folders that later steps write into (saved model, ML figures).
# exist_ok=True makes this safe to re-run.
for output_dir in ('../models', '../reports/figures/ml'):
    os.makedirs(output_dir, exist_ok=True)

# Read the processed survey dataset into a DataFrame
processed_csv = '../data/processed/mental_health_tech_analyzed.csv'
df = pd.read_csv(processed_csv)

# Prepare features and target.
# Goal: predict whether a respondent is comfortable discussing mental health
# with their supervisor.

# Build the binary target: 1 when the recorded answer is exactly 'Yes', 0 for
# any other recorded answer. Rows with NO recorded answer are kept as NaN.
# A bare `(col == 'Yes').astype(int)` would turn missing answers into 0, which
# makes the dropna() below a no-op and trains on unanswered rows as negatives.
target_source = df['mental_health_discussion_comfort_supervisor']
df['comfortable_with_supervisor'] = (
    (target_source == 'Yes').astype(int).where(target_source.notna())
)

# Select features
categorical_features = ['self_employed', 'tech_company', 'number_of_employees']
# Every '*_binary' column except the one derived from the target question,
# which would otherwise leak the answer into the features.
binary_features = [
    col for col in df.columns
    if col.endswith('_binary')
    and col != 'mental_health_discussion_comfort_supervisor_binary'
]
numeric_features = ['year']

# Only use rows with a non-null target (i.e. the question was answered)
df_model = df.dropna(subset=['comfortable_with_supervisor'])

# Split features and target
X = df_model[categorical_features + binary_features + numeric_features]
# NaN-bearing columns are float; cast back to int class labels for the classifier
y = df_model['comfortable_with_supervisor'].astype(int)

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible
split_options = {'test_size': 0.25, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Preprocessing, one sub-pipeline per column group

# Numeric columns: fill gaps with the median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Binary columns: only fill gaps with the most frequent value
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group to its sub-pipeline. The order here determines the
# column order of the transformed feature matrix.
column_groups = [
    ('cat', categorical_transformer, categorical_features),
    ('num', numeric_transformer, numeric_features),
    ('bin', binary_transformer, binary_features),
]
preprocessor = ColumnTransformer(transformers=column_groups)

# Full model: preprocessing followed by a 100-tree random forest
forest = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', forest),
])

# Fit preprocessing and classifier together on the training split
rf_pipeline.fit(X_train, y_train)

# Score the held-out test split
y_pred = rf_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1 summary
print("\nClassification Report:")
report = classification_report(y_test, y_pred)
print(report)

# Confusion matrix, drawn as an annotated heatmap and saved to disk
cm = confusion_matrix(y_test, y_pred)
heatmap_style = dict(annot=True, fmt='d', cmap='Blues', cbar=False)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, **heatmap_style)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
# Save before show(): showing may clear the current figure
plt.savefig('../reports/figures/ml/confusion_matrix.png')
plt.show()

# Feature importance (for Random Forest)

# ColumnTransformer fits *clones* of the transformers it is given, so the
# module-level `categorical_transformer` is never fitted and asking it for
# feature names raises NotFittedError. Read the fitted encoder back out of the
# trained pipeline instead.
fitted_preprocessor = rf_pipeline.named_steps['preprocessor']
fitted_onehot = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']

# Names follow the ColumnTransformer output order: one-hot columns first,
# then numeric, then binary.
feature_names = (
    fitted_onehot.get_feature_names_out(categorical_features).tolist()
    + numeric_features
    + binary_features
)

# Get feature importances
importances = rf_pipeline.named_steps['classifier'].feature_importances_

# Fail loudly on a mismatch rather than truncating the name list, which would
# silently attach importances to the wrong feature names.
if len(feature_names) != len(importances):
    raise ValueError(
        f"Feature name count ({len(feature_names)}) does not match the number "
        f"of importances ({len(importances)}); a preprocessing step may have "
        "dropped or added columns."
    )

# Create DataFrame for feature importances, most important first
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

# Plot the 15 most important features
plt.figure(figsize=(10, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15))
plt.title('Top 15 Feature Importances')
plt.tight_layout()
plt.savefig('../reports/figures/ml/feature_importance.png')
plt.show()

# 5-fold cross-validation of the whole pipeline on the full feature matrix
cv_scores = cross_val_score(rf_pipeline, X, y, cv=5)
print(f"\nCross-validation scores: {cv_scores}")
print(f"Mean CV score: {cv_scores.mean():.4f}")

# Persist the fitted pipeline (preprocessing + classifier) for later reuse
model_path = '../models/mental_health_predictor.pkl'
joblib.dump(rf_pipeline, model_path)

# Write test-set labels next to the model's predictions for further analysis;
# the row index is omitted from the CSV
predictions_path = '../data/processed/model_predictions.csv'
test_results = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
test_results.to_csv(predictions_path, index=False)

print("Predictive modeling complete!")

FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/mental_health_tech_analyzed.csv'