In [30]:
# train_model.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib
import os

In [32]:
print("Starting model training script...")

# --- 1. Load Data ---
# Ensure 'project_risk_data.csv' is in the same directory as this script
# (the path is relative, so it is resolved against the current working directory).
try:
    # Only the read itself is guarded, so a FileNotFoundError raised by anything
    # else can never be mistaken for a missing dataset.
    df = pd.read_csv('project_risk_data.csv')
except FileNotFoundError:
    print("Error: 'project_risk_data.csv' not found.")
    print("Please ensure you have run the data generation script and the CSV file is in the correct directory.")
    # Re-raise instead of calling exit(): a bare exit() ends a script with status 0
    # (so callers see "success"), and in a notebook it asks the kernel to shut down.
    # Propagating the error stops execution here and leaves `df` undefined.
    raise
else:
    print("Data loaded successfully from 'project_risk_data.csv'")
    print(f"Dataset shape: {df.shape}")
    print("First 5 rows of data:")
    print(df.head())

Starting model training script...
Data loaded successfully from 'project_risk_data.csv'
Dataset shape: (500, 12)
First 5 rows of data:
  project_id  project_complexity  num_overdue_tasks  num_assignees  \
0   PROJ_001                 152                 41             17   
1   PROJ_002                 485                 69              4   
2   PROJ_003                 398                 14              1   
3   PROJ_004                 320                 53              8   
4   PROJ_005                 156                 59             17   

   owner_success_rate  planned_duration_days  daily_task_updates  \
0                0.78                     31                 2.5   
1                0.74                    106                 1.0   
2                0.94                    161                 3.5   
3                0.77                    119                 5.0   
4                0.72                    118                 3.8   

   initial_budget_usd  project_dela

In [34]:
# --- 2. Data Cleaning and Preprocessing ---

# Columns excluded from the training frame:
#   * 'project_id' is a row identifier, not a predictive feature.
#   * 'start_date', 'due_date', 'actual_end_date' are raw dates that this simplified
#     example does not feed to the model (date-derived features could be engineered
#     from them in a fuller pipeline).
features_to_drop = ['project_id', 'start_date', 'due_date', 'actual_end_date']

# errors='ignore' skips any listed column that is absent instead of raising.
df_processed = df.drop(features_to_drop, axis=1, errors='ignore')

print(
    f"\nDropped columns: {features_to_drop}",
    f"Processed data shape: {df_processed.shape}",
    sep="\n",
)



Dropped columns: ['project_id', 'start_date', 'due_date', 'actual_end_date']
Processed data shape: (500, 8)


In [36]:
# Check for missing values (synthetic data should be clean, but good practice)
target_col = 'project_delayed'

if df_processed.isnull().values.any():
    # The label is never imputed: filling it would invent outcomes, and the median of
    # a 0/1 column can be 0.5, which would create a bogus third class. Rows without a
    # label are dropped instead.
    if target_col in df_processed.columns and df_processed[target_col].isnull().any():
        n_unlabelled = int(df_processed[target_col].isnull().sum())
        print(f"\nWarning: Dropping {n_unlabelled} row(s) with a missing '{target_col}' value.")
        # .copy() gives an independent frame so the column assignments below do not
        # trigger pandas' chained-assignment warning.
        df_processed = df_processed.dropna(subset=[target_col]).copy()

    feature_cols = [col for col in df_processed.columns if col != target_col]
    if df_processed[feature_cols].isnull().values.any():
        print("\nWarning: Missing values found. Imputing with median.")
        # NOTE(review): the fill statistics are computed on the full dataset, i.e. before
        # the train/test split, so test rows influence them. For a leak-free pipeline,
        # impute after splitting using statistics from the training set only.
        for col in feature_cols:
            column = df_processed[col]
            if not column.isnull().any():
                continue
            if pd.api.types.is_numeric_dtype(column):
                fill_value = column.median()
            else:  # For categorical, fill with the most frequent value
                modes = column.mode()
                fill_value = modes.iloc[0] if not modes.empty else None
            # A column with no observed values has no median/mode to fall back on.
            if fill_value is None or pd.isna(fill_value):
                print(f"Warning: Column '{col}' has no observed values; leaving it unimputed.")
                continue
            df_processed[col] = column.fillna(fill_value)

In [38]:
# Separate the target from the predictors.
# y: the 'project_delayed' column; X: every remaining column of df_processed.
y = df_processed['project_delayed']
X = df_processed.drop(columns=['project_delayed'])

In [40]:
# Summarise the design matrix and target, then list the feature columns in order.
print(
    f"\nFeatures (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    "Features used for training:",
    X.columns.tolist(),
    sep="\n",
)


Features (X) shape: (500, 7)
Target (y) shape: (500,)
Features used for training:
['project_complexity', 'num_overdue_tasks', 'num_assignees', 'owner_success_rate', 'planned_duration_days', 'daily_task_updates', 'initial_budget_usd']


In [42]:
# Hold out 20% of the rows for testing.
# stratify=y keeps the delayed / not-delayed proportions the same in both partitions,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(
    f"\nTraining set shape (X_train, y_train): {X_train.shape}, {y_train.shape}",
    f"Testing set shape (X_test, y_test): {X_test.shape}, {y_test.shape}",
    sep="\n",
)


Training set shape (X_train, y_train): (400, 7), (400,)
Testing set shape (X_test, y_test): (100, 7), (100,)


In [44]:
# Standardise the features.
# The scaler learns its statistics from the training split only; the test split is
# transformed with those same statistics so no test information leaks into fitting.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [46]:
# Optional: wrap the scaled arrays in DataFrames for inspection, restoring the original
# column labels and each split's row index.
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled_values, columns=X.columns, index=source_split.index)
    for scaled_values, source_split in ((X_train_scaled, X_train), (X_test_scaled, X_test))
)

In [48]:
# --- 3. Model Selection and Training ---
# Random forest with 100 trees and a fixed seed for reproducibility.
# class_weight='balanced' helps with imbalanced classes.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)

print(f"\nTraining {type(model).__name__} model...")
# Fit on the scaled training array produced by the scaling step.
model.fit(X_train_scaled, y_train)
print("Model training complete.")


Training RandomForestClassifier model...
Model training complete.


In [50]:
# --- 4. Evaluation ---
# Hard class predictions for the held-out split.
y_pred = model.predict(X_test_scaled)
# Second column of predict_proba, intended as the probability of being delayed.
# NOTE(review): this relies on the 'delayed' label being the second entry of model.classes_ - confirm.
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

print(
    "\n--- Model Evaluation ---",
    f"Accuracy: {accuracy_score(y_test, y_pred):.4f}",
    "\nClassification Report:",
    classification_report(
        y_test,
        y_pred,
        target_names=['Not Delayed (0)', 'Delayed (1)'],
    ),
    sep="\n",
)


--- Model Evaluation ---
Accuracy: 0.6500

Classification Report:
                 precision    recall  f1-score   support

Not Delayed (0)       0.57      0.34      0.43        38
    Delayed (1)       0.68      0.84      0.75        62

       accuracy                           0.65       100
      macro avg       0.62      0.59      0.59       100
   weighted avg       0.63      0.65      0.63       100



In [52]:
# --- 5. Save Model and Scaler ---
# Directory that will hold the saved artifacts. The path is relative, so it is
# resolved against the current working directory.
models_dir = 'models'
# exist_ok=True makes this a no-op when the directory already exists, so the cell
# can be re-run safely.
os.makedirs(models_dir, exist_ok=True)

In [54]:
# Build the three artifact paths inside models_dir in one pass.
model_path, scaler_path, feature_names_path = (
    os.path.join(models_dir, filename)
    for filename in ('random_forest_model.joblib', 'scaler.joblib', 'feature_names.joblib')
)

# Persist the fitted model, the fitted scaler, and the ordered feature names.
# The feature list is saved so inputs can be arranged in the same column order later.
joblib.dump(model, model_path)
joblib.dump(scaler, scaler_path)
joblib.dump(X.columns.tolist(), feature_names_path)

['models\\feature_names.joblib']

In [56]:
# Report where each artifact was written, then signal the end of the run.
print(
    f"\nModel saved to: {model_path}",
    f"Scaler saved to: {scaler_path}",
    f"Feature names saved to: {feature_names_path}",
    sep="\n",
)
print("\nModel training script finished.")


Model saved to: models\random_forest_model.joblib
Scaler saved to: models\scaler.joblib
Feature names saved to: models\feature_names.joblib

Model training script finished.
