# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import joblib
# Load and prepare data
TARGET_COL = 'Estimated Total Direct Time'

df = pd.read_csv('data_final.csv')

# Critical data cleaning: remove columns that must not reach the model as
# features (everything left in df except the target becomes a feature below).
df = df.drop(columns=[
    'Unnamed: 0',
    'Project ID',
    'Estimated Salary Hours',
    'Total Direct Time for Project for Hourly Employees (Including Drive Time)',
    # NOTE: the next 3 columns are removed here, but that removal is not
    # reflected in the existing (previously trained) model.
    'Total # of Days on Site',
    'Total # Hourly Empoyees on Site',
    'Estimated # of Salaried Employees on Site'
])

# Handle missing values.
# A missing Azimuth2 / Azimuth3 value is encoded as 0. Both columns are filled
# in a single step (the original filled Azimuth2 twice).
azimuth_cols = ['Azimuth2', 'Azimuth3']
df[azimuth_cols] = df[azimuth_cols].fillna(0)

# Rows with no target value carry no supervision signal. Filling the target
# with the column mean would fabricate labels, so drop those rows instead.
df = df.dropna(subset=[TARGET_COL])

# Remaining numeric gaps in the features are deliberately NOT filled here.
# Filling with the whole-dataset mean before the train/validation/test split
# leaks held-out statistics into training. The SimpleImputer(strategy='mean')
# step of the numerical pipeline defined further down fits on the training
# data only and handles these gaps.

# Convert the target to minutes (x60; source values are presumably in hours).
df[TARGET_COL] = df[TARGET_COL] * 60

# Separate features (X) and target variable (y)
X = df.drop(columns=[TARGET_COL])
y = df[TARGET_COL]

# Column groups: object-dtype columns are treated as categorical, every other
# dtype as numerical. Iterating a DataFrame yields its column labels.
categorical_cols = list(X.select_dtypes(include=['object']))
numerical_cols = list(X.select_dtypes(exclude=['object']))

# Numerical features: fill gaps with the column mean learned at fit time.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at transform time instead
# of raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])

# Train / validation / test split.
# First hold out 20% of the rows for testing, then take 25% of the remaining
# 80% for validation, i.e. roughly a 60/20/20 split overall. Both splits use
# the same fixed seed so the partition is reproducible.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.25, random_state=42
)

# Forest sizes to sweep: 10, 20, ..., 290 trees.
n_range = range(10, 300, 10)

# One MAE entry per forest size, appended in the same order as n_range.
train_mae, val_mae, test_mae = [], [], []

def constrained_predict(X, y_true):
    """Return predictions of the current ``model`` clamped around ``y_true``.

    Each prediction is raised to at least the true value (no underestimate)
    and lowered to at most the true value plus 120 (max 120 overestimate).

    WARNING: this clamp needs the ground-truth labels, so it cannot be applied
    at inference time, and any error metric computed from its output is forced
    into the range [0, 120] regardless of model quality. It is therefore not
    used for the error curves below. It is defined once here, rather than on
    every loop iteration, in case other cells still call it.

    Args:
        X: feature rows accepted by ``model.predict``.
        y_true: pandas Series of true targets aligned with ``X``.

    Returns:
        numpy array of clamped predictions, one per row of ``X``.
    """
    preds = model.predict(X)  # `model` is the most recently fitted pipeline
    lower = y_true.values
    return np.clip(preds, lower, lower + 120)


for n in n_range:
    # Fresh pipeline per forest size; the fixed seed keeps runs reproducible.
    model = Pipeline([
        ('preprocess', preprocessor),
        ('rf', RandomForestRegressor(n_estimators=n, random_state=42))
    ])
    model.fit(X_train, y_train)

    # Score the model's raw predictions. The original clamped them with the
    # true labels first, which leaks the answer into the metric and makes the
    # curves meaningless for comparing forest sizes.
    train_preds = model.predict(X_train)
    val_preds = model.predict(X_val)
    test_preds = model.predict(X_test)

    # Calculate MAEs
    # NOTE: the test MAE is tracked for plotting only; pick n_estimators from
    # the validation curve so the test set stays an unbiased final check.
    train_mae.append(mean_absolute_error(y_train, train_preds))
    val_mae.append(mean_absolute_error(y_val, val_preds))
    test_mae.append(mean_absolute_error(y_test, test_preds))

# Plot all MAE curves together
plt.figure(figsize=(10, 6))

# Draw the three curves in a fixed order (train, validation, test) against
# the number of estimators.
for curve_label, curve_values in (
    ('Train MAE', train_mae),
    ('Validation MAE', val_mae),
    ('Test MAE', test_mae),
):
    plt.plot(n_range, curve_values, marker='o', label=curve_label)

# Axis labelling and decorations.
plt.title('Random Forest Error Curves')
plt.xlabel('Number of Estimators')
plt.ylabel('Mean Absolute Error (minutes)')
plt.legend()
plt.grid(True)
plt.show()

# Saving is intentionally left disabled. At this point `model` is whichever
# pipeline the sweep fitted last, not necessarily the best-scoring one.
#joblib.dump(model, 'model.joblib')
