In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the datasets
train_path = "New_DataSet.xlsx"
test_path = "Project_DataSet.xlsx"

train_df = pd.read_excel(train_path)
test_df = pd.read_excel(test_path)

# Drop extra columns from the test dataset
extra_columns = ["PRICE:", "Total", "Total.1"]
test_df = test_df.drop(columns=[col for col in extra_columns if col in test_df.columns], errors="ignore")

# Fill missing values with column means
train_df = train_df.fillna(train_df.mean())
test_df = test_df.fillna(test_df.mean())

# Ensure the column order matches exactly
test_df = test_df[train_df.columns]

# Define target column
target_column = "Net profit"

# Split into features (X) and target (y)
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

# Train the model
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, min_samples_split=2, min_samples_leaf=2)
model.fit(X_train, y_train)

# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Evaluate Training Performance
train_mae = mean_absolute_error(y_train, y_train_pred)
train_rmse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Evaluate Testing Performance
test_mae = mean_absolute_error(y_test, y_test_pred)
test_rmse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

# Print results
print("Training Performance:")
print(f"MAE: {train_mae:.2f}, RMSE: {train_rmse:.2f}, R²: {train_r2:.4f}")

print("\nTesting Performance:")
print(f"MAE: {test_mae:.2f}, RMSE: {test_rmse:.2f}, R²: {test_r2:.4f}")


Training Performance:
MAE: 734.74, RMSE: 2574827.88, R²: 0.9729

Testing Performance:
MAE: 2368.15, RMSE: 23180446.00, R²: 0.8813


In [3]:
train_df = train_df.reset_index(drop=True)  # Reset index to avoid mismatches
y_test = y_test.reset_index(drop=True)  # Reset y_test index
y_pred = pd.Series(y_test_pred, index=y_test.index)  # Ensure y_pred index matches

# Create output DataFrame
output_df = pd.DataFrame({"Sales": y_test, "Predicted Sales": y_pred})

# Save to CSV
output_df.to_csv("predicted_sales.csv", index=False)

print("Predicted values saved successfully.")


Predicted values saved successfully.


In [4]:
# ✅ Save model correctly
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import joblib

joblib.dump(model, "random_forest_model.pkl")

['random_forest_model.pkl']