In [None]:
Use case: Predict the daily battery energy (Wh) required for a personalized powered wheelchair based on user anthropometry (age, height, weight, BMI) and expected daily usage (distance, terrain). This supports design sizing and battery selection for assistive mobility devices.


In [None]:
# Cell 1: Imports & settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib, os
# Seed NumPy's global random generator so later random draws are repeatable.
np.random.seed(42)
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Default figure size and font size applied to every plot created below.
plt.rcParams.update({'figure.figsize': (9,6), 'font.size': 12})


In [None]:
# Cell 2: Load the dataset, preferring the processed CSV over the raw one
processed_path = 'summative/linear_regression/processed_bmi.csv'
raw_path = 'data/bmi.csv'

use_processed = os.path.exists(processed_path)
if not use_processed and not os.path.exists(raw_path):
    # Neither candidate file is on disk, so there is nothing to load.
    raise FileNotFoundError('No dataset found. Put bmi.csv into data/ or run process_data.py to create processed file.')

if use_processed:
    df = pd.read_csv(processed_path)
    print('Loaded processed dataset:', processed_path)
else:
    # Fallback: the raw file lacks the engineered columns checked in later cells.
    print('Processed file not found. Loading raw file:', raw_path)
    df = pd.read_csv(raw_path)
    print('Raw loaded. You should run process_data.py to create processed_bmi.csv.')

# Quick sanity check of what was loaded.
print('Shape:', df.shape)
display(df.head())


In [None]:
# Cell 3: Confirm base columns, engineered columns and target
expected = ['Age','Height','Weight','Bmi','BmiClass']
for c in expected:
    if c not in df.columns:
        print('Warning: expected column missing:', c)

# Report which engineered columns are already present in the loaded frame.
engineered = ['daily_distance_km','terrain_factor','battery_Wh']
for c in engineered:
    print(c, 'in df:', c in df.columns)

# Summarise only the engineered columns that exist. Indexing all three while
# checking just for the target would raise KeyError on a partially processed file.
engineered_present = [c for c in engineered if c in df.columns]
if 'battery_Wh' in df.columns:
    display(df[engineered_present].describe().round(3))


In [None]:
# Cell 4: EDA - per-feature histograms followed by a correlation heatmap
numeric_cols = ['Age','Height','Weight','Bmi','daily_distance_km','terrain_factor','battery_Wh']
# Keep only the columns that exist in df (the engineered ones may not have been added yet).
numeric_available = list(filter(lambda col: col in df.columns, numeric_cols))
numeric_df = df[numeric_available]

# One histogram per available column on a 3x3 grid.
numeric_df.hist(bins=25, layout=(3,3), figsize=(12,10))
plt.suptitle('Feature distributions')
plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # leave headroom for the suptitle
plt.show()

# Pairwise correlations between the same columns.
corr_matrix = numeric_df.corr()
plt.figure(figsize=(8,6))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation heatmap')
plt.show()


In [None]:
# Cell 5: Feature engineering (runs only when an engineered column is missing)
# The distance and terrain columns are random draws, and the target is computed
# from them, so the dataset produced here is synthetic.
engineered_cols = ['daily_distance_km', 'terrain_factor', 'battery_Wh']
if any(col not in df.columns for col in engineered_cols):
    print('Adding engineered features (reproducible seed=42).')
    # Re-seed immediately before drawing so the result does not depend on
    # how many random numbers earlier cells consumed.
    np.random.seed(42)
    df = df.copy()
    df['daily_distance_km'] = np.round(np.random.uniform(1.0, 10.0, size=len(df)), 3)
    df['terrain_factor'] = np.random.choice([1.0,1.3,1.6], size=len(df), p=[0.55,0.35,0.10])
    # Target = Weight * 0.35 * terrain_factor * daily_distance_km.
    df['battery_Wh'] = df['Weight'] * 0.35 * df['terrain_factor'] * df['daily_distance_km']

    # to_csv does not create parent directories; on the raw-data path the
    # output folder may not exist yet, so create it before writing.
    out_path = 'summative/linear_regression/processed_bmi.csv'
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    df.to_csv(out_path, index=False)
    print('Engineered features added and saved to summative/linear_regression/processed_bmi.csv')


In [None]:
# Cell 6: Choose model inputs and the regression target
target = 'battery_Wh'
features = ['Age','Height','Weight','Bmi','daily_distance_km','terrain_factor']

# BmiClass is categorical and carries the same information as Bmi, so remove it if present.
has_bmi_class = 'BmiClass' in df.columns
if has_bmi_class:
    print('Dropping BmiClass (categorical, redundant with Bmi).')
    df = df.drop(columns=['BmiClass'])

# Work on copies so later edits to X / y do not touch df.
X, y = df[features].copy(), df[target].copy()
print('Features shape:', X.shape)


In [None]:
# Cell 7: Missing values & train-test split
print('Missing values per column:')
print(X.isna().sum())

# Split BEFORE imputing. Computing medians on the full X would let test-set
# values influence the training data (data leakage).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Impute with medians learned from the training rows only, and apply the same
# values to the test rows.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)
# Keep the full feature frame NaN-free for the later plotting cell, using the
# same training-derived medians.
X = X.fillna(train_medians)

print('Train size:', X_train.shape, 'Test size:', X_test.shape)


In [None]:
# Cell 8: Standardization
# Fit on the training split only; the test split is transformed with the
# training statistics.
scaler = StandardScaler().fit(X_train)
Xtr = scaler.transform(X_train)
Xte = scaler.transform(X_test)

# Show the per-feature mean and scale learned by the scaler.
scaler_info = pd.DataFrame({'feature': X_train.columns, 'mean': scaler.mean_, 'scale': scaler.scale_})
display(scaler_info.round(4))

# Save scaler for API. joblib.dump does not create missing folders, so make
# sure the output directory exists first.
scaler_dir = 'summative/linear_regression'
os.makedirs(scaler_dir, exist_ok=True)
joblib.dump(scaler, 'summative/linear_regression/scaler.pkl')
print('Saved scaler to summative/linear_regression/scaler.pkl')


In [None]:
# Cell 9: Linear Regression baseline (closed-form)
lr = LinearRegression().fit(Xtr, y_train)
y_pred_lr = lr.predict(Xte)

# Use the built-in round() rather than the .round() method: depending on the
# scikit-learn version the metric functions return a plain Python float, which
# has no .round attribute. round() works for both float and NumPy scalars.
print('LinearRegression Test MSE:', round(mean_squared_error(y_test, y_pred_lr), 4))
print('LinearRegression Test MAE:', round(mean_absolute_error(y_test, y_pred_lr), 4))
print('LinearRegression R2:', round(r2_score(y_test, y_pred_lr), 4))


In [None]:
# Cell 10: SGDRegressor (Gradient Descent) - collect loss curves
sgd = SGDRegressor(max_iter=1, tol=None, learning_rate='invscaling', eta0=0.01, random_state=42, warm_start=True)
n_epochs = 200
train_losses = []
test_losses = []
for epoch in range(n_epochs):
    # Each partial_fit call continues training from the current weights, so
    # the MSE recorded after every call traces the learning curve.
    sgd.partial_fit(Xtr, y_train)
    train_losses.append(mean_squared_error(y_train, sgd.predict(Xtr)))
    test_losses.append(mean_squared_error(y_test, sgd.predict(Xte)))

plt.plot(train_losses, label='Train MSE')
plt.plot(test_losses, label='Test MSE')
plt.xlabel('Epoch')
plt.ylabel('MSE')
plt.legend()
plt.title('SGDRegressor Loss Curves')
plt.show()

# Built-in round() works whether the stored loss is a Python float or a NumPy
# scalar; the .round() method exists only on the latter.
print('SGD final Test MSE:', round(test_losses[-1], 4))


In [None]:
# Cell 11: Decision Tree & Random Forest
# Both models are trained on the same standardized training matrix as the
# linear models so that every model can be evaluated on Xte.
dt = DecisionTreeRegressor(random_state=42).fit(Xtr, y_train)
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1).fit(Xtr, y_train)

y_pred_dt = dt.predict(Xte)
y_pred_rf = rf.predict(Xte)

# Built-in round() is used because the metric may be a plain Python float
# (no .round method) depending on the scikit-learn version.
print('DecisionTree Test MSE:', round(mean_squared_error(y_test, y_pred_dt), 4))
print('RandomForest Test MSE:', round(mean_squared_error(y_test, y_pred_rf), 4))


In [None]:
# Cell 12: Score every fitted model on the test split, pick the lowest MSE, persist artifacts
models = {'LinearRegression': lr, 'SGDRegressor': sgd, 'DecisionTree': dt, 'RandomForest': rf}

results = {}
for name, model in models.items():
    preds = model.predict(Xte)
    results[name] = dict(
        mse=mean_squared_error(y_test, preds),
        mae=mean_absolute_error(y_test, preds),
        r2=r2_score(y_test, preds),
    )

# One row per model, best (smallest MSE) first.
results_df = pd.DataFrame(results).T.sort_values('mse')
display(results_df.round(4))

best_name = results_df.index[0]
best_model = models[best_name]
print('Best model:', best_name)

# Save artifacts: winning model, the fitted scaler, and the metrics table.
outdir = 'summative/linear_regression'
os.makedirs(outdir, exist_ok=True)
model_file = os.path.join(outdir, 'best_model.pkl')
scaler_file = os.path.join(outdir, 'scaler.pkl')
metrics_file = os.path.join(outdir, 'model_metrics.csv')
joblib.dump(best_model, model_file)
joblib.dump(scaler, scaler_file)
results_df.to_csv(metrics_file)
print('Saved best_model.pkl, scaler.pkl and model_metrics.csv to', outdir)


In [None]:
# Cell 13: Scatter plots - before & after (Weight example)
plt.figure(figsize=(12,5))

# Left panel: raw Weight vs target, with the linear model's prediction as
# Weight varies and every other feature is held at its median.
plt.subplot(1,2,1)
plt.scatter(X['Weight'], y, alpha=0.6, s=20, color='orange')
# overlay linear fit line
w_vals = np.linspace(X['Weight'].min(), X['Weight'].max(), 100)
med = X.median()
X_line = pd.DataFrame({c: np.repeat(med[c], 100) for c in X.columns})
X_line['Weight'] = w_vals
# The linear model was trained on standardized inputs, so scale the grid too.
X_line_scaled = scaler.transform(X_line)
y_line = lr.predict(X_line_scaled)
plt.plot(w_vals, y_line, color='red', linewidth=2)
plt.xlabel('Weight (kg)'); plt.ylabel('battery_Wh'); plt.title('Weight vs battery_Wh + Linear fit')

# Right panel: best model's predictions against the true test values; the
# dashed diagonal marks perfect prediction.
plt.subplot(1,2,2)
preds = best_model.predict(Xte)
plt.scatter(y_test, preds, alpha=0.6, s=20, color='orange')
mn = min(y_test.min(), preds.min()); mx = max(y_test.max(), preds.max())
plt.plot([mn,mx],[mn,mx], color='red', linestyle='--')
plt.xlabel('Actual battery_Wh'); plt.ylabel('Predicted battery_Wh'); plt.title('Predicted vs Actual')
plt.tight_layout()

# Save plot files for README/video.
# This must happen BEFORE plt.show(): once the figure has been shown, a later
# savefig() no longer targets it and writes a blank image.
plot_dir = 'summative/linear_regression'
os.makedirs(plot_dir, exist_ok=True)
plt.savefig(os.path.join(plot_dir, 'scatter_before_after.png'))
plt.show()


In [None]:
# Cell 14: Final test metrics & save test predictions
preds = best_model.predict(Xte)

# Persist actual vs predicted values for the held-out rows.
final_df = pd.DataFrame({'actual': y_test.values, 'predicted': preds})
final_df.to_csv(os.path.join('summative/linear_regression','test_predictions.csv'), index=False)

# Built-in round() is used because the metric may be a plain Python float
# (which has no .round method) depending on the scikit-learn version.
print('Final Test MSE:', round(mean_squared_error(y_test, preds), 4))
print('Final Test MAE:', round(mean_absolute_error(y_test, preds), 4))
print('R2 score:', round(r2_score(y_test, preds), 4))
display(final_df.head())

# Prediction function for API
def predict_from_dict(input_dict, scaler_path=os.path.join('summative','linear_regression','scaler.pkl'), model_path=os.path.join('summative','linear_regression','best_model.pkl')):
    """Predict battery_Wh for a single input record.

    Args:
        input_dict: Mapping that contains at least the keys 'Age', 'Height',
            'Weight', 'Bmi', 'daily_distance_km' and 'terrain_factor'.
            Extra keys are ignored because only those columns are selected.
        scaler_path: Path of the pickled scaler to load with joblib.
        model_path: Path of the pickled model to load with joblib.

    Returns:
        The model's prediction for the record, converted to a Python float.

    Note:
        Both artifacts are re-loaded from disk on every call.
    """
    feature_order = ['Age','Height','Weight','Bmi','daily_distance_km','terrain_factor']
    fitted_scaler = joblib.load(scaler_path)
    fitted_model = joblib.load(model_path)
    # Single-row frame, with columns put in the order listed in feature_order.
    record = pd.DataFrame([input_dict])[feature_order]
    prediction = fitted_model.predict(fitted_scaler.transform(record))
    return float(prediction[0])

# Smoke test: push the first test-set row through the saved scaler and model
first_test_row = X_test.iloc[0]
sample = first_test_row.to_dict()
print('Sample input (test):', sample)
sample_prediction = predict_from_dict(sample)
print('Predicted battery_Wh (sample):', sample_prediction)


---
**Notebook complete.**

Files saved to `summative/linear_regression/`:
- `best_model.pkl`
- `scaler.pkl`
- `model_metrics.csv`
- `test_predictions.csv`
- `scatter_before_after.png`
- `sgd_loss_curves.png` (not written automatically; save the Cell 10 loss-curve figure yourself if you need it)

Include this notebook (or the training script) in your repo under `summative/linear_regression/` for submission.
