In [1]:
import pandas as pd
import numpy as np
import gcsfs
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import r2_score

import sys
sys.path.append("../models")

from baseline import GroupMeanRegressor

In [2]:
import os

# Location of the service-account JSON key, relative to the notebook's
# working directory. Change this if the key lives somewhere else.
SERVICE_ACCOUNT_PATH = "../secrets/service_account.json"

# Export the key location through the process environment.
# NOTE(review): this cell only records the path; it never opens or validates
# the key file, so the message printed below is not proof that authentication
# actually succeeded.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": SERVICE_ACCOUNT_PATH})

print("Authenticated using service account.")

Authenticated using service account.


In [3]:
# Storage configuration: the bucket name and the object paths (inside that
# bucket) of the training and validation CSV files.
BUCKET = "group_project2"
TRAIN_PATH = "inputs/train.csv"
VAL_PATH = "inputs/val.csv"

print(f"Bucket: {BUCKET}")

Bucket: group_project2


In [4]:
# Shared GCS filesystem handle; read_gcs_csv below opens files through it.
# Built with default arguments — presumably it picks up the key path exported
# via GOOGLE_APPLICATION_CREDENTIALS earlier in the notebook (TODO confirm).
fs = gcsfs.GCSFileSystem()

def read_gcs_csv(bucket, filepath, filesystem=None):
    """Load a CSV object from a bucket into a pandas DataFrame.

    Parameters
    ----------
    bucket : str
        Bucket name; it is joined to ``filepath`` with a single ``/``.
    filepath : str
        Object path inside the bucket.
    filesystem : object, optional
        Any object exposing ``open(path)`` that returns a file-like context
        manager. When omitted, the notebook-level ``fs`` handle is used, so
        existing two-argument calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.
    """
    # Look up the notebook-level handle only when no filesystem was supplied,
    # so the function can be reused (or exercised with a stub) without live GCS.
    source_fs = fs if filesystem is None else filesystem
    full_path = f"{bucket}/{filepath}"
    print("Loading:", full_path)
    with source_fs.open(full_path) as f:
        return pd.read_csv(f)

# Fetch both splits from the bucket, training file first, then validation.
train, val = (read_gcs_csv(BUCKET, path) for path in (TRAIN_PATH, VAL_PATH))

# Preview the first rows of each split as the cell's output.
train.head(), val.head()


Loading: group_project2/inputs/train.csv
Loading: group_project2/inputs/val.csv


(   crop_year  annual_rainfall   fertilizer  pesticide  crop_Arhar/Tur  \
 0       1997           1852.9     26457.26      86.18               0   
 1       1997           1852.9   1187816.77    3869.11               0   
 2       1997           1852.9   6592521.07   21474.01               0   
 3       1997           1852.9   1588767.98    5175.14               0   
 4       1997           1852.9  34962983.58  113885.94               0   
 
    crop_Bajra  crop_Banana  crop_Barley  crop_Black pepper  crop_Cardamom  \
 0           0            0            0                  0              0   
 1           0            0            0                  0              0   
 2           0            0            0                  0              0   
 3           0            0            0                  0              0   
 4           0            0            0                  0              0   
 
    ...  state_Puducherry  state_Punjab  state_Sikkim  state_Tamil Nadu  \
 0  ...  

In [6]:
# Name of the column the baseline is asked to predict.
target = "yield"

# The baseline groups on a single column: whichever one comes first in the
# training frame.
first_feature = train.columns[0]

# Single-column feature frames and target series for both splits.
X_train, X_val = (frame[[first_feature]] for frame in (train, val))
y_train, y_val = (frame[target] for frame in (train, val))

print(f"Using baseline grouping column: {first_feature}")

Using baseline grouping column: crop_year


In [7]:
# Fit the project's GroupMeanRegressor on the single grouping column.
# NOTE(review): the class lives in ../models/baseline.py, which is not visible
# here; presumably it stores one mean target per group plus an overall mean —
# confirm against the implementation.
baseline = GroupMeanRegressor()
baseline.fit(X_train, y_train)

# Report the size of the fitted group_means_ attribute and the value of
# global_mean_ as a quick sanity check of the fit.
print("Group means learned:", len(baseline.group_means_))
print("Global mean:", baseline.global_mean_)

Group means learned: 17
Global mean: 78.63793723849258


In [9]:
def _score_predictions(y_true, y_pred):
    """Return the (MAE, RMSE, R²) triple for one set of predictions."""
    return (
        mean_absolute_error(y_true, y_pred),
        mean_squared_error(y_true, y_pred) ** 0.5,  # RMSE = square root of MSE
        r2_score(y_true, y_pred),
    )


# Baseline predictions for each split.
train_pred = baseline.predict(X_train)
val_pred = baseline.predict(X_val)

# Score both splits with the same three metrics.
baseline_train_mae, baseline_train_rmse, baseline_train_r2 = _score_predictions(
    y_train, train_pred
)
baseline_val_mae, baseline_val_rmse, baseline_val_r2 = _score_predictions(
    y_val, val_pred
)


In [10]:
# One row per metric, with the train and validation scores side by side.
results = pd.DataFrame(
    [
        ("MAE", baseline_train_mae, baseline_val_mae),
        ("RMSE", baseline_train_rmse, baseline_val_rmse),
        ("R²", baseline_train_r2, baseline_val_r2),
    ],
    columns=["Metric", "Train", "Validation"],
)

results


Unnamed: 0,Metric,Train,Validation
0,MAE,147.936847,164.207458
1,RMSE,853.749684,1026.933717
2,R²,7.6e-05,-0.000195


In [12]:
# Plain-text recap of the validation scores, one metric per line.
validation_report = (
    ("Baseline Validation MAE:", baseline_val_mae),
    ("Baseline Validation RMSE:", baseline_val_rmse),
    ("Baseline Validation R²:", baseline_val_r2),
)
for caption, score in validation_report:
    print(caption, score)

Baseline Validation MAE: 164.20745777362305
Baseline Validation RMSE: 1026.9337169058338
Baseline Validation R²: -0.00019508278540292245


### Random Forest Validation Metrics

TRAIN MAE: 3.0755917657250995
VAL MAE:   17.16689511973502