In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error



In [4]:


# 1. Read both competition CSVs (train first, then test)
train, test = map(pd.read_csv, ('train.csv', 'test.csv'))

# 2. Schema / dtypes, then summary statistics of the numeric columns
print("=== TRAIN INFO ===")
train.info()  # info() writes straight to stdout and returns None, so it is not wrapped in print
print("\n=== TRAIN DESCRIBE ===", train.describe(), sep="\n")

# 3. First few rows
print("\n=== TRAIN HEAD ===", train.head(), sep="\n")

# 4. Class balance of the only categorical column
sex_counts = train['Sex'].value_counts()
print("\n=== SEX VALUE COUNTS ===", sex_counts, sep="\n")


=== TRAIN INFO ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB

=== TRAIN DESCRIBE ===
                  id            Age         Height         Weight  \
count  750000.000000  750000.000000  750000.000000  750000.000000   
mean   374999.500000      41.420404     174.697685      75.145668   
std    216506.495284      15.175049      12.824496      13.982704   
min         0.000000      20.000000     126.00

In [5]:


def keytel_calories(row):
    """Estimate the calories burned in one workout from a heart-rate regression.

    Two linear equations are used, one when ``row['Sex']`` is exactly
    ``'male'`` and another for every other value. Each combines heart rate,
    weight and age into a per-minute energy rate, which is divided by 4.184
    (kJ per kcal) and multiplied by the workout duration.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys ``'Sex'``, ``'Heart_Rate'``,
        ``'Weight'``, ``'Age'`` and ``'Duration'`` (for example a DataFrame
        row passed by ``DataFrame.apply(..., axis=1)``).
        NOTE(review): the coefficients look like the Keytel et al. equations,
        which would imply bpm, kg, years and minutes -- confirm against the
        dataset description.

    Returns
    -------
    float
        Estimated calories for the whole workout. The value is NOT clipped and
        can be negative (e.g. low heart rate combined with high weight in the
        non-male equation); callers must clip if they need non-negative output.
    """
    heart_rate, weight, age, minutes = (
        row['Heart_Rate'], row['Weight'], row['Age'], row['Duration']
    )

    # (intercept, heart-rate, weight, age) coefficients of the selected equation.
    if row['Sex'] == 'male':
        intercept, hr_coef, weight_coef, age_coef = -55.0969, 0.6309, 0.1988, 0.2017
    else:
        intercept, hr_coef, weight_coef, age_coef = -20.4022, 0.4472, -0.1263, 0.074

    energy_rate = intercept + hr_coef * heart_rate + weight_coef * weight + age_coef * age
    return energy_rate / 4.184 * minutes

# Row-wise baseline estimate for every training example
train['Calories_baseline'] = train.apply(keytel_calories, axis='columns')

# Sanity check: actual vs. estimated calories for the first rows, side by side
comparison_cols = ['Calories', 'Calories_baseline']
print(train[comparison_cols].head())


   Calories  Calories_baseline
0     150.0         200.013576
1      34.0          28.236711
2      29.0          21.504254
3     140.0         197.619503
4     146.0         121.414316


In [7]:

# Floor the baseline at zero: the regression can produce negative estimates,
# and the log-based metric below is not defined for negative predictions.
y_true = train['Calories']
y_pred = train['Calories_baseline'].clip(lower=0)

# RMSLE is the square root of the mean squared logarithmic error
msle = mean_squared_log_error(y_true, y_pred)
rmsle = np.sqrt(msle)
print(f"Baseline RMSLE: {rmsle:.4f}")


Baseline RMSLE: 0.3282


In [11]:
# 1) Build the submission DataFrame from the heart-rate baseline
sub_df = pd.read_csv("test.csv")

# keytel_calories can return negative values. Clip at zero exactly as the
# training evaluation does, so the submitted predictions are the ones the
# reported RMSLE describes and the log-based metric stays defined.
preds = sub_df.apply(keytel_calories, axis=1).clip(lower=0)

submission = pd.DataFrame({
    'id':       sub_df['id'],   # row identifiers read from test.csv
    'Calories': preds,          # clipped baseline estimate, one value per test row
})

# 2) Write it out
submission.to_csv('submission.csv', index=False)
print("Wrote submission.csv with", len(submission), "rows.")


Wrote submission.csv with 250000 rows.
