In [1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import xgboost as xgb
import lightgbm as lgb
import catboost as cb


In [2]:
!dir data

 Volume in drive F is Big Disk
 Volume Serial Number is 078C-BC10

 Directory of f:\code\My GitHub\predict_calorie_expenditure\data

07/05/2025  15:38    <DIR>          .
07/05/2025  15:38    <DIR>          ..
07/05/2025  16:59         3,971,843 pred.csv
06/05/2025  09:57         3,500,012 sample_submission.csv
06/05/2025  09:57        10,524,308 test.csv
06/05/2025  09:57        35,463,548 train.csv
               4 File(s)     53,459,711 bytes
               2 Dir(s)  249,881,739,264 bytes free


In [3]:

df = pd.read_csv('data\\train.csv')

df.head()


Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [5]:

df.isnull().sum()


id            0
Sex           0
Age           0
Height        0
Weight        0
Duration      0
Heart_Rate    0
Body_Temp     0
Calories      0
dtype: int64

In [6]:
df['Sex'] = df['Sex'].map({'male': 1, 'female': 0})


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  int64  
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(3)
memory usage: 51.5 MB


In [8]:
# Sex kolonunu düşelim (boş çünkü), id de modele katkı vermez
df = df.drop(columns=['id'])

# Özellikler ve hedef
X = df.drop(columns='Calories')
y = df['Calories']


In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

models = {
    'RandomForest': RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=3,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    ),
    
    # 'GradientBoosting': GradientBoostingRegressor(
    #     n_estimators=200,
    #     learning_rate=0.1,
    #     max_depth=5,
    #     random_state=42
    # ),
    
    'XGBoost': xgb.XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1
    ),
    
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    ),
    
    'CatBoost': cb.CatBoostRegressor(
        iterations=200,
        learning_rate=0.1,
        depth=5,
        verbose=0,
        random_state=42
    )
}


# rf.fit(X_train, y_train)


# # Tahmin
# y_pred = rf.predict(X_test)

# # Değerlendirme
# print("RMSE:", mean_squared_error(y_test, y_pred, squared=False))
# print("R2 Score:", r2_score(y_test, y_pred))


In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Sonuçları saklamak için
results = {}

for name, model in models.items():
    print(f"🔧 Eğitim başlıyor: {name}")
    
    # Modeli eğit
    model.fit(X_train, y_train)
    
    # Tahmin yap
    y_pred = model.predict(X_test)
    
    # Değerlendirme metrikleri
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    # Sonuçları kaydet
    results[name] = {
        'MAE': mae,
        'RMSE': rmse,
        'R2 Score': r2
    }
    
    print(f"{name} tamamlandı ✅\n")


🔧 Eğitim başlıyor: RandomForest
RandomForest tamamlandı ✅

🔧 Eğitim başlıyor: XGBoost
XGBoost tamamlandı ✅

🔧 Eğitim başlıyor: LightGBM
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015685 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 360
[LightGBM] [Info] Number of data points in the train set: 600000, number of used features: 7
[LightGBM] [Info] Start training from score 88.298465
LightGBM tamamlandı ✅

🔧 Eğitim başlıyor: CatBoost
CatBoost tamamlandı ✅



In [16]:
pd.DataFrame(results).round(4)

Unnamed: 0,RandomForest,XGBoost,LightGBM,CatBoost
MAE,3.473,2.3015,2.3085,2.324
RMSE,29.2885,13.8378,13.9264,13.8949
R2 Score,0.9924,0.9964,0.9964,0.9964


In [17]:
test_df = pd.read_csv('data/test.csv')
df.head()


Unnamed: 0,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,1,36,189.0,82.0,26.0,101.0,41.0,150.0
1,0,64,163.0,60.0,8.0,85.0,39.7,34.0
2,0,51,161.0,64.0,7.0,84.0,39.8,29.0
3,1,20,192.0,90.0,25.0,105.0,40.7,140.0
4,0,38,166.0,61.0,25.0,102.0,40.6,146.0


In [18]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          250000 non-null  int64  
 1   Sex         250000 non-null  object 
 2   Age         250000 non-null  int64  
 3   Height      250000 non-null  float64
 4   Weight      250000 non-null  float64
 5   Duration    250000 non-null  float64
 6   Heart_Rate  250000 non-null  float64
 7   Body_Temp   250000 non-null  float64
dtypes: float64(5), int64(2), object(1)
memory usage: 15.3+ MB


In [19]:
test_df['Sex'] = test_df['Sex'].map({'male': 1, 'female': 0})


In [20]:
ids = test_df['id']
test_df = test_df.drop(columns=['id'], errors='ignore')
test_predictions = models["XGBoost"].predict(test_df)


In [21]:
submission = pd.DataFrame({
    'id': ids,
    'Calories': test_predictions
})


In [22]:
submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   id        250000 non-null  int64  
 1   Calories  250000 non-null  float32
dtypes: float32(1), int64(1)
memory usage: 2.9 MB


In [None]:
print("Negative values in Calories:", (submission['Calories'] < 0).sum())
if (submission['Calories'] < 0).sum() > 0:
    print("\nNegative values before fix:")
    print(submission[submission['Calories'] < 0])
    # Take absolute value of negative predictions
    submission.loc[submission['Calories'] < 0, 'Calories'] = submission.loc[submission['Calories'] < 0, 'Calories'].abs()
    print("\nFixed negative values")

In [25]:
print("Negative values in Calories:", (submission['Calories'] < 0).sum())
if (submission['Calories'] < 0).sum() > 0:
    print("\nNegative values before fix:")
    print(submission[submission['Calories'] < 0])

Negative values in Calories: 0


In [None]:
test_predictions = models['XGBoost'].predict(test_df)


In [26]:

submission.to_csv('submission.csv', index=False)