In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math


import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import pynvml
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error,
    mean_squared_log_error,
    mean_absolute_error,
    r2_score
)

In [2]:
# Read the training and test splits from the local data/ directory.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))

In [4]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [5]:
# Inspect column dtypes, non-null counts and memory usage of the training data.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [3]:
def apply_feature_engineering(df):
    """Return a copy of ``df`` with ``Sex`` encoded and derived workout features added.

    The input frame is never modified, so the function can be called
    repeatedly on the same raw frame (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Sex`` (``'male'`` / ``'female'``), ``Age``,
        ``Height``, ``Weight``, ``Duration`` and ``Heart_Rate``. Any other
        columns are carried over unchanged.
        NOTE(review): ``Height`` is divided by 100 before squaring for BMI, so
        it is presumably in centimetres and ``Weight`` in kilograms -- confirm.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``Sex`` replaced by 0 (male) / 1 (female) and the
        columns ``BMI``, ``Max_HR``, ``Intensity_Percent``,
        ``Intensity_Level_Num``, ``Age_Group``, ``Effort`` and
        ``Weight_to_Height`` appended, in that order.

    Raises
    ------
    ValueError
        If ``Sex`` contains anything other than ``'male'`` / ``'female'``
        (including missing values or an already-encoded column). Ages outside
        the interval (0, 100] fall outside every age bin and make the integer
        cast of ``Age_Group`` fail as well.
    """
    # Work on a copy so the caller's frame is left untouched.
    df = df.copy()

    # Encode Sex as an integer and fail loudly on anything unexpected instead
    # of letting the integer cast raise an opaque NaN-conversion error.
    sex_codes = df["Sex"].map({"male": 0, "female": 1})
    unknown_sex = sex_codes.isna()
    if unknown_sex.any():
        bad_values = df.loc[unknown_sex, "Sex"].unique().tolist()
        raise ValueError(
            f"'Sex' must be 'male' or 'female'; found unexpected values: {bad_values}"
        )
    df["Sex"] = sex_codes.astype(int)

    # 1. BMI: body mass index (weight divided by squared height / 100).
    df["BMI"] = df["Weight"] / ((df["Height"] / 100) ** 2)

    # 2. Max_HR: age-based maximum heart rate estimate (220 - age).
    df["Max_HR"] = 220 - df["Age"]

    # 3. Intensity_Percent: heart rate as a percentage of Max_HR.
    df["Intensity_Percent"] = (df["Heart_Rate"] / df["Max_HR"]) * 100

    # 4. Intensity_Level_Num: intensity bucket 0-3 with lower-inclusive edges:
    #    <50 -> 0, [50, 70) -> 1, [70, 85) -> 2, >=85 (or NaN) -> 3.
    #    np.digitize is the vectorised equivalent of a row-wise if/elif chain.
    df["Intensity_Level_Num"] = np.digitize(
        df["Intensity_Percent"].to_numpy(), [50, 70, 85]
    ).astype("int64")

    # 5. Age_Group: age bucket 0-4 over the right-inclusive bins
    #    (0, 18], (18, 30], (30, 45], (45, 60], (60, 100].
    df["Age_Group"] = pd.cut(
        df["Age"],
        bins=[0, 18, 30, 45, 60, 100],
        labels=[0, 1, 2, 3, 4]
    ).astype(int)

    # 6. Effort: heart rate x duration.
    df["Effort"] = df["Heart_Rate"] * df["Duration"]

    # 7. Weight_to_Height: weight / height ratio.
    df["Weight_to_Height"] = df["Weight"] / df["Height"]

    return df

In [6]:
# Build the engineered feature frames for the training and test data.
df, df_test = (apply_feature_engineering(frame) for frame in (train, test))

In [7]:
# Inspect the columns and dtypes of the frame returned by apply_feature_engineering.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   750000 non-null  int64  
 1   Sex                  750000 non-null  int32  
 2   Age                  750000 non-null  int64  
 3   Height               750000 non-null  float64
 4   Weight               750000 non-null  float64
 5   Duration             750000 non-null  float64
 6   Heart_Rate           750000 non-null  float64
 7   Body_Temp            750000 non-null  float64
 8   Calories             750000 non-null  float64
 9   BMI                  750000 non-null  float64
 10  Max_HR               750000 non-null  int64  
 11  Intensity_Percent    750000 non-null  float64
 12  Intensity_Level_Num  750000 non-null  int64  
 13  Age_Group            750000 non-null  int32  
 14  Effort               750000 non-null  float64
 15  Weight_to_Height 

In [8]:
# Target is Calories; the feature matrix is every remaining column.
# NOTE(review): the 'id' column is still part of X here -- confirm it is
# really meant to be a model input.
y = df['Calories']
X = df.drop(columns=['Calories'])

# Hold out 20% of the rows for evaluation, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Sanity-check the shapes of the resulting splits.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (600000, 15)
X_test shape: (150000, 15)
y_train shape: (600000,)
y_test shape: (150000,)


In [10]:
# Hyperparameter grid for CatBoost: 3 * 3 * 4 * 2 = 72 candidates.
params = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5, 7],
    'iterations': [100, 200]
}

# Base CatBoost estimator that the search clones for every candidate.
catboost_model = CatBoostRegressor(
    loss_function='RMSE',
    random_seed=42,
    verbose=False
)

# Exhaustive 5-fold grid search (72 candidates -> 360 fits).
# n_jobs=1: CatBoost is itself multi-threaded, so running the fits one at a
# time avoids stacking process-level parallelism on top of CatBoost's own
# threads (CPU oversubscription plus one copy of the data per worker).
grid_search = GridSearchCV(
    estimator=catboost_model,
    param_grid=params,
    cv=5,
    scoring='neg_mean_squared_error',
    n_jobs=1,
    verbose=1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Show the best parameters; the score is a negated MSE, so convert it to RMSE.
print("Best parameters:", grid_search.best_params_)
print("Best score:", np.sqrt(-grid_search.best_score_))

# Model trained with the best parameters. With the default refit=True,
# GridSearchCV has already refitted the winning configuration on all of
# X_train, so reuse that fitted estimator instead of building an unfitted one.
best_catboost = grid_search.best_estimator_

Fitting 5 folds for each of 72 candidates, totalling 360 fits


KeyboardInterrupt: 

In [12]:
# Baseline CatBoost regressor with hand-picked hyperparameters.
catboost_params = dict(
    iterations=500,
    depth=6,
    learning_rate=0.05,
    l2_leaf_reg=3,
    loss_function='RMSE',
    random_seed=42,
    verbose=100,
)
catboost = CatBoostRegressor(**catboost_params)

# Train on the training split, logging progress every 100 iterations.
catboost.fit(X_train, y_train, verbose=100)

# Score the model on the held-out split.
y_pred = catboost.predict(X_test)

# Error metrics: RMSE is derived from the MSE, alongside MAE and R^2.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2 Score: {r2:.4f}")



0:	learn: 59.4917416	total: 45ms	remaining: 22.4s
100:	learn: 4.1934112	total: 4.22s	remaining: 16.7s
200:	learn: 3.7426326	total: 8.13s	remaining: 12.1s
300:	learn: 3.6429225	total: 12s	remaining: 7.91s
400:	learn: 3.5878608	total: 15.8s	remaining: 3.9s
499:	learn: 3.5496729	total: 19.6s	remaining: 0us
RMSE: 3.64
MAE: 2.22
R2 Score: 0.9966


In [13]:
# Visualize the feature importances of the fitted CatBoost model.
feature_importance = catboost.feature_importances_
feature_names = X_train.columns

# Sort the features in descending order of importance.
sorted_idx = np.argsort(feature_importance)[::-1]

# Draw the sorted importances as a horizontal bar chart; creating the figure
# alone renders an empty canvas with no axes.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names[sorted_idx], feature_importance[sorted_idx])
ax.invert_yaxis()  # barh draws the first item at the bottom; put the most important on top
ax.set(
    title="CatBoost feature importance",
    xlabel="Importance",
    ylabel="Feature",
)
fig.tight_layout()
plt.show()

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

In [15]:

# Predict on the test data using exactly the columns the model was trained on,
# in training order, instead of relying on df_test happening to match them.
test_features = df_test[X_train.columns]

# Calories burned cannot be negative, so clamp any negative prediction to 0.
test_predictions = np.clip(catboost.predict(test_features), 0, None)

# Build the submission frame: one predicted Calories value per test id.
submission = pd.DataFrame({
    'id': test['id'],
    'Calories': test_predictions
})

# Save the submission as CSV without the index column.
submission.to_csv('submission.csv', index=False)

print("Submission dosyası oluşturuldu. İlk 5 satır:")
print(submission.head())

# Summary statistics of the predicted Calories values.
print("\nTahmin edilen Calories değerlerinin istatistikleri:")
print(f"Ortalama: {submission['Calories'].mean():.2f}")
print(f"Minimum: {submission['Calories'].min():.2f}")
print(f"Maksimum: {submission['Calories'].max():.2f}")
print(f"Standart Sapma: {submission['Calories'].std():.2f}")

Submission dosyası oluşturuldu. İlk 5 satır:
       id    Calories
0  750000   26.814385
1  750001  106.223898
2  750002   88.678968
3  750003  127.053582
4  750004   75.369597

Tahmin edilen Calories değerlerinin istatistikleri:
Ortalama: 88.24
Minimum: 0.47
Maksimum: 301.88
Standart Sapma: 62.27
