In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math


import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import pynvml
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error,
    mean_squared_log_error,
    mean_absolute_error,
    r2_score
)

In [2]:
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
train.head()

Unnamed: 0,id,Sex,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,0,male,36,189.0,82.0,26.0,101.0,41.0,150.0
1,1,female,64,163.0,60.0,8.0,85.0,39.7,34.0
2,2,female,51,161.0,64.0,7.0,84.0,39.8,29.0
3,3,male,20,192.0,90.0,25.0,105.0,40.7,140.0
4,4,female,38,166.0,61.0,25.0,102.0,40.6,146.0


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   id          750000 non-null  int64  
 1   Sex         750000 non-null  object 
 2   Age         750000 non-null  int64  
 3   Height      750000 non-null  float64
 4   Weight      750000 non-null  float64
 5   Duration    750000 non-null  float64
 6   Heart_Rate  750000 non-null  float64
 7   Body_Temp   750000 non-null  float64
 8   Calories    750000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 51.5+ MB


In [5]:
def apply_feature_engineering(df):
    # Mevcut özellikler...
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    
    # 1. BMI: Vücut kitle endeksi
    df["BMI"] = df["Weight"] / ((df["Height"] / 100) ** 2)
    
    # 2. Max_HR: Maksimum kalp atış hızı
    df["Max_HR"] = 220 - df["Age"]
    
    # 3. Intensity_Percent: Egzersiz yoğunluk yüzdesi
    df["Intensity_Percent"] = (df["Heart_Rate"] / df["Max_HR"]) * 100
    
    # 4. Intensity_Level_Num: Yoğunluk düzeyi sayısal (0–3)
    def intensity_level(percent):
        if percent < 50:
            return 0
        elif percent < 70:
            return 1
        elif percent < 85:
            return 2
        else:
            return 3
    df["Intensity_Level_Num"] = df["Intensity_Percent"].apply(intensity_level)
    
    # 5. Age_Group_Num: Yaş grubu
    df["Age_Group"] = pd.cut(
        df["Age"],
        bins=[0, 18, 30, 45, 60, 100],
        labels=[0, 1, 2, 3, 4]
    ).astype(int)
    
    # 6. Effort: Nabız × Süre
    df["Effort"] = df["Heart_Rate"] * df["Duration"]
    
    # 7. Weight_to_Height: Kilo / Boy
    df["Weight_to_Height"] = df["Weight"] / df["Height"]

    return df

In [6]:
df = apply_feature_engineering(train)
df_test = apply_feature_engineering(test)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 16 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   id                   750000 non-null  int64  
 1   Sex                  750000 non-null  int32  
 2   Age                  750000 non-null  int64  
 3   Height               750000 non-null  float64
 4   Weight               750000 non-null  float64
 5   Duration             750000 non-null  float64
 6   Heart_Rate           750000 non-null  float64
 7   Body_Temp            750000 non-null  float64
 8   Calories             750000 non-null  float64
 9   BMI                  750000 non-null  float64
 10  Max_HR               750000 non-null  int64  
 11  Intensity_Percent    750000 non-null  float64
 12  Intensity_Level_Num  750000 non-null  int64  
 13  Age_Group            750000 non-null  int32  
 14  Effort               750000 non-null  float64
 15  Weight_to_Height 

In [8]:
# Define features (X) and target (y)
X = df.drop(['Calories'], axis=1)
y = df['Calories']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes to verify
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (600000, 15)
X_test shape: (150000, 15)
y_train shape: (600000,)
y_test shape: (150000,)


In [13]:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

# Verileri standartlaştır
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Neural Network modeli
nn_model = MLPRegressor(
    hidden_layer_sizes=(100, 50),
    activation='relu',
    solver='adam',
    alpha=0.0001,
    max_iter=500,
    random_state=42,
    early_stopping=True,
    validation_fraction=0.1
)

nn_model.fit(X_train_scaled, y_train)

nn_pred = nn_model.predict(X_test_scaled)
nn_rmse = np.sqrt(mean_squared_error(y_test, nn_pred))
nn_mae = mean_absolute_error(y_test, nn_pred)
nn_r2 = r2_score(y_test, nn_pred)

print(f"Neural Network RMSE: {nn_rmse:.2f}")
print(f"Neural Network MAE: {nn_mae:.2f}")
print(f"Neural Network R2 Score: {nn_r2:.4f}")

Neural Network RMSE: 3.63
Neural Network MAE: 2.15
Neural Network R2 Score: 0.9966


In [14]:
# Test verileri için standartlaştırma (aynı scaler'ı kullanarak)
df_test_scaled = scaler.transform(df_test)

# Neural Network modeli ile tahminler
nn_test_predictions = nn_model.predict(df_test_scaled)

# Submission dosyasını oluştur
nn_submission = pd.DataFrame({
    'id': test['id'],
    'Calories': nn_test_predictions
})

# CSV dosyasını kaydet
nn_submission.to_csv('nn_submission.csv', index=False)

print(f"Neural Network submission dosyası oluşturuldu. İlk 5 satır:")
print(nn_submission.head())

print("\nTahmin edilen Calories değerlerinin istatistikleri:")
print(f"Ortalama: {nn_submission['Calories'].mean():.2f}")
print(f"Minimum: {nn_submission['Calories'].min():.2f}")
print(f"Maksimum: {nn_submission['Calories'].max():.2f}")
print(f"Standart Sapma: {nn_submission['Calories'].std():.2f}")

Neural Network submission dosyası oluşturuldu. İlk 5 satır:
       id    Calories
0  750000   27.178095
1  750001  107.506740
2  750002   87.931695
3  750003  125.455286
4  750004   76.621422

Tahmin edilen Calories değerlerinin istatistikleri:
Ortalama: 88.29
Minimum: -1.88
Maksimum: 309.76
Standart Sapma: 62.18


In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
# Random Forest için parametre grid'i tanımla
rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5],
    'max_features': ['sqrt', 'log2']
}

# Base Random Forest modelini oluştur
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)

# GridSearchCV nesnesini oluştur
rf_grid = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    cv=5,
    n_jobs=-1,
    scoring='neg_root_mean_squared_error',
    verbose=2
)

# Grid search'ü gerçekleştir
rf_grid.fit(X_train[:2000], y_train[:2000])

# En iyi parametreleri ve skoru yazdır
print("\nEn iyi parametreler:", rf_grid.best_params_)
print("En iyi RMSE:", (-rf_grid.best_score_))

# En iyi model ile tahmin yap
best_rf = rf_grid.best_estimator_
rf_pred = best_rf.predict(X_test)

# Performans metriklerini hesapla
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred))
rf_mae = mean_absolute_error(y_test, rf_pred)
rf_r2 = r2_score(y_test, rf_pred)

print("\nTest seti sonuçları:")
print(f"Random Forest RMSE: {rf_rmse:.2f}")
print(f"Random Forest MAE: {rf_mae:.2f}")
print(f"Random Forest R2 Score: {rf_r2:.4f}")

# Özellik önemlerini görselleştir
feature_importance = best_rf.feature_importances_
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5

plt.figure(figsize=(12, 6))
plt.barh(pos, feature_importance[sorted_idx])
plt.yticks(pos, X_train.columns[sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Random Forest Feature Importance')
plt.tight_layout()
plt.show()

NameError: name 'X_train' is not defined

In [1]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

models = {
    'RandomForest': RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=3,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    ),
    
    # 'GradientBoosting': GradientBoostingRegressor(
    #     n_estimators=200,
    #     learning_rate=0.1,
    #     max_depth=5,
    #     random_state=42
    # ),
    
    'XGBoost': xgb.XGBRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1
    ),
    
    'LightGBM': lgb.LGBMRegressor(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    ),
    
    'CatBoost': cb.CatBoostRegressor(
        iterations=200,
        learning_rate=0.1,
        depth=5,
        verbose=0,
        random_state=42
    )
}