# BTK DATATHON-2025

In [90]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

In [91]:
# Resolve the data directory once instead of repeating an absolute path per file.
# Set the BTK_DATA_DIR environment variable to run the notebook on another machine;
# the fallback is the folder the notebook was originally developed against.
DATA_DIR = Path(os.environ.get("BTK_DATA_DIR", r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data"))

df_train = pd.read_csv(DATA_DIR / "train.csv")
df_test = pd.read_csv(DATA_DIR / "test.csv")
df_submission = pd.read_csv(DATA_DIR / "sample_submission.csv")

In [92]:
# Preview the raw training events (the rendered output elides the middle rows).
df_train

Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session,session_value
0,2025-06-19 10:23:07+00:00,ADD_CART,PROD_011223,CAT_00054,USER_097562,SESSION_158779,90.29
1,2025-06-07 21:34:45+00:00,ADD_CART,PROD_005519,CAT_00144,USER_006535,SESSION_029987,16.39
2,2025-06-21 21:29:09+00:00,ADD_CART,PROD_000577,CAT_00273,USER_047199,SESSION_022134,64.27
3,2025-06-09 09:10:20+00:00,ADD_CART,PROD_019235,CAT_00442,USER_082028,SESSION_161308,41.67
4,2025-06-19 11:13:58+00:00,ADD_CART,PROD_001702,CAT_00025,USER_096574,SESSION_182859,86.11
...,...,...,...,...,...,...,...
141214,2025-06-20 13:39:03+00:00,BUY,PROD_014962,CAT_00019,USER_090115,SESSION_141762,177.32
141215,2025-06-13 11:59:24+00:00,BUY,PROD_002627,CAT_00030,USER_088499,SESSION_083133,35.73
141216,2025-06-07 09:21:08+00:00,BUY,PROD_002575,CAT_00030,USER_038679,SESSION_177107,73.35
141217,2025-06-16 20:15:38+00:00,BUY,PROD_000500,CAT_00062,USER_019663,SESSION_019029,378.94


In [93]:
# Column dtypes and non-null counts of the raw training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     141219 non-null  object 
 1   event_type     141219 non-null  object 
 2   product_id     141219 non-null  object 
 3   category_id    141219 non-null  object 
 4   user_id        141219 non-null  object 
 5   user_session   141219 non-null  object 
 6   session_value  141219 non-null  float64
dtypes: float64(1), object(6)
memory usage: 7.5+ MB


In [94]:
# Summary statistics; session_value is the only numeric column at this stage,
# so it is the only one reported.
df_train.describe()

Unnamed: 0,session_value
count,141219.0
mean,75.348539
std,121.794683
min,5.38
25%,23.78
50%,40.95
75%,86.44
max,2328.66


In [95]:
# Distinct user identifiers present in the training data.
df_train["user_id"].unique()

array(['USER_097562', 'USER_006535', 'USER_047199', ..., 'USER_008110',
       'USER_052714', 'USER_081093'], dtype=object)

## Feature Engineering

In [96]:
def preprocess_df(df):
    """
    Build model-ready features from a raw event frame without mutating the input.

    The transformation is stateless: nothing is fitted, so calling it on the
    train and test frames separately yields consistently encoded columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'event_time', 'event_type', 'product_id',
        'category_id' and 'user_id'. Any other columns are passed through.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which:
        * 'event_time' is removed and replaced by two integer columns appended
          at the end: 'day_type' (0 = Monday-Friday, 1 = otherwise) and
          'time_of_day' (0 = 05:00-11:59, 1 = 12:00-16:59, 2 = 17:00-20:59,
          3 = any other hour);
        * 'event_type' is mapped to an integer code (values missing from the
          mapping become NaN);
        * 'product_id', 'category_id' and 'user_id' keep only their second
          underscore-separated token, still as strings (e.g. 'PROD_011223'
          becomes '011223').
    """
    df = df.copy()

    # 1) Calendar features derived from the event timestamp.
    event_time = pd.to_datetime(df['event_time'])

    # Expressed as "not a weekday" so that an unparseable timestamp (NaT), for
    # which every comparison is False, still lands in bucket 1.
    weekday = event_time.dt.weekday
    df['day_type'] = (~(weekday < 5)).astype('int64')

    # Vectorized bucketing of the hour; anything outside 05:00-20:59 gets 3.
    hour = event_time.dt.hour
    df['time_of_day'] = np.select(
        [
            ((hour >= 5) & (hour < 12)).to_numpy(),
            ((hour >= 12) & (hour < 17)).to_numpy(),
            ((hour >= 17) & (hour < 21)).to_numpy(),
        ],
        [0, 1, 2],
        default=3,
    ).astype('int64')

    df = df.drop(columns=['event_time'])

    # 2) Fixed integer codes for the event type.
    event_type_mapping = {'ADD_CART': 0, 'VIEW': 1, 'REMOVE_CART': 2, 'BUY': 3}
    df['event_type'] = df['event_type'].map(event_type_mapping)

    # 3) Strip the textual prefix from the identifier columns.
    for id_column in ('product_id', 'category_id', 'user_id'):
        df[id_column] = df[id_column].str.split('_').str[1]

    return df


In [97]:
# Run the same preprocessing on both splits. preprocess_df is called with the
# frame only and returns only a frame: no encoder objects are fitted on the
# training data or handed over to the test data.
df_train_processed, df_test_processed = (
    preprocess_df(frame) for frame in (df_train, df_test)
)


In [98]:
# Inspect the engineered training features (the rendered output elides the middle rows).
df_train_processed

Unnamed: 0,event_type,product_id,category_id,user_id,user_session,session_value,day_type,time_of_day
0,0,011223,00054,097562,SESSION_158779,90.29,0,0
1,0,005519,00144,006535,SESSION_029987,16.39,1,3
2,0,000577,00273,047199,SESSION_022134,64.27,1,3
3,0,019235,00442,082028,SESSION_161308,41.67,0,0
4,0,001702,00025,096574,SESSION_182859,86.11,0,0
...,...,...,...,...,...,...,...,...
141214,3,014962,00019,090115,SESSION_141762,177.32,0,1
141215,3,002627,00030,088499,SESSION_083133,35.73,0,0
141216,3,002575,00030,038679,SESSION_177107,73.35,1,0
141217,3,000500,00062,019663,SESSION_019029,378.94,0,2


### Model Hazırlık

In [99]:
# Feature matrix: every column except the session identifier and the target.
# preprocess_df leaves the ID columns as zero-padded strings, so they are cast
# to integers here, before the split, to make sure the estimator is trained on
# explicitly numeric columns.
ID_COLUMNS = ['product_id', 'category_id', 'user_id']
X = (
    df_train_processed
    .drop(columns=['user_session', 'session_value'])
    .astype({column: 'int64' for column in ID_COLUMNS})
)

# Regression target.
y = df_train_processed['session_value']

In [100]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Cast the string ID columns of the processed training frame to integers.
# NOTE: X and y were derived from df_train_processed in an earlier cell via
# DataFrame.drop, which returns a new frame, so this cast does not propagate
# to X_train / X_test.
for id_col in ('product_id', 'category_id', 'user_id'):
    df_train_processed[id_col] = df_train_processed[id_col].astype(int)

In [102]:
# Check the dtypes of the processed training frame after the ID cast.
df_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_type     141219 non-null  int64  
 1   product_id     141219 non-null  int64  
 2   category_id    141219 non-null  int64  
 3   user_id        141219 non-null  int64  
 4   user_session   141219 non-null  object 
 5   session_value  141219 non-null  float64
 6   day_type       141219 non-null  int64  
 7   time_of_day    141219 non-null  int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 8.6+ MB


In [112]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# Registry of candidate regressors, keyed by the label used when reporting metrics.
# Only a random forest is registered: 600 trees with no depth limit, all features
# considered at each split, all CPU cores, and a fixed seed.
models = dict(
    RandomForest=RandomForestRegressor(
        n_estimators=600,
        max_depth=None,
        min_samples_split=2,
        max_features=None,
        random_state=42,
        n_jobs=-1,
    ),
)


In [113]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Hold-out metrics collected per model name.
results = {}

for name, model in models.items():
    print(f"🔧 Eğitim başlıyor: {name}")

    # Fit on the training split, then predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions and store both metrics under the model's label.
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    results[name] = {'MSE': mse, 'R2 Score': r2}

    print(f"{name} tamamlandı ✅\n")

🔧 Eğitim başlıyor: RandomForest
RandomForest tamamlandı ✅



In [114]:
# Show the MSE and R2 recorded for each model on the held-out split.
results

{'RandomForest': {'MSE': 4142.311981528396, 'R2 Score': 0.7134547538371275}}