# BTK DATATHON-2025

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Locations of the competition CSVs.
# NOTE(review): these are absolute paths on one specific Windows machine; the
# cell only runs as-is where this exact directory layout exists.
train_csv = r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\train.csv"
test_csv = r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\test.csv"
submission_csv = r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\sample_submission.csv"

# Load the training set, the test set and the sample submission, in that order.
df_train = pd.read_csv(train_csv)
df_test = pd.read_csv(test_csv)
df_submission = pd.read_csv(submission_csv)

In [None]:
# Display the raw training dataframe as the cell output.
df_train

In [None]:
# Print the column names, dtypes and non-null counts of the training data.
df_train.info()

In [None]:
# Show summary statistics for the training data.
df_train.describe()

In [None]:
# List the distinct user_id values present in the training data.
df_train["user_id"].unique()

## Feature Engineering

In [None]:
def preprocess_df(df):
    """Return a copy of an event-level dataframe with engineered numeric features.

    The input dataframe is not modified. The following steps are applied:

    1. ``event_time`` is parsed with ``pd.to_datetime`` and replaced by two
       integer columns:

       * ``day_type``: 0 for Monday-Friday, 1 for Saturday/Sunday.
       * ``time_of_day``: 0 = Morning (05:00-11:59), 1 = Afternoon
         (12:00-16:59), 2 = Evening (17:00-20:59), 3 = Night (all other
         hours).

       The original ``event_time`` column is dropped.
    2. ``event_type`` is mapped manually: ADD_CART=0, VIEW=1, REMOVE_CART=2,
       BUY=3. Any value outside this mapping becomes NaN.
    3. ``product_id``, ``category_id`` and ``user_id`` are label-encoded.

    Every other column (for example ``user_session``) is passed through
    unchanged.

    NOTE(review): a fresh ``LabelEncoder`` is fitted on every call, so the
    integer codes depend on the values present in ``df``. Codes produced by
    separate calls (e.g. one for train, one for test) are therefore not
    guaranteed to refer to the same original ids.

    Args:
        df: Dataframe containing at least ``event_time``, ``event_type``,
            ``product_id``, ``category_id`` and ``user_id``.

    Returns:
        A new dataframe with the transformations above applied.
    """
    df = df.copy()

    # 1) Time features, computed column-wise instead of one Python call per row.
    event_time = pd.to_datetime(df['event_time'])

    # weekday is Monday=0 ... Sunday=6, so >= 5 means Saturday or Sunday.
    df['day_type'] = (event_time.dt.weekday >= 5).astype('int64')

    hour = event_time.dt.hour
    bucket_conditions = [
        ((hour >= 5) & (hour < 12)).to_numpy(),    # Morning   -> 0
        ((hour >= 12) & (hour < 17)).to_numpy(),   # Afternoon -> 1
        ((hour >= 17) & (hour < 21)).to_numpy(),   # Evening   -> 2
    ]
    # Anything not matched above (21:00-04:59) falls through to Night -> 3.
    time_codes = np.select(bucket_conditions, [0, 1, 2], default=3)
    # Force int64 so the dtype does not depend on the platform's default int.
    df['time_of_day'] = time_codes.astype('int64')

    # The raw timestamp is no longer needed once the features are derived.
    df = df.drop(columns=['event_time'])

    # 2) Manual event_type mapping.
    event_type_mapping = {
        'ADD_CART': 0,
        'VIEW': 1,
        'REMOVE_CART': 2,
        'BUY': 3,
    }
    df['event_type'] = df['event_type'].map(event_type_mapping)

    # 3) Label-encode the id-like categorical columns (see NOTE in docstring).
    for col in ('product_id', 'category_id', 'user_id'):
        df[col] = LabelEncoder().fit_transform(df[col])

    return df

In [None]:
# Apply the same preprocessing to the train and test frames, independently.
# NOTE(review): preprocess_df fits new LabelEncoders on each call, so the
# integer codes of product_id / category_id / user_id in these two frames are
# not guaranteed to map to the same original ids -- confirm this is acceptable
# before predicting on the test frame with a model trained on the train frame.
df_train_processed = preprocess_df(df_train)
df_test_processed = preprocess_df(df_test)

In [None]:
# Display the preprocessed training dataframe as the cell output.
df_train_processed

### Model Hazırlık

In [None]:
# Regression target column.
target_col = 'session_value'

# Features: every preprocessed column except the session identifier and the
# target itself.
X = df_train_processed.drop(columns=['user_session', target_col])
# Target vector.
y = df_train_processed[target_col]

In [None]:
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import numpy as np
import joblib

# Random-forest regression on (X, y): baseline cross-validation, a randomized
# hyperparameter search over the same folds, then saving the best estimator.

# K-Fold: one shuffled 5-fold splitter, reused by every evaluation below.
# NOTE(review): the split is row-level. If several rows of one user_session
# share the same session_value (cannot be verified from this cell), rows of a
# single session can end up in both the training and validation folds and make
# the CV scores optimistic; a group-aware splitter keyed on user_session would
# avoid that -- confirm against the data.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Baseline CV with default hyperparameters.
# NOTE(review): n_jobs=-1 is requested on both the estimator and
# cross_val_score (nested parallelism) -- confirm this is intended.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
scores = cross_val_score(model, X, y, cv=kf, scoring="neg_root_mean_squared_error", n_jobs=-1)
# The scorer reports negated RMSE, so flip the sign to get RMSE.
rmse_scores = -scores
print(f"CV RMSE: mean = {rmse_scores.mean():.5f}, std = {rmse_scores.std():.5f}")

# Hyperparam search space: 5 parameters with 2 candidates each (32 combinations).
param_dist = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2']
}

# RandomizedSearchCV (the same CV splitter is reused for consistency).
# n_iter=20 samples 20 of the 32 possible combinations.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    scoring="neg_root_mean_squared_error",
    cv=kf,
    random_state=42,
    n_jobs=-1,
    verbose=2,
    return_train_score=True
)

search.fit(X, y)

print("Best Params:", search.best_params_)
# best_score_ is expressed in the 'neg_root_mean_squared_error' scoring, i.e. it
# is a negated RMSE, hence the sign flip below.
print("Best RMSE (CV):", -search.best_score_)

# Optionally re-validate the best estimator with CV on the same folds.
best = search.best_estimator_
scores_best = cross_val_score(best, X, y, cv=kf, scoring="neg_root_mean_squared_error", n_jobs=-1)
print(f"Best estimator CV RMSE: mean = {(-scores_best).mean():.5f}, std = {(-scores_best).std():.5f}")

# Save the best estimator to "rf_best.pkl" (relative path, so it lands in the
# current working directory).
joblib.dump(best, "rf_best.pkl")
