# BTK DATATHON-2025

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df_train = pd.read_csv(r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\train.csv")
df_test = pd.read_csv(r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\test.csv")
df_submission = pd.read_csv(r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\sample_submission.csv")

In [3]:
df_train

Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session,session_value
0,2025-06-19 10:23:07+00:00,ADD_CART,PROD_011223,CAT_00054,USER_097562,SESSION_158779,90.29
1,2025-06-07 21:34:45+00:00,ADD_CART,PROD_005519,CAT_00144,USER_006535,SESSION_029987,16.39
2,2025-06-21 21:29:09+00:00,ADD_CART,PROD_000577,CAT_00273,USER_047199,SESSION_022134,64.27
3,2025-06-09 09:10:20+00:00,ADD_CART,PROD_019235,CAT_00442,USER_082028,SESSION_161308,41.67
4,2025-06-19 11:13:58+00:00,ADD_CART,PROD_001702,CAT_00025,USER_096574,SESSION_182859,86.11
...,...,...,...,...,...,...,...
141214,2025-06-20 13:39:03+00:00,BUY,PROD_014962,CAT_00019,USER_090115,SESSION_141762,177.32
141215,2025-06-13 11:59:24+00:00,BUY,PROD_002627,CAT_00030,USER_088499,SESSION_083133,35.73
141216,2025-06-07 09:21:08+00:00,BUY,PROD_002575,CAT_00030,USER_038679,SESSION_177107,73.35
141217,2025-06-16 20:15:38+00:00,BUY,PROD_000500,CAT_00062,USER_019663,SESSION_019029,378.94


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     141219 non-null  object 
 1   event_type     141219 non-null  object 
 2   product_id     141219 non-null  object 
 3   category_id    141219 non-null  object 
 4   user_id        141219 non-null  object 
 5   user_session   141219 non-null  object 
 6   session_value  141219 non-null  float64
dtypes: float64(1), object(6)
memory usage: 7.5+ MB


In [5]:
df_train.describe()

Unnamed: 0,session_value
count,141219.0
mean,75.348539
std,121.794683
min,5.38
25%,23.78
50%,40.95
75%,86.44
max,2328.66


In [6]:
df_train["user_id"].unique()

array(['USER_097562', 'USER_006535', 'USER_047199', ..., 'USER_008110',
       'USER_052714', 'USER_081093'], dtype=object)

## Feature Enginering

In [7]:
def preprocess_df(df):
    """
    Verilen dataframe üzerinde aşağıdaki işlemleri yapar:
    1. event_time → day_type, time_of_day (manuel mapping ile sayısal)
    2. event_type → manuel mapping
    3. product_id, category_id, user_id, user_session → Label Encoding
    4. Orijinal event_time sütunu silinir
    """
    df = df.copy()
    
    # 1️⃣ Event time feature engineering
    df['event_time'] = pd.to_datetime(df['event_time'])
    
    # Weekday / Weekend
    df['day_type'] = df['event_time'].dt.weekday.apply(lambda x: 'Weekend' if x >= 5 else 'Weekday')
    # Manuel mapping: Weekday=0, Weekend=1
    day_type_mapping = {'Weekday': 0, 'Weekend': 1}
    df['day_type'] = df['day_type'].map(day_type_mapping)
    
    # Saat dilimi
    def get_time_of_day(hour):
        if 5 <= hour < 12:
            return 'Morning'
        elif 12 <= hour < 17:
            return 'Afternoon'
        elif 17 <= hour < 21:
            return 'Evening'
        else:
            return 'Night'
    
    df['time_of_day'] = df['event_time'].dt.hour.apply(get_time_of_day)
    # Manuel mapping: Morning=0, Afternoon=1, Evening=2, Night=3
    time_mapping = {'Morning': 0, 'Afternoon': 1, 'Evening': 2, 'Night': 3}
    df['time_of_day'] = df['time_of_day'].map(time_mapping)
    
    # Orijinal event_time sütunu sil
    df = df.drop(columns=['event_time'])
    
    # 2️⃣ event_type manuel mapping
    event_type_mapping = {
        'ADD_CART': 0,
        'VIEW': 1,
        'REMOVE_CART': 2,
        'BUY': 3
    }
    df['event_type'] = df['event_type'].map(event_type_mapping)
    
    # 3️⃣ Label Encoding diğer kategorik sütunlar
    le_cols = ['product_id', 'category_id', 'user_id']
    for col in le_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    
    return df

In [8]:
df_train_processed = preprocess_df(df_train)
df_test_processed = preprocess_df(df_test)

In [9]:
df_train_processed

Unnamed: 0,event_type,product_id,category_id,user_id,user_session,session_value,day_type,time_of_day
0,0,9278,52,49935,SESSION_158779,90.29,0,0
1,0,4574,139,3275,SESSION_029987,16.39,1,3
2,0,510,265,23114,SESSION_022134,64.27,1,3
3,0,15561,431,41642,SESSION_161308,41.67,0,0
4,0,1411,24,49444,SESSION_182859,86.11,0,0
...,...,...,...,...,...,...,...,...
141214,3,12275,18,46050,SESSION_141762,177.32,0,1
141215,3,2214,28,45172,SESSION_083133,35.73,0,0
141216,3,2162,28,18922,SESSION_177107,73.35,1,0
141217,3,446,60,9757,SESSION_019029,378.94,0,2


### Model Hazırlık

In [10]:
X = df_train_processed.drop(columns=['user_session','session_value'])
y = df_train_processed['session_value']

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
# Initialize and train the Linear Regression model
model = LinearRegression(n_jobs=-1)
model.fit(X_train, y_train)

In [13]:
# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate MSE and RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Modelin Ortalama Kare Hatası (MSE): {mse:.4f}")
print(f"Modelin Kök Ortalama Kare Hatası (RMSE): {rmse:.4f}")

Modelin Ortalama Kare Hatası (MSE): 13925.5485
Modelin Kök Ortalama Kare Hatası (RMSE): 118.0066
