# BTK DATATHON-2025

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

In [4]:
df_train = pd.read_csv(r"..\data\train.csv")
df_test = pd.read_csv(r"..\data\test.csv")
df_submission = pd.read_csv(r"..\data\sample_submission.csv")

In [5]:
df_train

Unnamed: 0,event_time,event_type,product_id,category_id,user_id,user_session,session_value
0,2025-06-19 10:23:07+00:00,ADD_CART,PROD_011223,CAT_00054,USER_097562,SESSION_158779,90.29
1,2025-06-07 21:34:45+00:00,ADD_CART,PROD_005519,CAT_00144,USER_006535,SESSION_029987,16.39
2,2025-06-21 21:29:09+00:00,ADD_CART,PROD_000577,CAT_00273,USER_047199,SESSION_022134,64.27
3,2025-06-09 09:10:20+00:00,ADD_CART,PROD_019235,CAT_00442,USER_082028,SESSION_161308,41.67
4,2025-06-19 11:13:58+00:00,ADD_CART,PROD_001702,CAT_00025,USER_096574,SESSION_182859,86.11
...,...,...,...,...,...,...,...
141214,2025-06-20 13:39:03+00:00,BUY,PROD_014962,CAT_00019,USER_090115,SESSION_141762,177.32
141215,2025-06-13 11:59:24+00:00,BUY,PROD_002627,CAT_00030,USER_088499,SESSION_083133,35.73
141216,2025-06-07 09:21:08+00:00,BUY,PROD_002575,CAT_00030,USER_038679,SESSION_177107,73.35
141217,2025-06-16 20:15:38+00:00,BUY,PROD_000500,CAT_00062,USER_019663,SESSION_019029,378.94


In [6]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     141219 non-null  object 
 1   event_type     141219 non-null  object 
 2   product_id     141219 non-null  object 
 3   category_id    141219 non-null  object 
 4   user_id        141219 non-null  object 
 5   user_session   141219 non-null  object 
 6   session_value  141219 non-null  float64
dtypes: float64(1), object(6)
memory usage: 7.5+ MB


In [7]:
df_train.describe()

Unnamed: 0,session_value
count,141219.0
mean,75.348539
std,121.794683
min,5.38
25%,23.78
50%,40.95
75%,86.44
max,2328.66


In [8]:
df_train["user_id"].unique()

array(['USER_097562', 'USER_006535', 'USER_047199', ..., 'USER_008110',
       'USER_052714', 'USER_081093'], dtype=object)

## Feature Enginering

In [9]:
def preprocess_df(df):
    """
    Güvenli preprocess fonksiyonu. 
    encoders: dict, önceden fit edilmiş LabelEncoder'lar
    fit_encoders=True ise yeni encoder fit edilir
    """
    df = df.copy()
    
    # 1️⃣ Event time
    df['event_time'] = pd.to_datetime(df['event_time'])
    df['day_type'] = df['event_time'].dt.weekday.apply(lambda x: 0 if x < 5 else 1)
    
    def get_time_of_day(hour):
        if 5 <= hour < 12:
            return 0
        elif 12 <= hour < 17:
            return 1
        elif 17 <= hour < 21:
            return 2
        else:
            return 3
    df['time_of_day'] = df['event_time'].dt.hour.apply(get_time_of_day)
    df = df.drop(columns=['event_time'])
    
    # 2️⃣ event_type mapping
    event_type_mapping = {'ADD_CART':0,'VIEW':1,'REMOVE_CART':2,'BUY':3}
    df['event_type'] = df['event_type'].map(event_type_mapping)
    
    df['product_id'] = df['product_id'].str.split('_').str[1]
    df['category_id'] = df['category_id'].str.split('_').str[1]
    df['user_id'] = df['user_id'].str.split('_').str[1]

    return df


In [10]:
# 1️⃣ Train set’i preprocess et ve encoder’ları al
df_train_processed = preprocess_df(df_train)

# 2️⃣ Test set’i aynı encoder’larla preprocess et
df_test_processed = preprocess_df(df_test)


In [11]:
df_train_processed

Unnamed: 0,event_type,product_id,category_id,user_id,user_session,session_value,day_type,time_of_day
0,0,011223,00054,097562,SESSION_158779,90.29,0,0
1,0,005519,00144,006535,SESSION_029987,16.39,1,3
2,0,000577,00273,047199,SESSION_022134,64.27,1,3
3,0,019235,00442,082028,SESSION_161308,41.67,0,0
4,0,001702,00025,096574,SESSION_182859,86.11,0,0
...,...,...,...,...,...,...,...,...
141214,3,014962,00019,090115,SESSION_141762,177.32,0,1
141215,3,002627,00030,088499,SESSION_083133,35.73,0,0
141216,3,002575,00030,038679,SESSION_177107,73.35,1,0
141217,3,000500,00062,019663,SESSION_019029,378.94,0,2


### Model Hazırlık

In [12]:
X = df_train_processed.drop(columns=['user_session','session_value'])
y = df_train_processed['session_value']

In [13]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Convert string IDs to integers
df_train_processed['product_id'] = df_train_processed['product_id'].astype(int)
df_train_processed['category_id'] = df_train_processed['category_id'].astype(int)
df_train_processed['user_id'] = df_train_processed['user_id'].astype(int)

In [15]:
df_train_processed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_type     141219 non-null  int64  
 1   product_id     141219 non-null  int32  
 2   category_id    141219 non-null  int32  
 3   user_id        141219 non-null  int32  
 4   user_session   141219 non-null  object 
 5   session_value  141219 non-null  float64
 6   day_type       141219 non-null  int64  
 7   time_of_day    141219 non-null  int64  
dtypes: float64(1), int32(3), int64(3), object(1)
memory usage: 7.0+ MB


In [16]:
RandomForestRegressor()

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [17]:
from sklearn.model_selection import RandomizedSearchCV

In [18]:
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
print(n_estimators)

[100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200]


In [19]:
 #Randomized Search CV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]
# max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]
# Method of selecting samples for training each tree
# bootstrap = [True, False]

In [20]:
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

{'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200], 'max_features': ['auto', 'sqrt'], 'max_depth': [5, 10, 15, 20, 25, 30], 'min_samples_split': [2, 5, 10, 15, 100], 'min_samples_leaf': [1, 2, 5, 10]}


In [21]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()

In [22]:
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = 1)

In [None]:
rf_random.fit(X_train,y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=100, n_estimators=400; total time=   9.7s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=100, n_estimators=400; total time=  10.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=100, n_estimators=400; total time=   9.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=100, n_estimators=400; total time=   9.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=10, min_samples_split=100, n_estimators=400; total time=   9.6s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=200; total time=   0.0s
[CV] END max_depth=20, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_es

In [None]:
rf_random.best_params_

In [None]:
rf_random.best_score_

In [None]:
predictions=rf_random.predict(X_test)

In [None]:
sns.distplot(y_test-predictions)

In [None]:
plt.scatter(y_test,prediction)

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

In [None]:
from sklearn import metrics

In [None]:
print('MAE:', metrics.mean_absolute_error(y_test, prediction))
print('MSE:', metrics.mean_squared_error(y_test, prediction))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, prediction)))