# BTK DATATHON-2025

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load train.csv, test.csv and sample_submission.csv into DataFrames.
df_train, df_test, df_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\train.csv",
        r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\test.csv",
        r"C:\Users\yusuf\OneDrive\Masaüstü\btk25\data\sample_submission.csv",
    )
)

In [None]:
# Show the raw training frame as the cell output for a first visual check.
df_train

In [None]:
# Print column names, dtypes and non-null counts of the training frame.
df_train.info()

In [None]:
# Summary statistics of the training frame's columns.
df_train.describe()

In [None]:
# List the distinct user_id values present in the training frame.
df_train["user_id"].unique()

## Feature Engineering

In [None]:
def preprocess_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a model-ready copy of a raw event DataFrame.

    The input frame is never modified; every step works on a copy.

    Transformations:
      * ``event_time`` is parsed to datetime and replaced by two features:
        ``day_type`` (0 = Monday-Friday, 1 = Saturday/Sunday) and
        ``time_of_day`` (0 = 05:00-11:59, 1 = 12:00-16:59,
        2 = 17:00-20:59, 3 = 21:00-04:59).  A missing timestamp ends up in
        the fallback buckets (``day_type`` 1, ``time_of_day`` 3).
      * ``event_type`` is mapped to an integer code (ADD_CART=0, VIEW=1,
        REMOVE_CART=2, BUY=3); any other value becomes NaN.
      * ``product_id``, ``category_id`` and ``user_id`` keep only their
        second underscore-separated segment, converted to a number so that
        train and test frames both reach the model with numeric IDs.

    Args:
        df: Frame containing at least the columns ``event_time``,
            ``event_type``, ``product_id``, ``category_id`` and ``user_id``.
            Any other column is passed through unchanged.

    Returns:
        A new DataFrame without ``event_time`` and with the added
        ``day_type`` and ``time_of_day`` columns.

    Raises:
        ValueError: If a timestamp cannot be parsed or an ID segment is not
            numeric.
    """
    df = df.copy()

    # --- Time features -----------------------------------------------------
    event_time = pd.to_datetime(df['event_time'])

    # "not (weekday < 5)" rather than "weekday >= 5" so that a missing
    # timestamp (NaN weekday) still lands in bucket 1, like the fallback
    # branch of a plain if/else would.
    weekday = event_time.dt.weekday
    df['day_type'] = (~(weekday < 5)).astype('int64')

    # Vectorized bucketing of the hour; everything outside 05-20h (including
    # a missing hour) falls through to the default bucket 3.
    hour = event_time.dt.hour.to_numpy()
    df['time_of_day'] = np.select(
        [
            (hour >= 5) & (hour < 12),
            (hour >= 12) & (hour < 17),
            (hour >= 17) & (hour < 21),
        ],
        [0, 1, 2],
        default=3,
    )
    df = df.drop(columns=['event_time'])

    # --- Event type --------------------------------------------------------
    event_type_mapping = {'ADD_CART': 0, 'VIEW': 1, 'REMOVE_CART': 2, 'BUY': 3}
    df['event_type'] = df['event_type'].map(event_type_mapping)

    # --- Identifiers -------------------------------------------------------
    # Keep the segment after the prefix and make it numeric here, so every
    # frame that goes through this function gets the same numeric ID columns.
    for id_column in ('product_id', 'category_id', 'user_id'):
        suffix = df[id_column].str.split('_').str[1]
        df[id_column] = pd.to_numeric(suffix)

    return df


In [None]:
# Run the same preprocessing function over the training and the test frame.
df_train_processed, df_test_processed = map(preprocess_df, (df_train, df_test))


In [None]:
# Show the preprocessed training frame as the cell output.
df_train_processed

### Model Hazırlık

In [None]:
# Target is the 'session_value' column; the feature matrix is everything
# else except the 'user_session' column.
y = df_train_processed['session_value']
X = df_train_processed.drop(['user_session', 'session_value'], axis=1)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Cast the three ID columns of the processed training frame to integers.
# NOTE(review): X, y and the train/test split were created in earlier cells,
# so this cast only changes df_train_processed - confirm that is intended.
for id_column in ('product_id', 'category_id', 'user_id'):
    df_train_processed[id_column] = df_train_processed[id_column].astype(int)

In [None]:
# Print column names, dtypes and non-null counts of the processed frame.
df_train_processed.info()

In [None]:
# Build a RandomForestRegressor with default settings; the result is not
# assigned, so this cell only displays the estimator.
RandomForestRegressor()

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Candidate forest sizes: 100 to 1200 trees in steps of 100.
n_estimators = list(range(100, 1201, 100))
print(n_estimators)

In [None]:
 #Randomized Search CV

# Candidate values for the randomized hyperparameter search.

# Number of trees in the forest: 100, 200, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what the former 'auto' option meant for regressors; 'auto' itself
# was deprecated in scikit-learn 1.1 and is rejected from 1.3 onwards.
max_features = [1.0, 'sqrt']
# Maximum number of levels in a tree: 5, 10, ..., 30.
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
# Minimum number of samples required to split a node.
min_samples_split = [2, 5, 10, 15, 100]
# Minimum number of samples required at each leaf node.
min_samples_leaf = [1, 2, 5, 10]

In [None]:
# Bundle the candidate lists into the parameter grid, keyed by the estimator
# argument each list belongs to, and print it for inspection.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

print(random_grid)

In [None]:
# Base model whose hyperparameters the randomized search will tune.
# The fixed seed makes every fitted forest repeatable, in line with the
# seeded train/test split and the seeded search.
rf = RandomForestRegressor(random_state=42)

In [None]:
# Randomized hyperparameter search: 100 sampled combinations, each scored
# by 5-fold cross-validation on negative mean squared error, run on a
# single worker with a fixed seed for the sampling.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    scoring='neg_mean_squared_error',
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=1,
)

In [None]:
# Run the randomized search on the training split.
rf_random.fit(X_train,y_train)

In [None]:
# Hyperparameter combination selected by the search.
rf_random.best_params_

In [None]:
# Cross-validated score of the selected combination; the search is scored
# with 'neg_mean_squared_error', so this is a negated MSE.
rf_random.best_score_

In [None]:
# Predict the target for the held-out split with the fitted search object.
predictions=rf_random.predict(X_test)

In [None]:
# Distribution of the residuals (actual minus predicted) on the held-out
# split. sns.distplot has been deprecated since seaborn 0.11 and is scheduled
# for removal; histplot with a KDE overlay on a density scale is its
# documented replacement.
sns.histplot(y_test - predictions, kde=True, stat="density")

In [None]:
# Actual vs. predicted values on the held-out split. The predictions are
# stored in `predictions`; the name `prediction` is never defined.
plt.scatter(y_test, predictions)

In [None]:
# Error metrics on the held-out split.
# `metrics` is imported here because the notebook's own import of it sits in
# a later cell, which would make this cell fail when run top to bottom.
from sklearn import metrics

mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
from sklearn import metrics

In [None]:
# Error metrics on the held-out split. The predictions are stored in
# `predictions`; the name `prediction` is never defined.
mse = metrics.mean_squared_error(y_test, predictions)
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))