In [1]:
import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('../../data/train.csv')

# Display the first few rows of the dataframe
print("İlk 5 satır:")
print(df.head())

# Get a summary of the dataframe
print("\nVeri seti hakkında genel bilgiler:")
df.info()

# Check for missing values
print("\nEksik veri sayıları:")
print(df.isnull().sum())

İlk 5 satır:
                  event_time event_type   product_id category_id      user_id  \
0  2025-06-19 10:23:07+00:00   ADD_CART  PROD_011223   CAT_00054  USER_097562   
1  2025-06-07 21:34:45+00:00   ADD_CART  PROD_005519   CAT_00144  USER_006535   
2  2025-06-21 21:29:09+00:00   ADD_CART  PROD_000577   CAT_00273  USER_047199   
3  2025-06-09 09:10:20+00:00   ADD_CART  PROD_019235   CAT_00442  USER_082028   
4  2025-06-19 11:13:58+00:00   ADD_CART  PROD_001702   CAT_00025  USER_096574   

     user_session  session_value  
0  SESSION_158779          90.29  
1  SESSION_029987          16.39  
2  SESSION_022134          64.27  
3  SESSION_161308          41.67  
4  SESSION_182859          86.11  

Veri seti hakkında genel bilgiler:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141219 entries, 0 to 141218
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     141219 non-null  object 
 1   even

In [2]:
# Convert 'event_time' to datetime objects
df['event_time'] = pd.to_datetime(df['event_time'])

# Extract time-based features
df['hour'] = df['event_time'].dt.hour
df['day_of_week'] = df['event_time'].dt.dayofweek # Monday=0, Sunday=6
df['month'] = df['event_time'].dt.month

# Display the dataframe with new features
print("Yeni zaman özellikleri eklenmiş veri seti:")
print(df.head())

# One-hot encode the 'event_type' column
event_type_dummies = pd.get_dummies(df['event_type'], prefix='event')
df = pd.concat([df, event_type_dummies], axis=1)

print("\n'event_type' sütunu one-hot encoding ile dönüştürüldü:")
print(df.head())

Yeni zaman özellikleri eklenmiş veri seti:
                 event_time event_type   product_id category_id      user_id  \
0 2025-06-19 10:23:07+00:00   ADD_CART  PROD_011223   CAT_00054  USER_097562   
1 2025-06-07 21:34:45+00:00   ADD_CART  PROD_005519   CAT_00144  USER_006535   
2 2025-06-21 21:29:09+00:00   ADD_CART  PROD_000577   CAT_00273  USER_047199   
3 2025-06-09 09:10:20+00:00   ADD_CART  PROD_019235   CAT_00442  USER_082028   
4 2025-06-19 11:13:58+00:00   ADD_CART  PROD_001702   CAT_00025  USER_096574   

     user_session  session_value  hour  day_of_week  month  
0  SESSION_158779          90.29    10            3      6  
1  SESSION_029987          16.39    21            5      6  
2  SESSION_022134          64.27    21            5      6  
3  SESSION_161308          41.67     9            0      6  
4  SESSION_182859          86.11    11            3      6  

'event_type' sütunu one-hot encoding ile dönüştürüldü:
                 event_time event_type   product_id ca

In [3]:
# Aggregate data by user_session
session_df = df.groupby('user_session').agg(
    # Count the occurrences of each event type
    views=('event_VIEW', 'sum'),
    add_to_carts=('event_ADD_CART', 'sum'),
    removals_from_cart=('event_REMOVE_CART', 'sum'),
    buys=('event_BUY', 'sum'),
    # Count unique products and categories
    unique_products=('product_id', 'nunique'),
    unique_categories=('category_id', 'nunique'),
    # Take the mean of session_value
    session_value=('session_value', 'mean'),
    # Count total events in the session
    total_events=('event_type', 'count')
)

print("Oturum bazında gruplanmış veri seti:")
print(session_df.head())

# Outlier removal using the IQR method
Q1 = session_df['session_value'].quantile(0.25)
Q3 = session_df['session_value'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

print(f"\nAykırı değerleri temizlemeden önceki veri boyutu: {session_df.shape}")

session_df_no_outliers = session_df[(session_df['session_value'] >= lower_bound) & (session_df['session_value'] <= upper_bound)]

print(f"Aykırı değerleri temizledikten sonraki veri boyutu: {session_df_no_outliers.shape}")

# Save the processed data to a new CSV file
session_df_no_outliers.to_csv('processed_train.csv')

Oturum bazında gruplanmış veri seti:
                views  add_to_carts  removals_from_cart  buys  \
user_session                                                    
SESSION_000000      0            20                   8     0   
SESSION_000001      1             2                   2     1   
SESSION_000004      1             0                   0     0   
SESSION_000005      1             0                   0     0   
SESSION_000012      1             0                   0     0   

                unique_products  unique_categories  session_value  \
user_session                                                        
SESSION_000000               24                 20         355.80   
SESSION_000001                5                  5          96.60   
SESSION_000004                1                  1          30.92   
SESSION_000005                1                  1          40.09   
SESSION_000012                1                  1          23.06   

                total_e

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the processed data
data = pd.read_csv('processed_train.csv')

# Prepare the data
X = data.drop(['user_session', 'session_value'], axis=1)
y = data['session_value']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 1. Randomized Search for Random Forest ---
print("Random Forest modeli için Randomized Search başlatılıyor...")

# Define the parameter grid
rf_param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)] + [None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Instantiate the model and RandomizedSearchCV
rf = RandomForestRegressor(random_state=42)
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=rf_param_dist,
    n_iter=20,  # Number of parameter settings that are sampled
    cv=3,       # 3-fold cross-validation
    verbose=2,
    random_state=42,
    n_jobs=-1   # Use all available cores
)

# Fit the model
rf_random.fit(X_train, y_train)

print("\nRandom Forest için en iyi parametreler bulundu:")
print(rf_random.best_params_)

# --- 2. Randomized Search for LightGBM ---
print("\nLightGBM modeli için Randomized Search başlatılıyor...")

# Define the parameter grid
lgb_param_dist = {
    'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'num_leaves': [20, 31, 40, 50, 60],
    'max_depth': [-1, 10, 20, 30],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Instantiate the model and RandomizedSearchCV
lgbm = lgb.LGBMRegressor(random_state=42)
lgb_random = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=lgb_param_dist,
    n_iter=20,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Fit the model
lgb_random.fit(X_train, y_train)

print("\nLightGBM için en iyi parametreler bulundu:")
print(lgb_random.best_params_)

# --- 3. Evaluate the best models ---
print("\n--- Model Değerlendirme Sonuçları ---")

# Get the best estimators
best_rf = rf_random.best_estimator_
best_lgb = lgb_random.best_estimator_

# Make predictions
y_pred_rf = best_rf.predict(X_test)
y_pred_lgb = best_lgb.predict(X_test)

# Calculate metrics
def print_metrics(model_name, y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{model_name} Performansı:")
    print(f"  Ortalama Mutlak Hata (MAE): {mae:.4f}")
    print(f"  Ortalama Karesel Hata (MSE): {mse:.4f}")
    print(f"  Kök Ortalama Karesel Hata (RMSE): {rmse:.4f}")
    print(f"  R-kare (R²): {r2:.4f}")

print_metrics("Random Forest", y_test, y_pred_rf)
print_metrics("LightGBM", y_test, y_pred_lgb)

  from scipy.sparse import csr_matrix, issparse


Random Forest modeli için Randomized Search başlatılıyor...
Fitting 3 folds for each of 20 candidates, totalling 60 fits


30 fits failed out of a total of 60.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "f:\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "f:\Python\Python312\Lib\site-packages\sklearn\base.py", line 1358, in wrapper
    estimator._validate_params()
  File "f:\Python\Python312\Lib\site-packages\sklearn\base.py", line 471, in _validate_params
    validate_parameter_constraints(
  File "f:\Python\Python312\Lib\site-packages\sklearn\utils\_param_validation.py", line 98, in validate_parameter_constraints
    raise InvalidParameterError(
sklearn.utils


Random Forest için en iyi parametreler bulundu:
{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'max_depth': 10, 'bootstrap': True}

LightGBM modeli için Randomized Search başlatılıyor...
Fitting 3 folds for each of 20 candidates, totalling 60 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 115
[LightGBM] [Info] Number of data points in the train set: 51136, number of used features: 7
[LightGBM] [Info] Start training from score 30.726905

LightGBM için en iyi parametreler bulundu:
{'subsample': 1.0, 'num_leaves': 31, 'n_estimators': 800, 'min_child_samples': 50, 'max_depth': 10, 'learning_rate': 0.01, 'colsample_bytree': 0.6}

--- Model Değerlendirme Sonuçları ---

Random Forest Performansı:
  Ortalama Mutlak Hata (MAE): 10.5050
  O

## Testcsv pred

In [5]:
print("\n--- Adım 3: Test Verisi Üzerinde Tahmin Yapılıyor ---")

# Test verisini yükle
try:
    df_test = pd.read_csv('../../data/test.csv')
except FileNotFoundError:
    print("Hata: 'test.csv' dosyası bulunamadı. Lütfen dosyanın doğru yolda olduğundan emin olun.")
    exit()

# Eğitim verisine uygulanan adımların aynısını test verisine uygula
df_test['event_time'] = pd.to_datetime(df_test['event_time'])
df_test['hour'] = df_test['event_time'].dt.hour
df_test['day_of_week'] = df_test['event_time'].dt.dayofweek
df_test['month'] = df_test['event_time'].dt.month

test_event_dummies = pd.get_dummies(df_test['event_type'], prefix='event')
df_test = pd.concat([df_test, test_event_dummies], axis=1)

# Test verisini user_session bazında grupla
# Not: Test verisinde 'session_value' olmadığı için agg fonksiyonundan çıkarıldı
test_session_df = df_test.groupby('user_session').agg(
    views=('event_VIEW', 'sum'),
    add_to_carts=('event_ADD_CART', 'sum'),
    removals_from_cart=('event_REMOVE_CART', 'sum'),
    buys=('event_BUY', 'sum'),
    unique_products=('product_id', 'nunique'),
    unique_categories=('category_id', 'nunique'),
    total_events=('event_type', 'count')
)

# Eğitim ve test setlerindeki sütunları hizala
training_columns = X_train.columns
# Test setinde eksik olan sütunları bul ve 0 ile doldur
missing_cols = set(training_columns) - set(test_session_df.columns)
for c in missing_cols:
    test_session_df[c] = 0
# Sütunların sırasının eğitim setiyle aynı olduğundan emin ol
test_session_df = test_session_df[training_columns]

print(f"Test verisi işlendi. Boyut: {test_session_df.shape}")

test_session_df.to_csv('processed_test.csv')
# Tahminleri yap
# test_predictions = best_lgb.predict(test_session_df)

# # Submission dosyasını oluştur
# submission_df = pd.DataFrame({
#     'user_session': test_session_df.index,
#     'session_value': test_predictions
# })

# # Olası negatif tahminleri 0'a çek
# submission_df['session_value'] = submission_df['session_value'].clip(lower=0)

# # Dosyayı kaydet
# submission_df.to_csv('submission.csv', index=False)

# print("\n'submission.csv' dosyası başarıyla oluşturuldu.")
# print("Submission dosyasının ilk 5 satırı:")
# print(submission_df.head())



--- Adım 3: Test Verisi Üzerinde Tahmin Yapılıyor ---
Test verisi işlendi. Boyut: (30789, 7)
