In [1]:
import polars as pl

DATA_PATH = "/kaggle/input/trendyol-e-ticaret-hackathonu-2025-kaggle/data"

In [2]:
# Read train, test and content data
train_sessions = pl.read_parquet(f"{DATA_PATH}/train_sessions.parquet")
test_sessions = pl.read_parquet(f"{DATA_PATH}/test_sessions.parquet")

content_metadata = pl.read_parquet(f"{DATA_PATH}/content/metadata.parquet")

In [3]:
# Read user data
user_metadata = pl.read_parquet(f"{DATA_PATH}/user/metadata.parquet")
user_search_log = pl.read_parquet(f"{DATA_PATH}/user/search_log.parquet")

In [4]:
user_fashion_search_log = pl.read_parquet(f"{DATA_PATH}/user/fashion_search_log.parquet")

unique_users = user_fashion_search_log["user_id_hashed"].unique()
chunk_size = 10000

summaries = []
for i in range(0, len(unique_users), chunk_size):
    chunk_users = unique_users[i:i+chunk_size]
    temp = (
        user_fashion_search_log
        .filter(pl.col("user_id_hashed").is_in(chunk_users))
        .group_by(["user_id_hashed", "content_id_hashed"])
        .agg([
            pl.col("total_search_click").mean().alias("user_fashion_search_log_avg_total_search_click"),
            pl.col("total_search_impression").mean().alias("user_fashion_search_log_avg_total_search_impression"),
        ])
    )
    summaries.append(temp)

user_fashion_search_log_summary = pl.concat(summaries)

In [5]:
del user_fashion_search_log

import gc
gc.collect()

0

In [6]:
content_sitewide_log = pl.read_parquet(f"{DATA_PATH}/content/sitewide_log.parquet")

# Summarize the content past history with mean of specific content ID values
content_sitewide_log_summary = (
    content_sitewide_log
    .group_by("content_id_hashed")
    .agg([
        pl.col("total_click").mean().alias("content_sitewide_avg_total_click"),
        pl.col("total_cart").mean().alias("content_sitewide_avg_total_cart"),
        pl.col("total_fav").mean().alias("content_sitewide_avg_total_fav"),
        pl.col("total_order").mean().alias("content_sitewide_avg_total_order"),
    ])
)

In [7]:
user_sitewide_log = pl.read_parquet(f"{DATA_PATH}/user/sitewide_log.parquet")

# Summarize the user site log and fashion site log history with mean of specific user ID values
user_sitewide_log_summary = (
    user_sitewide_log
    .group_by("user_id_hashed")
    .agg([
        pl.col("total_click").mean().alias("user_sitewide_avg_total_click"),
        pl.col("total_cart").mean().alias("user_sitewide_avg_total_cart"),
        pl.col("total_fav").mean().alias("user_sitewide_avg_total_fav"),
        pl.col("total_order").mean().alias("user_sitewide_avg_total_order"),
    ])
)



In [8]:
user_fashion_sitewide_log = pl.read_parquet(f"{DATA_PATH}/user/fashion_sitewide_log.parquet")

unique_users = user_fashion_sitewide_log["user_id_hashed"].unique()
chunk_size = 1000

summaries = []
for i in range(0, len(unique_users), chunk_size):
    chunk_users = unique_users[i:i+chunk_size]
    temp = (
        user_fashion_sitewide_log
        .filter(pl.col("user_id_hashed").is_in(chunk_users))
        .group_by(["user_id_hashed", "content_id_hashed"])
        .agg([
            pl.col("total_click").mean().alias("user_fashion_sitewide_avg_total_click"),
            pl.col("total_cart").mean().alias("user_fashion_sitewide_avg_total_cart"),
            pl.col("total_fav").mean().alias("user_fashion_sitewide_avg_total_fav"),
            pl.col("total_order").mean().alias("user_fashion_sitewide_avg_total_order"),
        ])
    )
    summaries.append(temp)

user_fashion_sitewide_log_summary = pl.concat(summaries)



In [9]:
content_top_terms_log = pl.read_parquet(f"{DATA_PATH}/content/top_terms_log.parquet")

unique_users = content_top_terms_log["content_id_hashed"].unique()
chunk_size = 1000

summaries = []
for i in range(0, len(unique_users), chunk_size):
    chunk_users = unique_users[i:i+chunk_size]
    temp = (
        content_top_terms_log
        .filter(pl.col("content_id_hashed").is_in(chunk_users))
        .group_by(["content_id_hashed", "search_term_normalized"])
        .agg([
            pl.col("total_search_click").mean().alias("content_top_terms_log_avg_total_search_click"),
            pl.col("total_search_impression").mean().alias("content_top_terms_log_avg_total_search_impression"),
        ])
    )
    summaries.append(temp)

content_top_terms_log_summary = pl.concat(summaries)

In [10]:
user_top_terms_log = pl.read_parquet(f"{DATA_PATH}/user/top_terms_log.parquet")

unique_users = user_top_terms_log["user_id_hashed"].unique()
chunk_size = 1000

summaries = []
for i in range(0, len(unique_users), chunk_size):
    chunk_users = unique_users[i:i+chunk_size]
    temp = (
        user_top_terms_log
        .filter(pl.col("user_id_hashed").is_in(chunk_users))
        .group_by(["user_id_hashed", "search_term_normalized"])
        .agg([
            pl.col("total_search_click").mean().alias("user_top_terms_log_avg_total_search_click"),
            pl.col("total_search_impression").mean().alias("user_top_terms_log_avg_total_search_impression"),
        ])
    )
    summaries.append(temp)

user_top_terms_log_summary = pl.concat(summaries)


In [11]:
term_search_log = pl.read_parquet(f"{DATA_PATH}/term/search_log.parquet")

term_search_log_summary = (
    term_search_log
    .group_by("search_term_normalized")
    .agg([
        pl.col("total_search_click").mean().alias("term_search_log_avg_total_search_click"),
        pl.col("total_search_impression").mean().alias("term_search_log_avg_total_search_impression"),
    ])
)



In [12]:
content_price_data = pl.read_parquet(f"{DATA_PATH}/content/price_rate_review_data.parquet")

content_price_data_summary = (
    content_price_data
    .group_by("content_id_hashed")
    .agg([
        pl.col("update_date").mean().alias("content_price_data_avg_update_date"),
        pl.col("original_price").mean().alias("content_price_data_avg_original_price"),
        pl.col("selling_price").mean().alias("content_price_data_avg_selling_price"),
        pl.col("discounted_price").mean().alias("content_price_data_avg_discounted_price"),
        pl.col("content_review_count").mean().alias("content_price_data_avg_content_review_count"),        
        pl.col("content_review_wth_media_count").mean().alias("content_price_data_avg_content_review_wth_media_count"),
        pl.col("content_rate_count").mean().alias("content_price_data_avg_content_rate_count"),
        pl.col("content_rate_avg").mean().alias("content_price_data_avg_content_rate_avg"),        
    ])
)

In [13]:
del content_sitewide_log
del user_sitewide_log
del user_fashion_sitewide_log
del content_top_terms_log
del user_top_terms_log
del term_search_log
del content_price_data

import gc
gc.collect()

0

In [14]:
def calculate_click_impression_ratio(df: pl.DataFrame, prefix: str):
    return df.with_columns(
        (pl.col(f'{prefix}_total_search_click') / pl.col(f'{prefix}_total_search_impression')).alias(f'{prefix}_click_impression_ratio')
    )

In [15]:
#new features about price and rate
"""
 from (original price - selling price - discounted price)
choose the min value 
choose the max value 
calculate the discount rate =  ((max - min) / max) * 100

total_rate_score = content_rate_count * content_rate_avg

media_review_ratio = content_review_wth_media_count / content_review_count
"""

price_cols = ['content_price_data_avg_original_price', 'content_price_data_avg_selling_price', 'content_price_data_avg_discounted_price']

# Determine the min and max values out of the given price values
content_price_data_summary = content_price_data_summary.with_columns([
    (pl.max_horizontal(price_cols).alias('max_price')),
    pl.min_horizontal(price_cols).alias("min_price")
    ])

# From the min and max values, compute discount rate information value
content_price_data_summary = content_price_data_summary.with_columns([
    ((pl.col("max_price") - pl.col("min_price")) / pl.col("max_price") * 100)
    .alias("discount_rate")
])

#create a total score feature from content count and content rate average values
content_price_data_summary = content_price_data_summary.with_columns([
    (pl.col("content_price_data_avg_content_rate_avg") / pl.col("content_price_data_avg_content_rate_count"))
    .alias("total_rate_score")
])

#create a media_rate feature from reviews with and without media file
content_price_data_summary = content_price_data_summary.with_columns([
    (pl.col("content_price_data_avg_content_review_wth_media_count") / pl.col("content_price_data_avg_content_review_count"))
    .alias("media_review_ratio")
])

# Remove the deprecated original_price, selling_price and discounted_price
content_price_data_summary = content_price_data_summary.drop(['content_price_data_avg_original_price', 'content_price_data_avg_selling_price', 'content_price_data_avg_discounted_price'])

In [16]:
# Calculate ratio
def add_ratios(df: pl.DataFrame, prefix: str):
    return df.with_columns([
        (pl.col(f"{prefix}_total_click") / pl.col(f"{prefix}_total_cart").clip(lower_bound=1e-6) / pl.col(f"{prefix}_total_order").clip(lower_bound=1e-6)).alias(f"{prefix}_click_cart_order_ratio"),
        (pl.col(f"{prefix}_total_click") / pl.col(f"{prefix}_total_cart").clip(lower_bound=1e-6)).alias(f"{prefix}_click_cart_ratio"),
        (pl.col(f"{prefix}_total_click") / pl.col(f"{prefix}_total_order").clip(lower_bound=1e-6)).alias(f"{prefix}_click_order_ratio"),
    ])

In [17]:
# Apply the click_impression_ratio function to each column with search and impression value in it
#content_top_terms_log = content_top_terms_log.with_columns(pl.col('content_total_search_click') / pl.col('content_total_search_impression')).alias(f'{prefix}_click_impresssion_ratio')

# Rename column names so that they dont get similar to each other when joining
content_top_terms_log_summary = calculate_click_impression_ratio(content_top_terms_log_summary, "content_top_terms_log_avg")
user_top_terms_log_summary = calculate_click_impression_ratio(user_top_terms_log_summary, "user_top_terms_log_avg")
user_fashion_search_log_summary = calculate_click_impression_ratio(user_fashion_search_log_summary, "user_fashion_search_log_avg")
term_search_log_summary = calculate_click_impression_ratio(term_search_log_summary, "term_search_log_avg")

In [18]:
# Transform user birth year to user age information
user_metadata = user_metadata.with_columns(
    (pl.lit(2025.0) - pl.col('user_birth_year')).alias('user_age')
)

# Transform user tenure in days to user tenure in years
user_metadata = user_metadata.with_columns(
    (pl.col('user_tenure_in_days').alias('user_tenure_in_years') // 365.25)
)

# Remove the deprecated user birth year and user tenure in days informations
user_metadata = user_metadata.drop(['user_birth_year', 'user_tenure_in_days'])

# Define the gender map for encoding gender information
gender_map = {
    "Bayan": 1,
    "Bay": -1,
    "UNKNOWN": 0
}

# Gender encode operation
df_user_metadata = user_metadata.with_columns(
    pl.col("user_gender").replace(gender_map).cast(pl.Int8).alias("user_gender_encoded")
)
# Remove the deprecated user gender information in string
df_user_metadata = df_user_metadata.drop('user_gender')

df_user_metadata.head()

user_id_hashed,user_age,user_tenure_in_years,user_gender_encoded
str,f64,f64,i8
"""94238d723579f0bd""",33.0,7.0,1
"""b041bad2ad93c114""",27.0,2.0,0
"""90361194b60e5f3b""",29.0,5.0,1
"""e570ed7193aa197a""",46.0,5.0,1
"""f71c3c802b3934a1""",38.0,4.0,0


In [19]:
train_sessions = train_sessions.with_columns(train_sessions["ts_hour"].cast(pl.Date).alias("ts_date"))
test_sessions = test_sessions.with_columns(test_sessions["ts_hour"].cast(pl.Date).alias("ts_date"))
content_price_data_summary = content_price_data_summary.with_columns(content_price_data_summary["content_price_data_avg_update_date"].cast(pl.Date).alias("ts_date"))

In [20]:
# Oran kolonlarını ekliyorum
user_sitewide_log_summary = add_ratios(user_sitewide_log_summary, "user_sitewide_avg")
user_fashion_sitewide_log_summary = add_ratios(user_fashion_sitewide_log_summary, "user_fashion_sitewide_avg")

content_sitewide_log_summary = add_ratios(content_sitewide_log_summary, "content_sitewide_avg")

In [21]:
train_sessions = (
    train_sessions
    .join(content_metadata, on=["content_id_hashed"], how="left")
    .join(content_price_data_summary, on=["content_id_hashed"], how="left")
    .join(df_user_metadata, on=["user_id_hashed"], how="left")
    .join(user_sitewide_log_summary, on=["user_id_hashed"], how="left")
    .join(user_fashion_sitewide_log_summary, on=["user_id_hashed", "content_id_hashed"], how="left")
    .join(content_sitewide_log_summary, on=["content_id_hashed"], how="left")
    .join(content_top_terms_log_summary, on=["content_id_hashed", "search_term_normalized"], how="left")
    .join(user_top_terms_log_summary, on=["user_id_hashed", "search_term_normalized"], how="left")
    .join(user_fashion_search_log_summary, on=["user_id_hashed", "content_id_hashed"], how="left")
    .join(term_search_log_summary, on=["search_term_normalized"], how="left")
)

test_sessions = (
    test_sessions
    .join(content_metadata, on=["content_id_hashed"], how="left")
    .join(content_price_data_summary, on=["content_id_hashed"], how="left")
    .join(df_user_metadata, on=["user_id_hashed"], how="left")
    .join(user_sitewide_log_summary, on=["user_id_hashed"], how="left")
    .join(user_fashion_sitewide_log_summary, on=["user_id_hashed", "content_id_hashed"], how="left")
    .join(content_sitewide_log_summary, on=["content_id_hashed"], how="left")
    .join(content_top_terms_log_summary, on=["content_id_hashed", "search_term_normalized"], how="left")
    .join(user_top_terms_log_summary, on=["user_id_hashed", "search_term_normalized"], how="left")
    .join(user_fashion_search_log_summary, on=["user_id_hashed", "content_id_hashed"], how="left")
    .join(term_search_log_summary, on=["search_term_normalized"], how="left")
)

In [22]:
import lightgbm as lgb

# Define the columns to be used for train data
train_data_ordered = train_sessions.select([
    "ordered", "content_price_data_avg_content_review_count", "content_price_data_avg_content_review_wth_media_count",
    "content_price_data_avg_content_rate_count", "content_price_data_avg_content_rate_avg", "max_price", "min_price", "discount_rate", "total_rate_score",
    "media_review_ratio", "attribute_type_count", 
    "total_attribute_option_count", "merchant_count", "filterable_label_count",
    "user_age", "user_tenure_in_years",
    "user_gender_encoded", "user_sitewide_avg_total_click", 
    "user_sitewide_avg_total_cart", "user_sitewide_avg_total_fav", "user_sitewide_avg_total_order",
    "user_sitewide_avg_click_cart_order_ratio", "user_sitewide_avg_click_cart_ratio", 
    "user_sitewide_avg_click_order_ratio", "user_fashion_sitewide_avg_total_click", 
    "user_fashion_sitewide_avg_total_cart","user_fashion_sitewide_avg_total_fav", 
    "user_fashion_sitewide_avg_total_order", "user_fashion_sitewide_avg_click_cart_order_ratio", 
    "user_fashion_sitewide_avg_click_cart_ratio", "user_fashion_sitewide_avg_click_order_ratio",
    "content_sitewide_avg_total_click", "content_sitewide_avg_total_cart","content_sitewide_avg_total_fav", 
    "content_sitewide_avg_total_order", "content_sitewide_avg_click_cart_order_ratio", 
    "content_sitewide_avg_click_cart_ratio", "content_sitewide_avg_click_order_ratio",
    "content_top_terms_log_avg_total_search_click", "content_top_terms_log_avg_total_search_impression", "content_top_terms_log_avg_click_impression_ratio",
    "user_top_terms_log_avg_total_search_click", "user_top_terms_log_avg_total_search_impression", "user_top_terms_log_avg_click_impression_ratio",
    "user_fashion_search_log_avg_total_search_click", "user_fashion_search_log_avg_total_search_impression", "user_fashion_search_log_avg_click_impression_ratio",
    "term_search_log_avg_total_search_click", "term_search_log_avg_total_search_impression", "term_search_log_avg_click_impression_ratio"
    ])

# Separate the train data to X and Y values
train_X_ordered = train_data_ordered.drop("ordered").to_pandas()
train_y_ordered = train_data_ordered["ordered"].to_pandas()

# Define the parameters for the LightGBM model
best_params = {
    'num_leaves': int(42.75006318015603),       # 42
    'feature_fraction': 0.3655838442409418,
    'bagging_fraction': 0.9106145615289158,
    'max_depth': int(5.166369536643747),        # 5
    'lambda_l1': 0.5382833996798397,
    'lambda_l2': 1.7851561926187207,
    'min_split_gain': 0.053451918857089345,
    'min_child_weight': 28.034408570615987
}

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbose': -1,
    'random_state': 7,
    **best_params
}

model_ordered = lgb.LGBMClassifier(**params)
model_ordered.fit(train_X_ordered, train_y_ordered)

In [23]:
# Define the columns to be used for train data
train_data_clicked = train_sessions.select([
    "clicked", "content_price_data_avg_content_review_count", "content_price_data_avg_content_review_wth_media_count",
    "content_price_data_avg_content_rate_count", "content_price_data_avg_content_rate_avg", "max_price", "min_price", "discount_rate", "total_rate_score",
    "media_review_ratio", "attribute_type_count", 
    "total_attribute_option_count", "merchant_count", "filterable_label_count",
    "user_age", "user_tenure_in_years",
    "user_gender_encoded", "user_sitewide_avg_total_click", 
    "user_sitewide_avg_total_cart", "user_sitewide_avg_total_fav", "user_sitewide_avg_total_order",
    "user_sitewide_avg_click_cart_order_ratio", "user_sitewide_avg_click_cart_ratio", 
    "user_sitewide_avg_click_order_ratio", "user_fashion_sitewide_avg_total_click", 
    "user_fashion_sitewide_avg_total_cart","user_fashion_sitewide_avg_total_fav", 
    "user_fashion_sitewide_avg_total_order", "user_fashion_sitewide_avg_click_cart_order_ratio", 
    "user_fashion_sitewide_avg_click_cart_ratio", "user_fashion_sitewide_avg_click_order_ratio",
    "content_sitewide_avg_total_click", "content_sitewide_avg_total_cart","content_sitewide_avg_total_fav", 
    "content_sitewide_avg_total_order", "content_sitewide_avg_click_cart_order_ratio", 
    "content_sitewide_avg_click_cart_ratio", "content_sitewide_avg_click_order_ratio",
    "content_top_terms_log_avg_total_search_click", "content_top_terms_log_avg_total_search_impression", "content_top_terms_log_avg_click_impression_ratio",
    "user_top_terms_log_avg_total_search_click", "user_top_terms_log_avg_total_search_impression", "user_top_terms_log_avg_click_impression_ratio",
    "user_fashion_search_log_avg_total_search_click", "user_fashion_search_log_avg_total_search_impression", "user_fashion_search_log_avg_click_impression_ratio",
    "term_search_log_avg_total_search_click", "term_search_log_avg_total_search_impression", "term_search_log_avg_click_impression_ratio"
    ])

# Separate the train data to X and Y values
train_X_clicked = train_data_clicked.drop("clicked").to_pandas()
train_y_clicked = train_data_clicked["clicked"].to_pandas()

# Define the parameters for the LightGBM model
best_params = {
    'num_leaves': int(29.444871248515597),
    'feature_fraction': 0.9,
    'bagging_fraction': 1.0,
    'max_depth': int(8.551469766095197),
    'lambda_l1': 4.682851969380666,
    'lambda_l2': 1.0868114611097193,
    'min_split_gain': 0.001,
    'min_child_weight': 28.161955204255598
}

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbose': -1,
    'random_state': 7,
    **best_params
}
model_clicked = lgb.LGBMClassifier(**params)
model_clicked.fit(train_X_clicked, train_y_clicked)

In [24]:
# Define the columns to be used for train data
train_data_added_to_cart = train_sessions.select([
    "added_to_cart", "content_price_data_avg_content_review_count", "content_price_data_avg_content_review_wth_media_count",
    "content_price_data_avg_content_rate_count", "content_price_data_avg_content_rate_avg", "max_price", "min_price", "discount_rate", "total_rate_score",
    "media_review_ratio", "attribute_type_count", 
    "total_attribute_option_count", "merchant_count", "filterable_label_count",
    "user_age", "user_tenure_in_years",
    "user_gender_encoded", "user_sitewide_avg_total_click", 
    "user_sitewide_avg_total_cart", "user_sitewide_avg_total_fav", "user_sitewide_avg_total_order",
    "user_sitewide_avg_click_cart_order_ratio", "user_sitewide_avg_click_cart_ratio", 
    "user_sitewide_avg_click_order_ratio", "user_fashion_sitewide_avg_total_click", 
    "user_fashion_sitewide_avg_total_cart","user_fashion_sitewide_avg_total_fav", 
    "user_fashion_sitewide_avg_total_order", "user_fashion_sitewide_avg_click_cart_order_ratio", 
    "user_fashion_sitewide_avg_click_cart_ratio", "user_fashion_sitewide_avg_click_order_ratio",
    "content_sitewide_avg_total_click", "content_sitewide_avg_total_cart","content_sitewide_avg_total_fav", 
    "content_sitewide_avg_total_order", "content_sitewide_avg_click_cart_order_ratio", 
    "content_sitewide_avg_click_cart_ratio", "content_sitewide_avg_click_order_ratio",
    "content_top_terms_log_avg_total_search_click", "content_top_terms_log_avg_total_search_impression", "content_top_terms_log_avg_click_impression_ratio",
    "user_top_terms_log_avg_total_search_click", "user_top_terms_log_avg_total_search_impression", "user_top_terms_log_avg_click_impression_ratio",
    "user_fashion_search_log_avg_total_search_click", "user_fashion_search_log_avg_total_search_impression", "user_fashion_search_log_avg_click_impression_ratio",
    "term_search_log_avg_total_search_click", "term_search_log_avg_total_search_impression", "term_search_log_avg_click_impression_ratio"
    ])

# Separate the train data to X and Y values
train_X_added_to_cart = train_data_added_to_cart.drop("added_to_cart").to_pandas()
train_y_added_to_cart = train_data_added_to_cart["added_to_cart"].to_pandas()

# Define the parameters for the LightGBM model
best_params = {
    'num_leaves': int(29.367895167617537),
    'feature_fraction': 0.6370475455514243,
    'bagging_fraction': 0.7995029565695583,
    'max_depth': int(7.862155260766498),
    'lambda_l1': 4.686747673224523,
    'lambda_l2': 1.0554293056191062,
    'min_split_gain': 0.026109775649668203,
    'min_child_weight': 27.135987997022514 }

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbose': -1,
    'random_state': 7,
    **best_params
}

model_added_to_cart = lgb.LGBMClassifier(**params)
model_added_to_cart.fit(train_X_added_to_cart, train_y_added_to_cart)

In [25]:
# Define the columns to be used for train data
train_data_added_to_fav = train_sessions.select([
    "added_to_fav", "content_price_data_avg_content_review_count", "content_price_data_avg_content_review_wth_media_count",
    "content_price_data_avg_content_rate_count", "content_price_data_avg_content_rate_avg", "max_price", "min_price", "discount_rate", "total_rate_score",
    "media_review_ratio", "attribute_type_count", 
    "total_attribute_option_count", "merchant_count", "filterable_label_count",
    "user_age", "user_tenure_in_years",
    "user_gender_encoded", "user_sitewide_avg_total_click", 
    "user_sitewide_avg_total_cart", "user_sitewide_avg_total_fav", "user_sitewide_avg_total_order",
    "user_sitewide_avg_click_cart_order_ratio", "user_sitewide_avg_click_cart_ratio", 
    "user_sitewide_avg_click_order_ratio", "user_fashion_sitewide_avg_total_click", 
    "user_fashion_sitewide_avg_total_cart","user_fashion_sitewide_avg_total_fav", 
    "user_fashion_sitewide_avg_total_order", "user_fashion_sitewide_avg_click_cart_order_ratio", 
    "user_fashion_sitewide_avg_click_cart_ratio", "user_fashion_sitewide_avg_click_order_ratio",
    "content_sitewide_avg_total_click", "content_sitewide_avg_total_cart","content_sitewide_avg_total_fav", 
    "content_sitewide_avg_total_order", "content_sitewide_avg_click_cart_order_ratio", 
    "content_sitewide_avg_click_cart_ratio", "content_sitewide_avg_click_order_ratio",
    "content_top_terms_log_avg_total_search_click", "content_top_terms_log_avg_total_search_impression", "content_top_terms_log_avg_click_impression_ratio",
    "user_top_terms_log_avg_total_search_click", "user_top_terms_log_avg_total_search_impression", "user_top_terms_log_avg_click_impression_ratio",
    "user_fashion_search_log_avg_total_search_click", "user_fashion_search_log_avg_total_search_impression", "user_fashion_search_log_avg_click_impression_ratio",
    "term_search_log_avg_total_search_click", "term_search_log_avg_total_search_impression", "term_search_log_avg_click_impression_ratio"
    ])

# Separate the train data to X and Y values
train_X_added_to_fav = train_data_added_to_fav.drop("added_to_fav").to_pandas()
train_y_added_to_fav = train_data_added_to_fav["added_to_fav"].to_pandas()

# Define the parameters for the LightGBM model
best_params = {
    'num_leaves': int(30.703241072648808),
    'feature_fraction': 0.1782819436040616,
    'bagging_fraction': 0.5282457464911301,
    'max_depth': int(8.526412364373703),
    'lambda_l1': 0.8471383839571267,
    'lambda_l2': 1.3630054197633439,
    'min_split_gain': 0.028308385885611347,
    'min_child_weight': 5.8439044317519535
}

params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'verbose': -1,
    'random_state': 7,
    **best_params    
}

model_added_to_fav = lgb.LGBMClassifier(**params)
model_added_to_fav.fit(train_X_added_to_fav, train_y_added_to_fav)

In [26]:
test_sessions = test_sessions.with_columns(
    prediction_order = model_ordered.predict_proba(
        test_sessions.select(train_X_ordered.columns).to_pandas()
    )[:, 1],
    prediction_clicked = model_clicked.predict_proba(
        test_sessions.select(train_X_clicked.columns).to_pandas()
    )[:, 1]
)

In [27]:
test_sessions = test_sessions.with_columns(
    prediction_added_to_cart = model_added_to_cart.predict_proba(
        test_sessions.select(train_X_added_to_cart.columns).to_pandas()
    )[:, 1],
    prediction_added_to_fav = model_added_to_fav.predict_proba(
        test_sessions.select(train_X_added_to_fav.columns).to_pandas()
    )[:, 1]
)

In [28]:
order_weight = 24.893159164894975
added_to_cart_weight = 2.7180381266570746
added_to_fav_weight = 2.949427030275091
click_weight =  0.8420917092839927


test_sessions = test_sessions.with_columns(
    prediction = (
        (pl.col("prediction_order") * order_weight +
         pl.col("prediction_clicked") * click_weight) +
        (pl.col("prediction_added_to_cart") * added_to_cart_weight) +
        (pl.col("prediction_added_to_fav") * added_to_fav_weight)
    )
)

test_sessions = test_sessions.sort(["session_id", "prediction"], descending=True)
test_sessions.head()

ts_hour,search_term_normalized,user_id_hashed,content_id_hashed,session_id,ts_date,level1_category_name,level2_category_name,leaf_category_name,attribute_type_count,total_attribute_option_count,merchant_count,filterable_label_count,content_creation_date,cv_tags,content_price_data_avg_update_date,content_price_data_avg_content_review_count,content_price_data_avg_content_review_wth_media_count,content_price_data_avg_content_rate_count,content_price_data_avg_content_rate_avg,max_price,min_price,discount_rate,total_rate_score,media_review_ratio,ts_date_right,user_age,user_tenure_in_years,user_gender_encoded,user_sitewide_avg_total_click,user_sitewide_avg_total_cart,user_sitewide_avg_total_fav,user_sitewide_avg_total_order,user_sitewide_avg_click_cart_order_ratio,user_sitewide_avg_click_cart_ratio,user_sitewide_avg_click_order_ratio,user_fashion_sitewide_avg_total_click,user_fashion_sitewide_avg_total_cart,user_fashion_sitewide_avg_total_fav,user_fashion_sitewide_avg_total_order,user_fashion_sitewide_avg_click_cart_order_ratio,user_fashion_sitewide_avg_click_cart_ratio,user_fashion_sitewide_avg_click_order_ratio,content_sitewide_avg_total_click,content_sitewide_avg_total_cart,content_sitewide_avg_total_fav,content_sitewide_avg_total_order,content_sitewide_avg_click_cart_order_ratio,content_sitewide_avg_click_cart_ratio,content_sitewide_avg_click_order_ratio,content_top_terms_log_avg_total_search_click,content_top_terms_log_avg_total_search_impression,content_top_terms_log_avg_click_impression_ratio,user_top_terms_log_avg_total_search_click,user_top_terms_log_avg_total_search_impression,user_top_terms_log_avg_click_impression_ratio,user_fashion_search_log_avg_total_search_click,user_fashion_search_log_avg_total_search_impression,user_fashion_search_log_avg_click_impression_ratio,term_search_log_avg_total_search_click,term_search_log_avg_total_search_impression,term_search_log_avg_click_impression_ratio,prediction_order,prediction_clicked,prediction_added_to_cart,prediction_added_to_fav,prediction
"datetime[μs, UTC]",str,str,str,str,date,str,str,str,f64,f64,f64,f64,"datetime[μs, UTC]",str,datetime[ms],f64,f64,f64,f64,f64,f64,f64,f64,f64,date,f64,f64,i8,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
2025-07-12 17:00:00 UTC,"""canta""","""331eb01975a779d7""","""7461948a60e17761""","""test_fffe1513d5463e39""",2025-07-12,"""Aksesuar""","""Çanta""","""Omuz Çantası""",1.0,1.0,1.0,2.0,2025-06-11 00:00:00 UTC,"""minimalizm, ucuz şık""",2025-07-09 00:00:00,2e-06,8.3838e-07,2e-06,5.0,0.001596,0.001596,0.0,2783100.0,0.466667,2025-07-09,,1.0,0,3e-06,2.8745e-07,1e-06,0.0,2826500.0,2.826544,2.826544,0.0,0.0,4.7908e-07,0.0,0.0,0.0,0.0,0.000171,7e-06,4.6e-05,1e-06,19595000.0,24.528889,136.271605,1e-05,0.000504,0.018865,,,,,,,0.002599,0.125296,0.020739,0.017908,0.167501,0.021544,0.034367,0.746752
2025-07-12 17:00:00 UTC,"""canta""","""331eb01975a779d7""","""eb9bb5f0d155e0d4""","""test_fffe1513d5463e39""",2025-07-12,"""Aksesuar""","""Çanta""","""Omuz Çantası""",1.0,1.0,3.0,7.0,2025-02-10 00:00:00 UTC,,2025-07-11 00:00:00,2e-06,4.7908e-07,3e-06,4.857143,0.000846,0.000846,0.0,1448400.0,0.2,2025-07-11,,1.0,0,3e-06,2.8745e-07,1e-06,0.0,2826500.0,2.826544,2.826544,4.7908e-07,0.0,4.7908e-07,0.0,479075.273577,0.479075,0.479075,0.001571,3.5e-05,0.000229,5e-06,9002100.0,45.074468,313.851852,6.4e-05,0.002114,0.03014,,,,,,,0.002599,0.125296,0.020739,0.012727,0.147587,0.015294,0.052923,0.638762
2025-07-12 17:00:00 UTC,"""canta""","""331eb01975a779d7""","""6a4a415c816368b2""","""test_fffe1513d5463e39""",2025-07-12,"""Aksesuar""","""Çanta""","""Omuz Çantası""",1.0,1.0,3.0,2.0,2025-06-11 00:00:00 UTC,,2025-07-09 08:00:00,3.1938e-07,3.1938e-07,3.1938e-07,5.0,0.001863,0.001696,8.946352,15655000.0,1.0,2025-07-09,,1.0,0,3e-06,2.8745e-07,1e-06,0.0,2826500.0,2.826544,2.826544,0.0,0.0,4.7908e-07,0.0,0.0,0.0,0.0,0.000207,3e-06,4.3e-05,6.3362e-07,59937000.0,59.93722,206.558713,1.3e-05,0.000507,0.026253,,,,,,,0.002599,0.125296,0.020739,0.012431,0.167271,0.021692,0.039822,0.626714
2025-07-12 17:00:00 UTC,"""canta""","""331eb01975a779d7""","""1995da2ecc59696b""","""test_fffe1513d5463e39""",2025-07-12,"""Aksesuar""","""Çanta""","""El Çantası""",1.0,1.0,1.0,3.0,2025-06-11 00:00:00 UTC,,2025-07-11 00:00:00,0.0,0.0,0.0,,0.001145,0.001145,0.0,,,2025-07-11,,1.0,0,3e-06,2.8745e-07,1e-06,0.0,2826500.0,2.826544,2.826544,,,,,,,,4.3e-05,2e-06,1.5e-05,4.7908e-07,20000000.0,20.0,43.116775,4e-06,6.5e-05,0.055351,,,,,,,0.002599,0.125296,0.020739,0.001805,0.058305,0.003072,0.021778,0.166608
2025-07-12 17:00:00 UTC,"""canta""","""331eb01975a779d7""","""810cc6d724ee73e9""","""test_fffe1513d5463e39""",2025-07-12,"""Aksesuar""","""Çanta""","""Abiye Çanta""",1.0,1.0,1.0,4.0,2025-06-27 00:00:00 UTC,,2025-07-10 00:00:00,6.3877e-07,1.5969e-07,6.3877e-07,3.333333,0.001095,0.001095,0.0,5218400.0,0.25,2025-07-10,,1.0,0,3e-06,2.8745e-07,1e-06,0.0,2826500.0,2.826544,2.826544,,,,,,,,3.5e-05,2e-06,1e-05,7.5854e-07,19261000.0,19.26087,35.371724,4e-06,6.2e-05,0.059663,,,,,,,0.002599,0.125296,0.020739,0.001841,0.063341,0.003138,0.018234,0.161486


In [29]:
submission_df = test_sessions.group_by("session_id").agg(
    pl.col("content_id_hashed").alias("prediction")
).with_columns(
    pl.col("prediction").list.join(" ")
)
submission_df

session_id,prediction
str,str
"""test_b4d93291695ce111""","""08273eea541c9ffb 617602f231149…"
"""test_c82fda9fc67bb22f""","""51f35d901d6ff890 782b9565985d1…"
"""test_9ea6253c9f159ddf""","""06ae371d42dd4afc a5f102de5f5e0…"
"""test_5f123aa9e195d84a""","""7ce7661af5f28105 3d797a67e0733…"
"""test_900170e64c9dcc3a""","""7f243a11773e5108 43d1f18b8111a…"
…,…
"""test_79f2883d238ab02a""","""a4a5d7bb3bb25a81 6061ea770ef74…"
"""test_ebb702960282c5e8""","""6fdc03c036bea05f 7964e4fd38fec…"
"""test_5511551085cf012e""","""85c40f5e1be55c4a 161a7de01f932…"
"""test_057c61b8e7e65009""","""f38e3c14d003f1ba f4c6a33263e59…"


In [30]:
 submission_df.write_csv("sample_submission_level14.csv")