In [1]:
import pandas as pd
import numpy as np
import optuna
import numpy as np
from optuna.integration import LightGBMPruningCallback
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import lightgbm as lgb

In [2]:
train_data = pd.read_csv("./IE582_Fall21_train_data.zip").drop_duplicates().reset_index(drop=True)
train_ids = train_data.unique_id.unique()
labels = train_data[["unique_id", "gender"]].drop_duplicates().set_index("unique_id").loc[train_ids]

test_data = pd.read_csv("./IE582_Fall21_test_data.zip").drop_duplicates().reset_index(drop=True)
test_ids = test_data.unique_id.unique()

all_data = pd.concat([train_data, test_data], axis=0)
all_data.time_stamp = pd.to_datetime(all_data.time_stamp)

def categorize_time(x):
    if x>=5 and x<12:
        return "morning"
    if x>=12 and x<17:
        return "noon"
    if x>=17 and x<22:
        return "evening"
    return "night"
        
all_data["action_period"] = all_data.time_stamp.dt.hour.apply(categorize_time)
all_data["weekend_weekday"] = all_data.time_stamp.dt.dayofweek.apply(lambda x: "weekend" if x>5 else "weekday")

print(train_data.shape, test_data.shape, all_data.shape)

(2077356, 19) (877989, 19) (2955345, 21)


In [3]:
action_counts = (
    all_data.groupby(["contentid"])
    .user_action
    .value_counts(normalize=True)
    .reset_index(name="value")
    .pivot(index="contentid", columns="user_action", values="value")
    .fillna(0).reset_index()
)

#all_data["sellingprice"] = all_data.groupby("contentid").sellingprice.transform(lambda x: x.fillna(x.mean()))
avg_selling_price = (
    all_data
    .groupby(["contentid"])
    .sellingprice.mean()
    .fillna(all_data.sellingprice.median())
    .reset_index()
)

gender_dist =(
    all_data
    .groupby("contentid")
    .gender
    .value_counts(normalize=True)
    .reset_index(name="value")
    .pivot(index="contentid", columns="gender", values="value")
    .fillna(0).reset_index()
)

weekend_day = (
    all_data
    .groupby("contentid")
    .weekend_weekday
    .value_counts(normalize=True)
    .reset_index(name="value")
    .pivot(index="contentid", columns="weekend_weekday", values="value")
    .fillna(0).reset_index()
)

action_period = (
    all_data
    .groupby("contentid")
    .action_period
    .value_counts(normalize=True)
    .reset_index(name="value")
    .pivot(index="contentid", columns="action_period", values="value")
    .fillna(0).reset_index()
)


n_records = (
    all_data.groupby("contentid").size()
)
n_records.name = "n_records"
n_records = n_records.reset_index()

prod_wide = action_counts.merge(avg_selling_price).merge(gender_dist, how="left").merge(n_records).merge(weekend_day).merge(action_period).fillna(0.5)

cat_features = ["Level1_Category_Name"]
prod_wide = prod_wide.merge(all_data[["contentid", "Level1_Category_Name", "product_gender"]].drop_duplicates())
categorical = pd.get_dummies(prod_wide[cat_features])
prod_wide = pd.concat([prod_wide, categorical], axis=1)
prod_wide = prod_wide.drop(columns=cat_features)

prod_wide.head()

Unnamed: 0,contentid,basket,favorite,order,search,visit,sellingprice,F,M,n_records,...,Level1_Category_Name_Aksesuar,Level1_Category_Name_Anne & Bebek & Çocuk,Level1_Category_Name_Ayakkabı,Level1_Category_Name_Elektronik,Level1_Category_Name_Ev & Mobilya,Level1_Category_Name_Giyim,Level1_Category_Name_Kozmetik & Kişisel Bakım,Level1_Category_Name_Spor & Outdoor,Level1_Category_Name_Süpermarket,Level1_Category_Name_Yaşam
0,48852.0,0.0,0.5,0.0,0.5,0.0,189.0,0.0,1.0,2,...,0,0,0,0,0,0,0,1,0,0
1,51969.0,0.0,0.0,0.0,1.0,0.0,39.9,0.0,1.0,1,...,0,0,0,0,0,0,0,1,0,0
2,51973.0,0.0,0.181818,0.0,0.272727,0.545455,20.0,1.0,0.0,11,...,0,0,0,0,0,0,0,1,0,0
3,51995.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3,...,0,0,0,0,0,0,0,1,0,0
4,52007.0,0.153846,0.0,0.076923,0.0,0.769231,37.030769,1.0,0.0,13,...,0,0,0,0,0,0,0,1,0,0


In [4]:
prod_wide.shape

(610035, 27)

In [5]:
all_data.contentid.nunique()

610035

In [6]:
from sklearn.model_selection import cross_val_score

test_prod = prod_wide[prod_wide.product_gender.isna()].reset_index(drop=True)
train_prod = prod_wide[~prod_wide.product_gender.isna()].reset_index(drop=True)

X_train = train_prod.drop(columns=["contentid", "product_gender"])
y_train, classes = pd.factorize(train_prod.product_gender)

rf_est = RandomForestClassifier(n_estimators=100, max_depth=6, class_weight="balanced_subsample", n_jobs=-1)

scores = cross_val_score(estimator=rf_est, X=X_train, y=y_train, scoring="balanced_accuracy", cv=5, verbose=10)
scores


[CV] START .....................................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ................................ score: (test=0.703) total time=  14.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   14.6s remaining:    0.0s


[CV] END ................................ score: (test=0.746) total time=  13.9s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   28.5s remaining:    0.0s


[CV] END ................................ score: (test=0.742) total time=  13.6s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   42.1s remaining:    0.0s


[CV] END ................................ score: (test=0.707) total time=  13.4s
[CV] START .....................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   55.6s remaining:    0.0s


[CV] END ................................ score: (test=0.705) total time=  13.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.2min finished


array([0.70266463, 0.74584295, 0.7424604 , 0.70699283, 0.70498215])

In [7]:
scores.mean()

0.720588593165944

In [8]:
rf_est.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced_subsample', max_depth=6,
                       n_jobs=-1)

In [9]:
genders = rf_est.predict(test_prod.drop(columns=["contentid", "product_gender"]))
test_prod["estimated_gender"] = genders
test_prod["estimated_gender"] = test_prod.estimated_gender.apply(lambda x: classes[x])
test_prod.head()

Unnamed: 0,contentid,basket,favorite,order,search,visit,sellingprice,F,M,n_records,...,Level1_Category_Name_Anne & Bebek & Çocuk,Level1_Category_Name_Ayakkabı,Level1_Category_Name_Elektronik,Level1_Category_Name_Ev & Mobilya,Level1_Category_Name_Giyim,Level1_Category_Name_Kozmetik & Kişisel Bakım,Level1_Category_Name_Spor & Outdoor,Level1_Category_Name_Süpermarket,Level1_Category_Name_Yaşam,estimated_gender
0,52015.0,0.0,0.142857,0.0,0.0,0.857143,48.471429,0.333333,0.666667,7,...,0,0,0,0,0,0,1,0,0,Erkek
1,52099.0,0.133333,0.0,0.0,0.4,0.466667,49.9,1.0,0.0,15,...,0,0,0,0,0,0,1,0,0,Unisex
2,52103.0,0.0,0.0,0.0,1.0,0.0,39.9,0.0,1.0,1,...,0,0,0,0,0,0,0,0,0,Unisex
3,52201.0,0.0,0.0,0.0,1.0,0.0,61.0,1.0,0.0,2,...,0,0,0,0,0,0,1,0,0,Unisex
4,52298.0,0.0,0.5,0.0,0.25,0.25,49.85,0.0,1.0,4,...,0,0,0,0,0,0,1,0,0,Erkek


In [10]:
test_prod.estimated_gender.value_counts()

Unisex    84198
Kadın       986
Erkek       569
Name: estimated_gender, dtype: int64

In [11]:
sorted(zip(rf_est.feature_importances_, rf_est.feature_names_in_), reverse=True)

[(0.2566915085654986, 'Level1_Category_Name_Giyim'),
 (0.19233094491571442, 'Level1_Category_Name_Ev & Mobilya'),
 (0.13766652703785862, 'F'),
 (0.11324461219093002, 'M'),
 (0.06829359631415655, 'Level1_Category_Name_Ayakkabı'),
 (0.06360178056278194, 'Level1_Category_Name_Yaşam'),
 (0.04128800291961739, 'Level1_Category_Name_Elektronik'),
 (0.0397717713457193, 'Level1_Category_Name_Süpermarket'),
 (0.03187145930726974, 'Level1_Category_Name_Aksesuar'),
 (0.02137175331048419, 'sellingprice'),
 (0.009098429250878271, 'Level1_Category_Name_Anne & Bebek & Çocuk'),
 (0.006152214503124268, 'Level1_Category_Name_Spor & Outdoor'),
 (0.005674802364798772, 'Level1_Category_Name_Kozmetik & Kişisel Bakım'),
 (0.0032559515158315645, 'favorite'),
 (0.0022805586039482744, 'visit'),
 (0.0020566774531202508, 'search'),
 (0.001518298882887778, 'n_records'),
 (0.0007942826420625645, 'basket'),
 (0.0007883686267884417, 'night'),
 (0.0005799522857978075, 'evening'),
 (0.00046773370650245096, 'order'),
 (0

In [12]:
all_data = all_data.merge(test_prod[["contentid", "estimated_gender"]], how="left")
all_data.product_gender = all_data.product_gender.fillna(all_data.estimated_gender)
all_data = all_data.drop(columns="estimated_gender")
all_data.product_gender.isna().sum()

3

In [13]:
all_data.shape

(2955345, 21)

In [13]:
all_data["date"] = all_data.time_stamp.dt.date
n_active_days = all_data.groupby("unique_id").date.nunique()
n_active_days.name = "n_active_days"

all_combined = []
for action in all_data.user_action.unique():
    print(action)
    
    temp_data = all_data[all_data.user_action == action].reset_index(drop=True)
    
    prod_gender = (
        temp_data
        .groupby("unique_id")
        .product_gender
        .value_counts(normalize=True)
        .reset_index(name="value")
        .pivot(index="unique_id", columns="product_gender", values="value")
        .fillna(0)
    )
    prod_gender.columns = [f"{action}_ProdGender_{x}" for x in prod_gender]
    
    action_period = (
        temp_data
        .groupby("unique_id")
        .action_period
        .value_counts(normalize=True)
        .reset_index(name="value")
        .pivot(index="unique_id", columns="action_period", values="value")
        .fillna(0)
    )
    action_period.columns = [f"{action}_ActionPeriod_{x}" for x in action_period]
    
    avg_price = temp_data.groupby("unique_id").sellingprice.mean()
    avg_price.name = f"{action}_AvgSellingPrice"
    max_price = temp_data.groupby("unique_id").sellingprice.max()
    max_price.name = f"{action}_MaxSellingPrice"
    min_price = temp_data.groupby("unique_id").sellingprice.min()
    min_price.name = f"{action}_MinSellingPrice"
    
    n_action = temp_data.groupby("unique_id").size()
    n_action.name = f"{action}_count"
    
    category_visit_dist =  (
        temp_data
        .groupby("unique_id")
        .Level1_Category_Name
        .value_counts(normalize=True)
        .reset_index(name="value")
        .pivot(index="unique_id", columns="Level1_Category_Name", values="value")
        .fillna(0)
    )
    category_visit_dist.columns = [f"{action}_Cat1_{x}" for x in category_visit_dist]
    
    weekend_weekday =(
        temp_data
        .groupby("unique_id")
        .weekend_weekday
        .value_counts(normalize=True)
        .reset_index(name="value")
        .pivot(index="unique_id", columns="weekend_weekday", values="value")
        .fillna(0)
    )
    weekend_weekday.columns = [f"{action}_{x}" for x in weekend_weekday]
    
    combined = pd.concat([prod_gender, action_period, avg_price, max_price, min_price, n_action, category_visit_dist, weekend_weekday], axis=1)
    all_combined.append(combined)
    
all_combined.append(n_active_days)
all_combined = pd.concat(all_combined, axis=1).fillna(0)
count_cols = [x for x in all_combined if "count" in x]
all_combined[count_cols] = (all_combined[count_cols].values.T/all_combined[count_cols].sum(axis=1).values).T
all_combined.head()

favorite
visit
search
basket
order


Unnamed: 0_level_0,favorite_ProdGender_Erkek,favorite_ProdGender_Kadın,favorite_ProdGender_Unisex,favorite_ActionPeriod_evening,favorite_ActionPeriod_morning,favorite_ActionPeriod_night,favorite_ActionPeriod_noon,favorite_AvgSellingPrice,favorite_MaxSellingPrice,favorite_MinSellingPrice,...,order_Cat1_Elektronik,order_Cat1_Ev & Mobilya,order_Cat1_Giyim,order_Cat1_Kozmetik & Kişisel Bakım,order_Cat1_Spor & Outdoor,order_Cat1_Süpermarket,order_Cat1_Yaşam,order_weekday,order_weekend,n_active_days
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.01087,0.347826,0.641304,0.326087,0.021739,0.5,0.152174,137.97,2205.0,0.0,...,0.0,0.666667,0.0,0.0,0.333333,0.0,0.0,1.0,0.0,56
2,0.002242,0.599776,0.397982,0.261211,0.052691,0.006726,0.679372,158.402348,7284.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,38
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.727273,0.0,0.0,0.0,0.0,0.909091,0.090909,12
4,0.053846,0.623077,0.323077,0.376923,0.115385,0.123077,0.384615,182.792077,1395.0,0.0,...,0.0,0.089286,0.303571,0.321429,0.017857,0.071429,0.178571,1.0,0.0,57
5,0.0,1.0,0.0,0.0,0.0,0.0,1.0,59.99,59.99,59.99,...,0.0,0.0,0.7,0.0,0.1,0.0,0.0,0.7,0.3,57


In [14]:
all_combined['favorite_segment'] = np.where(all_combined['favorite_AvgSellingPrice'] == 0, np.nan,
                                             np.where(all_combined['favorite_AvgSellingPrice'] < 100, 'low_seg',
                                                     np.where(all_combined['favorite_AvgSellingPrice'] < 300, 'mid_seg', 'high_seg')))
all_combined['visit_segment'] = np.where(all_combined['visit_AvgSellingPrice'] == 0, np.nan,
                                             np.where(all_combined['visit_AvgSellingPrice'] < 100, 'low_seg',
                                                     np.where(all_combined['visit_AvgSellingPrice'] < 300, 'mid_seg','high_seg')))
all_combined['search_segment'] = np.where(all_combined['search_AvgSellingPrice'] == 0, np.nan,
                                             np.where(all_combined['search_AvgSellingPrice'] < 100, 'low_seg',
                                                     np.where(all_combined['search_AvgSellingPrice'] < 300, 'mid_seg','high_seg')))
all_combined['basket_segment'] = np.where(all_combined['basket_AvgSellingPrice'] == 0, np.nan,
                                             np.where(all_combined['basket_AvgSellingPrice'] < 100, 'low_seg',
                                                     np.where(all_combined['basket_AvgSellingPrice'] < 300, 'mid_seg','high_seg')))
all_combined['order_segment'] = np.where(all_combined['order_AvgSellingPrice'] == 0, np.nan,
                                             np.where(all_combined['order_AvgSellingPrice'] < 100, 'low_seg',
                                                     np.where(all_combined['order_AvgSellingPrice'] < 300, 'mid_seg','high_seg')))

In [15]:
n_observations = all_data.groupby(["user_action","unique_id"]).size()
n_observations.name = "n_obs"
n_observations = n_observations.reset_index()
n_observations = n_observations.pivot(index="unique_id", columns=["user_action"], values="n_obs").fillna(0)
n_observations.columns = [f"n_obs_{x}" for x in n_observations]
n_observations["total"] = n_observations.sum(axis=1)
n_observations = n_observations.reset_index()
n_observations

Unnamed: 0,unique_id,n_obs_basket,n_obs_favorite,n_obs_order,n_obs_search,n_obs_visit,total
0,1,109.0,92.0,3.0,671.0,745.0,1620.0
1,2,26.0,892.0,2.0,916.0,781.0,2617.0
2,3,32.0,0.0,11.0,216.0,135.0,394.0
3,4,300.0,130.0,56.0,2126.0,2878.0,5490.0
4,5,205.0,1.0,10.0,598.0,3297.0,4111.0
...,...,...,...,...,...,...,...
7993,7994,0.0,0.0,1.0,0.0,0.0,1.0
7994,7995,0.0,0.0,0.0,0.0,1.0,1.0
7995,7996,1.0,0.0,0.0,0.0,1.0,2.0
7996,7997,0.0,0.0,0.0,0.0,1.0,1.0


In [16]:
all_combined = all_combined.reset_index()
all_combined = all_combined.merge(n_observations)
all_combined = all_combined.set_index("unique_id")

In [17]:
known_features = all_combined.loc[train_ids]
known_features = pd.concat([known_features, labels], axis=1)
submission_features = all_combined.loc[test_ids]

In [18]:
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, ParameterGrid
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, roc_auc_score

train_idx, valid_idx = train_test_split(train_ids, test_size=0.20, random_state=3136)
print(train_idx.shape, valid_idx.shape)
train_features = known_features.loc[train_idx].reset_index()
validation_features = known_features.loc[valid_idx].reset_index()

(4494,) (1124,)


In [19]:
cols_to_remove = ["date", "time_stamp", "contentid", "product_name", "brand_id", "Level1_Category_Id", "Level2_Category_Id", "Level3_Category_Id", "type"]
categorical_features = ["user_action", "businessunit", "product_gender", "Level1_Category_Name", "Level2_Category_Name", "action_period", "weekend_weekday"]
dropped_categorical = ['Level3_Category_Name', "category_id", "brand_name"]
long_data = all_data.copy()
long_data = long_data.drop(columns=cols_to_remove + dropped_categorical)
categorical = pd.get_dummies(long_data[categorical_features])
long_data = pd.concat([long_data, categorical], axis=1)
long_data = long_data.drop(columns=categorical_features)
long_data.sellingprice = long_data.sellingprice.fillna(long_data.sellingprice.median())
long_data = long_data.groupby("unique_id").head(1000)


train_long = long_data[long_data.unique_id.isin(train_ids)].reset_index(drop=True)
test_long = long_data[long_data.unique_id.isin(test_ids)].reset_index(drop=True)

train_features_long = train_long[train_long.unique_id.isin(train_idx)].reset_index(drop=True)
validation_features_long = train_long[train_long.unique_id.isin(valid_idx)].reset_index(drop=True)

In [20]:
del train_long, long_data
import gc
gc.collect()

649

In [21]:
params = {
    'bootstrap': True, 
    'ccp_alpha': 0.0, 
    'class_weight': 'balanced_subsample', 
    'criterion': 'gini', 
    'max_depth': None, 
    'max_features': 0.85, 
    'max_leaf_nodes': None, 
    'max_samples': None, 
    'min_impurity_decrease': 0.0, 
    'min_samples_leaf': 8, 
    'min_samples_split': 8, 
    'min_weight_fraction_leaf': 0.0, 
    'n_estimators': 100,
    "max_depth": 6,
    'n_jobs': -1, 
    'oob_score': True, 
    'random_state': 3136, 
    'verbose': 1, 
    'warm_start': False}

In [22]:
X = train_features_long.drop(columns=["unique_id", "gender"])
y = train_features_long.gender == "F"

X_val = validation_features_long.drop(columns=["unique_id", "gender"])
y_val = validation_features_long.gender == "F"

In [23]:
classifier = RandomForestClassifier(**params)
classifier.fit(X,y)

pred = classifier.predict(X_val) # Uses 0.5 as threshold
pred_prob = classifier.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, pred_prob)
bacc = balanced_accuracy_score(y_val, pred_prob>0.5)
print(f"auc-roc score: {auc:.5f}, balanced acc: {bacc:.5f}")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  5.5min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.2s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.2s


auc-roc score: 0.79370, balanced acc: 0.72084


[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.5s finished


In [24]:
oob_preds = classifier.oob_decision_function_[:,1]
roc_auc_score(y, oob_preds)

0.8040021987492966

In [25]:
oob_features_train = train_features_long[["unique_id"]].copy()
oob_features_train["oob_feature"] = oob_preds

oob_features_val = validation_features_long[["unique_id"]].copy()
oob_features_val["oob_feature"] = pred_prob

test_prob = classifier.predict_proba(test_long.drop(columns=["unique_id", "gender"]))[:,1]
oob_features_test = test_long[["unique_id"]].copy()
oob_features_test["oob_feature"] = test_prob


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.3s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    1.2s finished


In [26]:
oob_features_train_agg = oob_features_train.groupby("unique_id").agg(["min", "max", "mean", "std"])
oob_features_train_agg.columns = ["_".join(str(y) for y in x) for x in oob_features_train_agg]
oob_features_train_agg = oob_features_train_agg.reset_index()

oob_features_val_agg = oob_features_val.groupby("unique_id").agg(["min", "max", "mean", "std"])
oob_features_val_agg.columns = ["_".join(str(y) for y in x) for x in oob_features_val_agg]
oob_features_val_agg = oob_features_val_agg.reset_index()

oob_features_test_agg = oob_features_test.groupby("unique_id").agg(["min", "max", "mean", "std"])
oob_features_test_agg.columns = ["_".join(str(y) for y in x) for x in oob_features_test_agg]
oob_features_test_agg = oob_features_test_agg.reset_index()

In [27]:
# train_features = train_features.merge(oob_features_train.groupby("unique_id").mean().reset_index())
# validation_features = validation_features.merge(oob_features_val.groupby("unique_id").mean().reset_index())
# submission_features = submission_features.reset_index().merge(oob_features_test.groupby("unique_id").mean().reset_index())

train_features = train_features.merge(oob_features_train_agg)
validation_features = validation_features.merge(oob_features_val_agg)
submission_features = submission_features.reset_index().merge(oob_features_test_agg)

In [28]:
X = train_features.drop(columns=["unique_id", "gender"])
y = train_features.gender == "F"

X_val = validation_features.drop(columns=["unique_id", "gender"])
y_val = validation_features.gender == "F"

In [29]:
categorical_features=["favorite_segment", "visit_segment", "search_segment", "basket_segment", "order_segment"]
X[categorical_features] = X[categorical_features].astype("category")
X_val[categorical_features] = X_val[categorical_features].astype("category")

In [30]:
X_full = pd.concat([X, X_val], axis=0).reset_index(drop=True)
y_full = pd.concat([y, y_val], axis=0).reset_index(drop=True)

In [31]:
def objective(trial, X, y):
    param_grid = {
        "boosting": trial.suggest_categorical("boosting", ["dart", "gbdt"]),
        "is_unbalance": trial.suggest_categorical("is_unbalance", [True]),
        "n_estimators": trial.suggest_categorical("n_estimators", [100, 250, 500, 750]),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.15),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10, log=True),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.05
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.05
        ),
        "min_child_samples": trial.suggest_int(
            "min_child_samples", 5, 100
        ),
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=3136)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgb.LGBMClassifier(objective="binary", **param_grid, verbose_eval=100)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="auc",
            callbacks=[
                LightGBMPruningCallback(trial, "auc")
            ],
            # Add a pruning callback
        )
        preds = model.predict_proba(X_test)[:,1]
        #cv_scores[idx] = roc_auc_score(y_test, preds)
        cv_scores[idx] = (roc_auc_score(y_test, preds) + balanced_accuracy_score(y_test, preds>0.5))/2

    return np.mean(cv_scores)

In [32]:
study = optuna.create_study(
    direction="maximize", 
    study_name="LGBM Classifier", 
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=30, n_min_trials=30)
)
func = lambda trial: objective(trial, X, y)
# study.optimize(func, timeout=3600)
# best_params = study.best_params

[32m[I 2022-01-21 19:55:16,622][0m A new study created in memory with name: LGBM Classifier[0m


In [45]:
#best_params = study.best_params
best_params = {'boosting': 'dart',
 'is_unbalance': True,
 'n_estimators': 250,
 'learning_rate': 0.02713490913717809,
 'max_depth': 6,
 'lambda_l1': 0.014005522389993892,
 'lambda_l2': 3.354264156762259e-07,
 'bagging_fraction': 0.8500000000000001,
 'bagging_freq': 1,
 'feature_fraction': 0.25,
 'min_child_samples': 7}
best_params

{'boosting': 'dart',
 'is_unbalance': True,
 'n_estimators': 250,
 'learning_rate': 0.02713490913717809,
 'max_depth': 6,
 'lambda_l1': 0.014005522389993892,
 'lambda_l2': 3.354264156762259e-07,
 'bagging_fraction': 0.8500000000000001,
 'bagging_freq': 1,
 'feature_fraction': 0.25,
 'min_child_samples': 7}

In [46]:
model = lgb.LGBMClassifier(objective="binary", **best_params, verbose_eval=100, importance_type="gain")
model.fit(
    X_full,
    y_full,
    #eval_set=[(X_val, y_val)],
    eval_metric="auc",
)
preds = model.predict_proba(X_val)[:,1]
auc = roc_auc_score(y_val, preds)
bacc = balanced_accuracy_score(y_val, preds>0.5)
print(f"auc-roc score: {auc:.5f}, balanced acc: {bacc:.5f}")

auc-roc score: 0.93778, balanced acc: 0.85812


In [47]:
sorted(zip(model.feature_importances_/model.feature_importances_.sum(), model.feature_name_), reverse=True)

[(0.20591089153125408, 'oob_feature_mean'),
 (0.09988200791129212, 'search_ProdGender_Kadın'),
 (0.07923547970470482, 'visit_ProdGender_Kadın'),
 (0.0696629494951269, 'oob_feature_max'),
 (0.0487770832401285, 'search_ProdGender_Erkek'),
 (0.04593527847175947, 'visit_ProdGender_Erkek'),
 (0.03734824190051716, 'oob_feature_min'),
 (0.024330354907492312, 'favorite_ProdGender_Kadın'),
 (0.01770034593437506, 'basket_ProdGender_Kadın'),
 (0.01563831204083272, 'n_obs_visit'),
 (0.013067775281223774, 'oob_feature_std'),
 (0.01126804896234294, 'favorite_count'),
 (0.010657693044678484, 'visit_Cat1_Giyim'),
 (0.010025721506253941, 'total'),
 (0.00938475295917037, 'search_AvgSellingPrice'),
 (0.008043126849931217, 'order_count'),
 (0.007997540646140486, 'visit_Cat1_Ev_&_Mobilya'),
 (0.00796941164133517, 'n_obs_favorite'),
 (0.007969065677833514, 'search_Cat1_Ev_&_Mobilya'),
 (0.007536644323385169, 'search_Cat1_Giyim'),
 (0.007202642373680103, 'visit_AvgSellingPrice'),
 (0.007028298096678324, 'sea

In [48]:
oob_preds = preds
oob_perf = (auc + bacc)/2
oob_cols = [x for x in X if "oob_feature" in x]

In [49]:
oob_perf

0.8979486822891316

In [50]:
study = optuna.create_study(
    direction="maximize", 
    study_name="LGBM Classifier", 
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=30, n_min_trials=30)
)
# func = lambda trial: objective(trial, X.drop(columns = oob_cols), y)
# study.optimize(func, timeout=20*60)
# best_params = study.best_params

best_params ={'boosting': 'gbdt',
 'is_unbalance': True,
 'n_estimators': 100,
 'learning_rate': 0.029226424058146096,
 'max_depth': 10,
 'lambda_l1': 0.007844515793696284,
 'lambda_l2': 4.1092202807509865e-06,
 'bagging_fraction': 0.9000000000000001,
 'bagging_freq': 1,
 'feature_fraction': 0.35000000000000003,
 'min_child_samples': 44}

[32m[I 2022-01-21 20:31:14,054][0m A new study created in memory with name: LGBM Classifier[0m


In [51]:
model2 = lgb.LGBMClassifier(objective="binary", **best_params, verbose_eval=100, importance_type="gain")
model2.fit(
    X_full.drop(columns = oob_cols),
    y_full,
    #eval_set=[(X_val, y_val)],
    eval_metric="auc",
)
preds = model2.predict_proba(X_val.drop(columns=oob_cols))[:,1]
auc = roc_auc_score(y_val, preds)
bacc = balanced_accuracy_score(y_val, preds>0.5)
print(f"auc-roc score: {auc:.5f}, balanced acc: {bacc:.5f}")

auc-roc score: 0.93262, balanced acc: 0.85812


In [52]:
sorted(zip(model2.feature_importances_/model2.feature_importances_.sum(), model2.feature_name_), reverse=True)

[(0.1885636128615981, 'search_ProdGender_Kadın'),
 (0.16029246025140453, 'visit_ProdGender_Kadın'),
 (0.09629033420010959, 'search_ProdGender_Erkek'),
 (0.055812197289912845, 'visit_ProdGender_Erkek'),
 (0.027673174420600074, 'favorite_ProdGender_Kadın'),
 (0.025428739522235656, 'basket_ProdGender_Kadın'),
 (0.023330051761247753, 'n_obs_visit'),
 (0.01728185932791345, 'search_Cat1_Ev_&_Mobilya'),
 (0.015171858628262046, 'search_Cat1_Giyim'),
 (0.013599376663178312, 'visit_Cat1_Ev_&_Mobilya'),
 (0.01344596005177324, 'visit_Cat1_Elektronik'),
 (0.013011883266302832, 'favorite_count'),
 (0.012815923249399088, 'visit_Cat1_Giyim'),
 (0.01112848545236678, 'visit_AvgSellingPrice'),
 (0.01053067703381499, 'n_obs_favorite'),
 (0.010401263811687407, 'search_AvgSellingPrice'),
 (0.009916068656624139, 'order_count'),
 (0.009333500477236285, 'search_Cat1_Elektronik'),
 (0.00916559779101658, 'favorite_ActionPeriod_noon'),
 (0.008956783294239908, 'search_ProdGender_Unisex'),
 (0.00889858582242083, 'v

In [53]:
normal_preds = preds
normal_perf = (auc + bacc)/2
model_coef = oob_perf/(oob_perf + normal_perf)
print(model_coef)
preds = oob_preds*model_coef + normal_preds*(1-model_coef)
auc = roc_auc_score(y_val, preds)
bacc = balanced_accuracy_score(y_val, preds>0.5)
print(f"auc-roc score: {auc:.5f}, balanced acc: {bacc:.5f}")
print((auc + bacc)/2)

0.5007187940158339
auc-roc score: 0.93645, balanced acc: 0.85977
0.8981138307489048


In [54]:
X_test = (
    submission_features
    .sort_values("unique_id")
    .reset_index(drop=True)
    .drop(columns=["unique_id"])
)
X_test[categorical_features] = X_test[categorical_features].astype("category")
submission_probs = model.predict_proba(X_test)[:,1]*model_coef + model2.predict_proba(X_test.drop(columns=oob_cols))[:,1]*(1-model_coef)
submission_str = ",".join(str(round(x, 6)) for x in submission_probs)

with open(f"./{round((auc+bacc)/2,5)}_submission.txt", "w") as file:
    file.write(submission_str)

In [51]:
len(submission_probs)

2380

In [35]:
best_params

{'boosting': 'dart',
 'is_unbalance': True,
 'n_estimators': 250,
 'learning_rate': 0.02713490913717809,
 'max_depth': 6,
 'lambda_l1': 0.014005522389993892,
 'lambda_l2': 3.354264156762259e-07,
 'bagging_fraction': 0.8500000000000001,
 'bagging_freq': 1,
 'feature_fraction': 0.25,
 'min_child_samples': 7}

In [None]:
# 1) Long formatta train oob tahminleri diger sete ekleme
# 2) Az observationlar yaniltiyor olabilir. Az observationlari aggreden cikarip baska sekilde handle etme. User segmentation kategorisi eklenebilir.
# 3) Product gender doldurmayi daha iyi nasil yapariz?
# 4) Xgboost da denenebilir
# 5

# {'boosting': 'dart',
#  'is_unbalance': True,
#  'n_estimators': 250,
#  'learning_rate': 0.02713490913717809,
#  'max_depth': 6,
#  'lambda_l1': 0.014005522389993892,
#  'lambda_l2': 3.354264156762259e-07,
#  'bagging_fraction': 0.8500000000000001,
#  'bagging_freq': 1,
#  'feature_fraction': 0.25,
#  'min_child_samples': 7}

# {'boosting': 'dart',
#  'is_unbalance': True,
#  'n_estimators': 100,
#  'learning_rate': 0.10872294701549981,
#  'max_depth': 3,
#  'lambda_l1': 0.0018230139457142034,
#  'lambda_l2': 1.2063254020860439e-05,
#  'bagging_fraction': 0.75,
#  'bagging_freq': 1,
#  'feature_fraction': 0.9000000000000001,
#  'min_child_samples': 31}