In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('/kaggle/input/train-data-ready-frfr/train_actually_ready.csv')
label = pd.read_csv('/kaggle/input/train-data-ready-frfr/mask_to_actual_labeling.csv')

In [3]:
for col in df.columns:
    if col not in label['masked_column'].to_list():
        print(col)

interest
day
dayofweek
hour
is_month_start
f375
f378
id8
avg_trans_amt
offer_ctr


In [4]:
label = label[label['masked_column'].isin(df.columns)]

In [5]:
masked_column=['interest','day','dayofweek','hour','is_month_start','f375','f378','id8','avg_trans_amt','offer_ctr']
Description=['interest score','day','dayofweek','hour','is_month_start','f375','f378','id8','avg_trans_amt','offer_ctr']
Type=['Numerical','Numerical','Categorical','Numerical','Categorical','Numerical','Categorical','Categorical','Numerical','Numerical']
to_add = pd.DataFrame({'masked_column':masked_column,'Description':Description,'Type':Type})
to_add['Type'].unique()

array(['Numerical', 'Categorical'], dtype=object)

In [6]:
label = pd.concat([label,to_add],axis=0)

In [7]:
df.drop(columns=label[label['Type']=='One hot encoded']['masked_column'].to_list(),inplace=True)

In [8]:
from sklearn.preprocessing import LabelEncoder
cat_cols =  label[label['Type']=='Categorical']['masked_column'].to_list()
cat_cols.remove('id3')
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [9]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score

In [10]:
cust_ids = df['id2'].unique()
train_cust, test_cust = train_test_split(cust_ids, test_size=0.2, random_state=42)
train_mask = df['id2'].isin(train_cust)
df_train = df[train_mask]
df_test  = df[~train_mask]

In [11]:
FEATURES = [f for f in df.columns if f not in ['id2','id3','y','id1']]
X_train = df_train[FEATURES]
y_train = df_train['y']
X_test  = df_test[FEATURES]
y_test  = df_test['y']

In [12]:
def make_group(df_sub):
    return df_sub.groupby('id2').size().to_list()

In [13]:
group_train = make_group(df_train)
group_test  = make_group(df_test)

In [14]:
ranker = lgb.LGBMRanker(
    objective='lambdarank',
    metric='ndcg',
    ndcg_at=[7],
    learning_rate=0.05,
    num_leaves=31,
    n_estimators=1000
)

In [15]:
cat_cols =  label[label['Type']=='Categorical']['masked_column'].to_list()
cat_cols.remove('id3')
ranker.fit(
    X_train, y_train,
    group=group_train,
    eval_set=[(X_test, y_test)],
    eval_group=[group_test],
    categorical_feature=cat_cols,
    callbacks=[lgb.early_stopping(50),lgb.log_evaluation(50)]
)



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.293914 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 32804
[LightGBM] [Info] Number of data points in the train set: 619653, number of used features: 158
Training until validation scores don't improve for 50 rounds
[50]	valid_0's ndcg@7: 0.944927
[100]	valid_0's ndcg@7: 0.945862
Early stopping, best iteration is:
[97]	valid_0's ndcg@7: 0.945883


In [16]:
test_df = pd.read_csv('/kaggle/input/test-ready-to-test/test_ready.csv')

In [17]:
test_df.drop(columns=label[label['Type']=='One hot encoded']['masked_column'].to_list(),inplace=True)

In [19]:
cat_cols =  label[label['Type']=='Categorical']['masked_column'].to_list()
cat_cols.remove('id3')
for col in cat_cols:
    le = LabelEncoder()
    test_df[col] = le.fit_transform(test_df[col])

In [20]:
temp_df = df.drop(columns=['y'])

In [21]:
temp_df.shape

(770164, 161)

In [22]:
test_df.shape

(369301, 161)

In [23]:
for i in range(len(temp_df.columns)):
    if temp_df[temp_df.columns[i]].dtypes!=test_df[test_df.columns[i]].dtypes:
        print("haaaaaaaaaaaaaaaa")

In [24]:
X_new = test_df[FEATURES]

In [25]:
ranker.predict(X_new)



array([-2.42347417, -1.12141496,  1.45642558, ..., -1.40725637,
        0.01311022, -1.48131835])

In [26]:
# 1. Get scores
test_df['score'] = ranker.predict(X_new)

# 2. Sort by customer and descending score
test_df = test_df.sort_values(['id2', 'score'], ascending=[True, False])



In [27]:
raw = pd.read_csv('/kaggle/input/test-data-raw/test_data_converted_to_.csv')

In [28]:
test_df['id5'] = raw['id5'].copy()
test_df = test_df[['id1','id2','id3','id5','score']]

In [29]:
test_df.head()

Unnamed: 0,id1,id2,id3,id5,score
82037,1000061_62395_16-23_2023-11-05 09:28:07.805,1000061,62395,2023-11-05,-0.924676
269190,1000061_27945_16-23_2023-11-05 09:28:04.157,1000061,27945,2023-11-05,-1.316955
8135,1000061_403431_16-23_2023-11-05 09:28:10.592,1000061,403431,2023-11-05,-1.34345
288480,1000061_31794_16-23_2023-11-05 09:47:54.852,1000061,31794,2023-11-05,-1.365746
153767,1000061_5420674_16-23_2023-11-05 09:28:04.153,1000061,5420674,2023-11-05,-1.441359


In [30]:
test_df['pred'] = 1

In [31]:
test_df.head()

Unnamed: 0,id1,id2,id3,id5,score,pred
82037,1000061_62395_16-23_2023-11-05 09:28:07.805,1000061,62395,2023-11-05,-0.924676,1
269190,1000061_27945_16-23_2023-11-05 09:28:04.157,1000061,27945,2023-11-05,-1.316955,1
8135,1000061_403431_16-23_2023-11-05 09:28:10.592,1000061,403431,2023-11-05,-1.34345,1
288480,1000061_31794_16-23_2023-11-05 09:47:54.852,1000061,31794,2023-11-05,-1.365746,1
153767,1000061_5420674_16-23_2023-11-05 09:28:04.153,1000061,5420674,2023-11-05,-1.441359,1


In [32]:
test_df.drop(columns=['score'],inplace=True)

In [33]:
test_df = test_df.reset_index()

In [34]:
test_df.head()

Unnamed: 0,index,id1,id2,id3,id5,pred
0,82037,1000061_62395_16-23_2023-11-05 09:28:07.805,1000061,62395,2023-11-05,1
1,269190,1000061_27945_16-23_2023-11-05 09:28:04.157,1000061,27945,2023-11-05,1
2,8135,1000061_403431_16-23_2023-11-05 09:28:10.592,1000061,403431,2023-11-05,1
3,288480,1000061_31794_16-23_2023-11-05 09:47:54.852,1000061,31794,2023-11-05,1
4,153767,1000061_5420674_16-23_2023-11-05 09:28:04.153,1000061,5420674,2023-11-05,1


In [35]:
test_df.drop(columns=['index'],inplace=True)

In [36]:
test_df.head()

Unnamed: 0,id1,id2,id3,id5,pred
0,1000061_62395_16-23_2023-11-05 09:28:07.805,1000061,62395,2023-11-05,1
1,1000061_27945_16-23_2023-11-05 09:28:04.157,1000061,27945,2023-11-05,1
2,1000061_403431_16-23_2023-11-05 09:28:10.592,1000061,403431,2023-11-05,1
3,1000061_31794_16-23_2023-11-05 09:47:54.852,1000061,31794,2023-11-05,1
4,1000061_5420674_16-23_2023-11-05 09:28:04.153,1000061,5420674,2023-11-05,1


In [37]:
test_df.to_csv('submission2.csv',index=False)

In [38]:
df_test['score'] = ranker.predict(X_test)
df_test = df_test.sort_values(['id2', 'score'], ascending=[True, False])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['score'] = ranker.predict(X_test)


In [39]:
df_test = df_test[['id1','id2','id3','score']]
df_test.head()

Unnamed: 0,id1,id2,id3,score
575715,1000043_69026_16-23_2023-11-03 13:59:42.714,1000043,69026,-0.184929
575716,1000043_93516_16-23_2023-11-03 13:59:42.346,1000043,93516,-3.088331
543490,1000237_783808_16-23_2023-11-03 19:57:59.010,1000237,783808,-2.147111
543489,1000237_169639_16-23_2023-11-03 19:57:58.992,1000237,169639,-2.219415
164324,1000328_753828_16-23_2023-11-01 14:28:38.075,1000328,753828,-1.401329


In [40]:
df_test['pred']=1
df_test.drop(columns=['score'],inplace=True)

In [41]:
df_test['actual'] = y_test

In [42]:
def apk(truth,pred,k):
    if len(pred)>k:
        pred = pred[:k]
        truth = truth[:k]
    score = 0.0
    num_hits = 0.0
    total_relevant = sum(truth)
    if total_relevant==0:
        return 0.0
    for i,(p,t) in enumerate(zip(pred,truth),start=1):
        if p==1 and t==1:
            num_hits += 1
            score += num_hits/i
    return score/total_relevant

In [43]:
def mapk(df,k=7):
    grouped = df.groupby('id2')
    ap_scores = []
    decrease_by=0
    for _,group in grouped:
        pred = group['pred'].to_list()
        actual = group['actual'].to_list()
        if sum(actual)!=0:
            ap = apk(actual, pred, k)
            ap_scores.append(ap)
    return sum(ap_scores)/len(ap_scores)

In [44]:
df_test.head()

Unnamed: 0,id1,id2,id3,pred,actual
575715,1000043_69026_16-23_2023-11-03 13:59:42.714,1000043,69026,1,0
575716,1000043_93516_16-23_2023-11-03 13:59:42.346,1000043,93516,1,0
543490,1000237_783808_16-23_2023-11-03 19:57:59.010,1000237,783808,1,0
543489,1000237_169639_16-23_2023-11-03 19:57:58.992,1000237,169639,1,0
164324,1000328_753828_16-23_2023-11-01 14:28:38.075,1000328,753828,1,0


In [45]:
map7_score = mapk(df_test,k=7)
print(map7_score)

0.5768223057644112
