In [8]:
import sys
# caution: path[0] is reserved for script path (or '' in REPL)
sys.path.insert(1, '/Users/hjain/toy_dataset_ranking/kaggle_template/')

from boilerplate import *
%matplotlib inline

# Raise pandas' display limits (rows / columns rendered before output is truncated).
pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 100)

In [None]:
# Load the raw dataset into `data`; replace the path below with the actual file path.
data = pd.read_csv("/Users/hjain/Downloads/test_data.csv")
# Cell output: (n_rows, n_columns) of the loaded frame.
data.shape

In [None]:
# Preview the first 10 rows of the raw frame.
data.head(10)

In [None]:
# Rename any columns if needed (mapping is old name -> new name); `data` is modified in place.
column_renames = {"p": "a", "q": "b"}
data.rename(columns=column_renames, inplace=True)

In [None]:
# `summarize_data` is not defined in this notebook; presumably it is provided by the
# star import from `boilerplate` and prints/returns an overview of the frame - verify there.
summarize_data(data)

If pandas did not already parse the timestamp column(s) as a datetime type, convert them explicitly:

In [1]:
# data["timestamp_col"] = pd.to_datetime(data['timestamp_col'])

For ranking model evaluation, we probably want to split train-test data to reflect historical data vs future data.\
=> sort based on timestamps

In [None]:
# Sort the frame in place by the timestamp column (ascending by default), so that
# earlier rows come first. "timestamp_col" looks like a placeholder name - adjust as needed.
data.sort_values(by="timestamp_col", inplace=True)

Mark target column

In [1]:
# Name of the label column that the models in this notebook predict.
TARGET_COL = "purchase"

In [None]:
# `split_df_x_y` is not defined in this notebook (presumably from `boilerplate`).
# NOTE(review): assumed to return (features without TARGET_COL, target column) - confirm.
X, y = split_df_x_y(data, TARGET_COL)

Drop any columns that don't seem useful

In [None]:
# Feature frame without the columns judged not useful; `X` itself is left untouched.
X_ = X.drop(columns=["col1", "col2", "timestamp_col"])

In [None]:
# Summary statistics of the continuous features.
X_.describe()  # by default, describe() only covers numeric (continuous) columns

In [None]:
# Visualise the distribution of the target to get a feel for it and to spot potential outliers.

# Alternative for a continuous target (kept disabled):
# plt.boxplot(data[TARGET_COL], vert=False)

# Discrete target: histogram of the label values.
y.hist()

Train-test split that reflects historical vs. future data (this relies on timestamps being available and on the sort performed above).

In [None]:
# Chronological hold-out: the first 80% of the rows form the training set and the
# remaining rows form the test set.
# Random alternative (kept disabled):
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
TRAIN_FRACTION = 0.8
# Slice bounds must be integers; a float bound such as 0.8 * len(X_) raises TypeError.
TRAIN_SIZE = int(TRAIN_FRACTION * len(X_))
# .iloc makes the positional (row-order) slicing explicit, independent of the index labels.
X_train, y_train = X_.iloc[:TRAIN_SIZE], y.iloc[:TRAIN_SIZE]
X_test, y_test = X_.iloc[TRAIN_SIZE:], y.iloc[TRAIN_SIZE:]

#### imputation of missing values, encoding simple categorical features --> scaling

In [None]:
# Categorical branch: fill missing values with the constant 'NA', then one-hot encode.
# handle_unknown='ignore' lets transform() accept categories that were not seen during fit.
cat_preprocessing = make_pipeline(
    SimpleImputer(fill_value='NA', strategy='constant'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Continuous branch: SimpleImputer with its default settings, followed by standard scaling.
cont_preprocessing = make_pipeline(SimpleImputer(), StandardScaler())

#### map the preprocessing logic to the categorical vs continuous features

In [None]:
# Derive the categorical column list in this cell so that it runs on a fresh kernel;
# otherwise `cat_features` is first assigned only in a later section of the notebook.
# NOTE(review): `separate_cont_cat` is not defined in this notebook (presumably from
# `boilerplate`); it is assumed to return (continuous_columns, categorical_columns),
# which is how its result is unpacked elsewhere in this notebook - confirm.
cont_features, cat_features = separate_cont_cat(X_)

# Categorical columns go through `cat_preprocessing`; every remaining column goes
# through `cont_preprocessing`.
col_transformer = make_column_transformer(
    (cat_preprocessing, cat_features),
    remainder=cont_preprocessing,
)

#### wire the preprocessing pipeline

In [None]:
# Pick the categorical columns by dtype ('object') instead of by an explicit name list;
# all remaining columns are handled by the continuous branch.
object_columns = make_column_selector(dtype_include='object')
preprocess = make_column_transformer(
    (cat_preprocessing, object_columns),
    remainder=cont_preprocessing,
)

In [None]:
# Full model: the preprocessing column transformer followed by a logistic regression
# with default hyper-parameters.
lr_pipe = make_pipeline(preprocess, LogisticRegression())

Getting embeddings: encode the listing titles and the queries, then add their dot-product similarity as a feature.

In [None]:
# Embed all listing titles and all queries, one batched call each (titles first).
# NOTE(review): `model` is not defined in any visible cell - presumably an embedding
# model exposing `.encode(list_of_str)`; confirm where it is created.
listing_title_embeddings = model.encode(X_["listing_title"].tolist())
query_embeddings = model.encode(X_["query"].tolist())

# Row-wise dot product between each title embedding and its query embedding.
# NOTE(review): this equals cosine similarity only if the embeddings are unit-normalised.
elementwise_products = listing_title_embeddings * query_embeddings
X_["title_query_similarity"] = np.sum(elementwise_products, axis=1)

In [None]:
# Grid search example: tune the logistic regression's C over 5 geometrically spaced
# values in [0.1, 2], scored by recall with 5-fold cross-validation.
GRID_SEED = 42  # fixes the shuffled fold assignment so that re-runs give the same scores
grid = GridSearchCV(
    lr_pipe,
    param_grid={"logisticregression__C": np.geomspace(0.1, 2, num=5)},
    scoring="recall",
    cv=KFold(n_splits=5, shuffle=True, random_state=GRID_SEED),
)
# The search must be fitted before `grid.cv_results_` exists.
grid.fit(X_train, y_train)

In [None]:
# Plot the mean cross-validated test score against the regularisation parameter C.
cv_results = pd.DataFrame(grid.cv_results_)
# The param_* columns are stored as generic objects; cast to float for a numeric axis.
lr_regularization_inv = cv_results["param_logisticregression__C"].astype(float)

fig, ax = plt.subplots()
ax.plot(lr_regularization_inv, cv_results['mean_test_score'], marker='o')
ax.set_xscale("log")  # the C grid is geometrically spaced
ax.set(
    title="Grid Search Results for the Classifier",
    xlabel="C (inverse regularization strength)",
    ylabel="Mean cross-validated test score",
);

### `LGBMRanker()` Interface
- feature vector $x_i$ contains query+item information
- label $y_i$ is the relevance label, which can be either binary or graded (rating)

In [4]:
from sklearn.datasets import load_svmlight_file

In [5]:
# Directory holding the ranking dataset files; the trailing slash matters because
# file names are appended by plain string concatenation.
DATA_PATH = "/Users/hjain/toy_dataset_ranking/MSLR-WEB10K/"

In [6]:
# Load features, relevance labels and per-row query ids from the svmlight file.
# NOTE(review): the *training* arrays are read from 'vali.txt' - confirm this is
# intentional (e.g. a smaller file for a quick demo). This also rebinds the
# X_train / y_train names used in the earlier section.
X_train, y_train, qid_train = load_svmlight_file(DATA_PATH + 'vali.txt', query_id=True)

In [7]:
# (n_rows, n_features) of the loaded training matrix.
X_train.shape

(235259, 136)

In [8]:
# Convert the relevance labels to integers.
y_train = y_train.astype(int)

In [9]:
# Number of rows per query, in ascending query-id order.
x, group_train = np.unique(qid_train, return_counts=True)

# These counts only line up with the row order when the rows are already sorted by
# query id (true for this dataset, which is why this works out of the box). Fail loudly
# instead of silently training on misaligned groups if that ever stops holding.
assert np.all(np.diff(qid_train) >= 0), "rows must be sorted by query id to derive group sizes"

In [10]:
# Inspect the per-query row counts.
group_train

array([93, 58, 84, ..., 71, 70, 81])

In [11]:
from lightgbm import LGBMRanker

# Fit a LightGBM ranker with default hyper-parameters. `group` carries the per-query
# row counts computed above, telling the ranker which consecutive rows share a query.
gbm = LGBMRanker()
gbm.fit(X_train, y_train, group=group_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.029991 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 25546
[LightGBM] [Info] Number of data points in the train set: 235259, number of used features: 136


In [12]:
def _dcg_at_k(relevance, k):
    """Discounted cumulative gain of `relevance` (already in ranked order), cut at `k`."""
    relevance = np.asarray(relevance)[:k]  # k=None keeps every position
    gain = 2 ** relevance - 1
    discounts = np.log2(np.arange(len(relevance)) + 2)  # +2 since positions are 0-indexed
    return np.sum(gain / discounts)


def ndcg(y_score, y_true, k=None, normalize=False):
    """Score one query's ranking with (N)DCG@k.

    Items are ranked by descending `y_score`; each item's gain is ``2**label - 1`` and
    is discounted by ``log2(rank + 1)`` with 1-based ranks.

    Parameters
    ----------
    y_score : array-like of predicted scores for the items of a single query.
    y_true : array-like of relevance labels aligned with `y_score`.
    k : cutoff; only the top-k ranked items contribute. ``None`` uses all items.
    normalize : when False (the default, and the historical behaviour of this function)
        the raw, *unnormalised* DCG@k is returned. When True, the DCG is divided by the
        ideal DCG@k (labels sorted in descending order), giving a true NDCG.

    Returns
    -------
    float
        DCG@k, or NDCG@k if `normalize` is True. A query whose ideal DCG is not positive
        (e.g. all labels are 0) yields 0.0 when normalising, instead of dividing by zero.
    """
    order = np.argsort(y_score)[::-1]
    ranked_labels = np.take(y_true, order)
    dcg = _dcg_at_k(ranked_labels, k)
    if not normalize:
        return dcg

    ideal = _dcg_at_k(np.sort(np.asarray(y_true))[::-1], k)
    return dcg / ideal if ideal > 0 else 0.0

In [13]:
# Load the held-out test split (features, relevance labels, per-row query ids).
X_test, y_test, qid_test = load_svmlight_file(DATA_PATH + 'test.txt', query_id=True)

In [14]:
# Convert the test relevance labels to integers, mirroring the training labels.
y_test = y_test.astype(int)

In [15]:
# Score the test set one query at a time, collecting the raw predictions and a
# per-query NDCG. `ndcg_` is consumed by the next cell but was not built in any cell
# of the notebook, so it is (re)constructed here.
# NOTE(review): the cutoff used for the originally recorded result is unknown;
# k=10 is an assumption - confirm.
NDCG_K = 10

predictions = []
ndcg_ = []

for group in np.unique(qid_test):
    in_group = qid_test == group
    preds = gbm.predict(X_test[in_group])
    predictions.extend(preds)

    labels = y_test[in_group]
    # Normalise by the score of the ideal ordering (labels ranked by themselves).
    ideal = ndcg(labels, labels, NDCG_K)
    # NOTE(review): queries without any relevant row are scored 0.0 here; conventions
    # differ on this (some tools count them as 1.0) - confirm.
    ndcg_.append(ndcg(preds, labels, NDCG_K) / ideal if ideal > 0 else 0.0)

In [29]:
# Average of the values held in `ndcg_` (presumably one NDCG score per test query).
np.mean(ndcg_)

0.4723195111423233

### Simple classification/regression

In [None]:
# Visualise the distribution of the target to spot potential outlier values.

# Discrete target: histogram of the label values.
data[TARGET_COL].hist()

In [None]:
# Column dtypes, non-null counts and memory usage of the frame.
data.info()

In [None]:
# Every column except the target is a candidate feature.
# The column labels are strings, so comparing them with the integer 4 never excluded
# anything and the target leaked into the features; compare against TARGET_COL instead.
cols = data.columns
feature_cols = [col for col in cols if col != TARGET_COL]

In [None]:
# Use the same target column as the rest of the notebook; the integer label 4 does not
# exist in a frame whose columns are named by strings and raises KeyError.
target_col = TARGET_COL
X = data[feature_cols]
y = data[target_col]

In [None]:
# Preview the first rows of the feature frame.
X.head()

In [None]:
# Inspect the target values.
y

In [None]:
# Summary statistics of the continuous features.
X.describe()  # by default, describe() only covers numeric (continuous) columns

In [None]:
# Broadly separate continuous from categorical features.
# NOTE(review): `separate_cont_cat` is not defined in this notebook (presumably from
# `boilerplate`); how it decides what counts as categorical should be checked there.
cont_features, cat_features = separate_cont_cat(X)

In [None]:
# Inspect which columns were classified as continuous.
cont_features

In [None]:
# Inspect which columns were classified as categorical.
cat_features

In [None]:
# Visualise the continuous features.
# `continuous_feature_histograms` is not defined in this notebook (presumably from `boilerplate`).
continuous_feature_histograms(X, cont_features)

In [None]:
# Get statistics on the categorical features.
# `categorical_variables_stats` is not defined in this notebook (presumably from `boilerplate`).
categorical_variables_stats(X, cat_features)

In [None]:
# Random 80/20 hold-out split; the fixed seed makes the split (and every score derived
# from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Continuous columns are standard-scaled; categorical columns are one-hot encoded, with
# handle_unknown='ignore' so that categories unseen during fit do not raise at transform time.
# No imputation step in this variant.
cont_preprocessing = make_pipeline(StandardScaler())
cat_preprocessing = make_pipeline(OneHotEncoder(handle_unknown='ignore'))

In [None]:
# Sanity check: the categorical and continuous lists together should account for every
# training column (the cell displays True when the counts match).
len(X_train.columns) == len(cat_features) + len(cont_features)

In [None]:
# Route the categorical columns through `cat_preprocessing`; every remaining column is
# handled by `cont_preprocessing`.
col_transformer = make_column_transformer(
    (cat_preprocessing, cat_features),
    remainder=cont_preprocessing
)

In [None]:
# Check whether preprocessing leaves any NaN in the transformed training matrix.
transformed_train = col_transformer.fit_transform(X_train)
# A ColumnTransformer returns either a sparse matrix or a dense array depending on the
# density of its output; only the sparse case has (and needs) .toarray().
if hasattr(transformed_train, "toarray"):
    transformed_train = transformed_train.toarray()
# Reduce to a single boolean instead of displaying the whole element-wise mask.
np.isnan(transformed_train).any()

In [None]:
# Full model: column-wise preprocessing followed by a logistic regression with
# default hyper-parameters.
lr_pipe = make_pipeline(col_transformer, LogisticRegression())

In [None]:
# Fit preprocessing and classifier together on the training split.
lr_pipe.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the pipeline on the training split (default folds and
# the estimator's default scorer).
cross_val_score(lr_pipe, X_train, y_train).mean()

In [None]:
# Score of the fitted pipeline on the held-out test split.
lr_pipe.score(X_test, y_test)