In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored under MyDrive can be read by path.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd

def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal id strings to integers.

    Only the last 16 characters of each string are used; each one is parsed
    by ``hex_id_to_int``.
    """
    hex_tails = series.str[-16:]
    return hex_tails.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the trailing 16 characters of a hexadecimal string as an int.

    The parameter name shadows the built-in ``str``; it is kept so existing
    keyword callers continue to work.
    """
    trailing_digits = str[-16:]
    return int(trailing_digits, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to 32-bit integers."""
    int32_ids = series.astype(dtype='int32')
    return int32_ids

def article_id_int_to_str(series):
    """Render article ids as strings with a single '0' prepended."""
    id_text = series.astype('str')
    zero_prefixed = '0' + id_text
    return zero_prefixed

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, per column position, the values whose count is strictly
    greater than ``min_examples``. ``transform`` replaces each value with its
    position in that list; values that are not in the list are encoded as -1
    (the code ``pd.Categorical`` uses for values outside its categories).

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only if it occurs more than this many
        times in the fitted column.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of kept values per fitted column, filled in by fit().
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of each column of ``X``.

        ``y`` is accepted and ignored so the transformer also works through
        ``fit_transform(X, y)`` and inside scikit-learn pipelines.

        Returns
        -------
        self
        """
        # Build a fresh list on every call. Appending to the list created in
        # __init__ meant a second fit() left the first fit's categories at the
        # front, and transform() (which indexes by column position) kept using them.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a DataFrame of category codes with the same column names as ``X``.

        Columns are matched to the fitted categories by position. The result
        is built from plain code arrays, so it has a default index rather
        than the index of ``X``.
        """
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

In [None]:
# Load the prepared dataset from the shared Drive project folder into `data`.
data = pd.read_parquet("/content/drive/MyDrive/CS 7641 Machine Learning Group Project/lgbmknndata.parquet")

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive so the parquet and CSV files under MyDrive can be read.
drive.mount('/content/drive')

from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from lightgbm.sklearn import LGBMRanker

def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal id strings to integers.

    Only the last 16 characters of each string are used; each one is parsed
    by ``hex_id_to_int``.
    """
    hex_tails = series.str[-16:]
    return hex_tails.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the trailing 16 characters of a hexadecimal string as an int.

    The parameter name shadows the built-in ``str``; it is kept so existing
    keyword callers continue to work.
    """
    trailing_digits = str[-16:]
    return int(trailing_digits, base=16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to 32-bit integers."""
    int32_ids = series.astype(dtype='int32')
    return int32_ids

def article_id_int_to_str(series):
    """Render article ids as strings with a single '0' prepended."""
    id_text = series.astype('str')
    zero_prefixed = '0' + id_text
    return zero_prefixed

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    ``fit`` records, per column position, the values whose count is strictly
    greater than ``min_examples``. ``transform`` replaces each value with its
    position in that list; values that are not in the list are encoded as -1
    (the code ``pd.Categorical`` uses for values outside its categories).

    Parameters
    ----------
    min_examples : int, default 0
        A value is kept as a category only if it occurs more than this many
        times in the fitted column.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of kept values per fitted column, filled in by fit().
        self.categories = []

    def fit(self, X, y=None):
        """Learn the category list of each column of ``X``.

        ``y`` is accepted and ignored so the transformer also works through
        ``fit_transform(X, y)`` and inside scikit-learn pipelines.

        Returns
        -------
        self
        """
        # Build a fresh list on every call. Appending to the list created in
        # __init__ meant a second fit() left the first fit's categories at the
        # front, and transform() (which indexes by column position) kept using them.
        learned = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            learned.append(vc[vc > self.min_examples].index.tolist())
        self.categories = learned
        return self

    def transform(self, X):
        """Return a DataFrame of category codes with the same column names as ``X``.

        Columns are matched to the fitted categories by position. The result
        is built from plain code arrays, so it has a default index rather
        than the index of ``X``.
        """
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)

# Load the prepared dataset from the shared Drive project folder.
data = pd.read_parquet("/content/drive/MyDrive/CS 7641 Machine Learning Group Project/lgbmknndata.parquet")
# data = pd.read_parquet("lgbmknndata.parquet")

# Train on every week except 104 and 105; week 104 is held out as the test week.
# The rows are sorted by (week, customer_id) because LGBMRanker's `group` argument
# is a list of consecutive group sizes: the first size covers the first rows, the
# next size the following rows, and so on. groupby() reports sizes in sorted key
# order, so the rows must be in that same order or sizes get matched to the wrong rows.
# Sorting is harmless if the data is already ordered this way.
train = data[(data.week != 104) & (data.week != 105)].sort_values(['week', 'customer_id'])
test = data[data.week==104].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

# One entry per (week, customer_id) group. size() counts rows, whereas count() on
# article_id would skip null ids and make the sizes sum to less than len(train).
train_baskets = train.groupby(['week', 'customer_id']).size().values

columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

train_X = train[columns_to_use]
train_y = train['purchased']

# Baseline: 3-nearest-neighbours classifier on the same features and labels.
neighbors1 = KNeighborsClassifier(n_neighbors=3)
neighbors1 = neighbors1.fit(
    train_X,
    train_y,
)

# LambdaRank model; each (week, customer_id) group is one ranking query.
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    learning_rate=0.03,
    verbose=10
)
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

# Print each feature's share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i]/total_importance)


Mounted at /content/drive


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.797727
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.120156
[LightGBM] [Debug] init for col-wise cost 0.172728 seconds, init for row-wise cost 1.025744 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.619236 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1138
[LightGBM] [Info] Number of data points in the train set: 4844840, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
bestseller_rank 0.9991608123407203
age 0.00027114816482595445
garment_group_no 0.0001618962712558173
article_id 0.00010371299962208835
Active 5.480771028614211e-05
department_no 4.870119202285968e-05
postal_code 4.562004276840647e-05
colour_group_code 3.5326514992692575e-05
pr

In [None]:
# Test KNN and LGBM
directory = '/content/drive/MyDrive/CS 7641 Machine Learning Group Project/'
testfiles = ['test1.csv', 'test2.csv', 'test3.csv', 'test4.csv', 'test5.csv', 'test6.csv']


def _ranked_articles_by_customer(scored):
    """Map each customer_id to its article_ids ordered by descending 'preds'."""
    return scored \
        .sort_values(['customer_id', 'preds'], ascending=False) \
        .groupby('customer_id')['article_id'].apply(list).to_dict()


def _prediction_strings(customer_ids, ranked_articles, fallback, top_k=12):
    """Build one space-separated prediction string per customer id.

    Each customer's ranked articles are followed by the `fallback` list and
    cut to `top_k` entries; every article id is written with a '0' prepended.
    Customers without ranked articles get the fallback list only.
    """
    lines = []
    for c_id in customer_ids:
        picks = (ranked_articles.get(c_id, []) + fallback)[:top_k]
        lines.append(' '.join(['0' + str(p) for p in picks]))
    return lines


# The 12 most frequent articles of week 103 pad every prediction list.
# This does not depend on the test file, so it is computed once.
bestsellinglastweekdf = data[data.week == 103]
bestsellinglastweek = bestsellinglastweekdf['article_id'].value_counts().head(12).index.tolist()

for testf in testfiles:
    print("predicitng for testf", directory+testf)

    # Read the file once: the original hex ids go into the submission, and the
    # integer form is only used for lookups, so the file frame is never mutated.
    cust_to_pred = pd.read_csv(directory+testf)
    customers = cust_to_pred[['customer_id']].drop_duplicates()
    customer_int_ids = customer_hex_id_to_int(customers['customer_id'])

    # .copy() makes testdata an independent frame, so adding the 'preds' column
    # below no longer raises SettingWithCopyWarning.
    testdata = test[test["customer_id"].isin(customer_int_ids)].copy()

    test_X = testdata[columns_to_use]

    knnpred = testdata.copy()

    testdata['preds'] = ranker.predict(test_X)

    # Column 1 of predict_proba is used as the KNN score.
    knnpred['preds'] = neighbors1.predict_proba(test_X)[:,1]

    c_id2predicted_article_ids1 = _ranked_articles_by_customer(testdata)
    c_id2predicted_article_ids2 = _ranked_articles_by_customer(knnpred)

    preds = _prediction_strings(customer_int_ids, c_id2predicted_article_ids1, bestsellinglastweek)
    predsknn = _prediction_strings(customer_int_ids, c_id2predicted_article_ids2, bestsellinglastweek)

    knncustomers = customers.copy()
    customers['prediction'] = preds
    knncustomers['prediction'] = predsknn

    customers.to_csv(directory+'lgbm_submission_'+testf, index=False)
    knncustomers.to_csv(directory+'knn_submission_'+testf, index=False)

predicitng for testf /content/drive/MyDrive/CS 7641 Machine Learning Group Project/test1.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['preds'] = ranker.predict(test_X)


predicitng for testf /content/drive/MyDrive/CS 7641 Machine Learning Group Project/test2.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['preds'] = ranker.predict(test_X)


predicitng for testf /content/drive/MyDrive/CS 7641 Machine Learning Group Project/test3.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['preds'] = ranker.predict(test_X)


predicitng for testf /content/drive/MyDrive/CS 7641 Machine Learning Group Project/test4.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  testdata['preds'] = ranker.predict(test_X)
