In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file whose name ends in "csv".
for root, _subdirs, names in os.walk('/kaggle/input'):
    csv_names = [name for name in names if name.endswith("csv")]
    for csv_name in csv_names:
        print(os.path.join(root, csv_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load data

In [3]:
# article_id is read as a string in both files that carry it, so the ids are kept
# exactly as written in the CSVs instead of being parsed as numbers.
transactions = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={"article_id": "str"},
)
customers = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv",
)
articles = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv",
    dtype={"article_id": "str"},
)

### Sample data, based on https://www.kaggle.com/code/paweljankiewicz/hm-create-dataset-samples/notebook

In [4]:
# Fixed seed so that the same customers are drawn on every run; without it the
# saved sample files change each time the notebook is re-executed.
SAMPLE_SEED = 42

# For each sampling fraction: draw customers, keep only their transactions, keep only
# the articles that appear in those transactions, and write the three frames to
# gzip-compressed CSVs (compression is inferred from the ".gz" suffix).
for sample_repr, sample in [("01", 0.001), ("1", 0.01), ("5", 0.05)]:
    print(sample)
    customers_sample = customers.sample(
        n=int(customers.shape[0] * sample),
        replace=False,
        random_state=SAMPLE_SEED,
    )
    customers_sample_ids = set(customers_sample["customer_id"])
    transactions_sample = transactions[transactions["customer_id"].isin(customers_sample_ids)]
    articles_sample_ids = set(transactions_sample["article_id"])
    articles_sample = articles[articles["article_id"].isin(articles_sample_ids)]
    customers_sample.to_csv(f"customers_sample{sample_repr}.csv.gz", index=False)
    transactions_sample.to_csv(f"transactions_train_sample{sample_repr}.csv.gz", index=False)
    articles_sample.to_csv(f"articles_train_sample{sample_repr}.csv.gz", index=False)

## Trending products weekly

### Import packages

In [5]:
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import seaborn as sns

from datetime import datetime, timedelta

import warnings
warnings.filterwarnings("ignore")

In [6]:
# Size of the general "most popular" article list selected with nlargest(N) further down.
N = 12

In [7]:
# Column/dtype/memory summary of the sample left over from the last iteration of the
# sampling loop (the 0.05 fraction), not of the full transactions frame.
transactions_sample.info()

Reduce memory usage by keeping only the last 16 characters of `customer_id`

In [8]:
# Work on an explicit copy of the three needed columns so that the column assignment
# below modifies `df` itself and never depends on pandas' view-vs-copy rules
# (the original assigned through attribute access on an un-copied selection).
df = transactions[['t_dat', 'customer_id', 'article_id']].copy()

# Keep only the last 16 characters of each customer id; the submission cell applies
# the same truncation before joining predictions back to the full ids.
df['customer_id'] = df['customer_id'].str[-16:]

In [9]:
# Convert the raw date column to datetimes so that date arithmetic works in later cells.
df['t_dat'] = df['t_dat'].pipe(pd.to_datetime)

# The most recent transaction date is the reference point for the recency calculations.
last_ts = df.t_dat.max()

In [10]:
tmp = df[['t_dat']].copy()

# Day of week of each transaction (Monday=0 ... Sunday=6).
tmp['dow'] = tmp['t_dat'].dt.dayofweek

# Label every transaction with the Tuesday that closes its week ("ldbw"):
# Monday moves forward one day, Tuesday stays, Wednesday..Sunday move forward to the
# next Tuesday. (1 - dow) % 7 is exactly that number of days (1, 0, 6, 5, 4, 3, 2).
# pd.to_timedelta replaces pd.TimedeltaIndex(..., unit='D'), whose `unit` argument
# is deprecated in recent pandas releases.
tmp['ldbw'] = tmp['t_dat'] + pd.to_timedelta((1 - tmp['dow']) % 7, unit='D')

# tmp was built from df, so rows line up positionally.
df['ldbw'] = tmp['ldbw'].values

## Count number of transactions per week

In [11]:
# Number of transactions of each article in each week bucket, as a flat frame with
# columns ldbw, article_id and count.
weekly_sales = (
    df.groupby(['ldbw', 'article_id'])['t_dat']
      .count()
      .reset_index(name='count')
)

In [12]:
# Attach to every transaction the number of sales of its article in the same week.
df = pd.merge(df, weekly_sales, how='left', on=['ldbw', 'article_id'])

Let's assume that in the target week sales will be similar to those in the last week of the training data.

In [13]:
# Week bucket of the most recent transactions. Taking it from the bucket column itself
# (rather than comparing against last_ts) gives the same week whenever the last
# transaction date is a Tuesday, and still selects the final week when it is not.
target_week = weekly_sales['ldbw'].max()

# Per-article sales count in that final week, renamed so it sits next to the
# per-week 'count' column already present on df.
target_counts = (
    weekly_sales.loc[weekly_sales['ldbw'] == target_week, ['article_id', 'count']]
    .rename(columns={'count': 'count_targ'})
)

# Inner join, as before: transactions of articles with no sales in the final week
# are dropped from df.
df = df.merge(target_counts, on='article_id', how='inner')

# Plain assignment instead of fillna(..., inplace=True) on a column selection, which is
# chained assignment and has no effect under pandas copy-on-write. With the inner
# join no missing values are expected; this only guards against them.
df['count_targ'] = df['count_targ'].fillna(0)

del weekly_sales, target_counts

Sales rate adjusted for changes in product popularity

In [14]:
# Ratio of the article's final-week sales to its sales in the week of the transaction.
df['quotient'] = df['count_targ'].div(df['count'])

Take supposedly popular products

In [15]:
# Score every article by summing its quotient over all of its transactions.
target_sales = df.groupby('article_id')['quotient'].sum()

# The N best-scoring article ids, as strings, plus the same ids joined by single spaces.
general_pred = list(map(str, target_sales.nlargest(N).index))
general_pred_str = ' '.join(general_pred)
del target_sales

In [16]:
# Show the list of top-N article ids.
general_pred

In [17]:
# Show the same ids as one space-separated string.
general_pred_str

**Purchase dictionary**

In [18]:
# Kept only so the name stays defined; nothing in this cell fills or reads it.
purchase_dict = {}

# Recency weight y(x) = a / sqrt(x) + b * exp(-c * x) - d, with x the age of a
# transaction in days.
a, b, c, d = 2.5e4, 1.5e5, 2e-1, 1e3

# Age of every transaction relative to the most recent date, floored at 1 day so the
# square root below never divides by zero.
days_ago = (last_ts - df['t_dat']).dt.days.clip(lower=1)

# Negative weights (old transactions) are clipped to 0.
recency_weight = (a / np.sqrt(days_ago) + b * np.exp(-c * days_ago) - d).clip(lower=0)

# Only the two key columns and the score are needed, so avoid copying the whole frame.
tmp = df[['customer_id', 'article_id']].assign(value=df['quotient'] * recency_weight)
del days_ago, recency_weight

# Total score of each (customer, article) pair.
tmp = tmp.groupby(['customer_id', 'article_id'], as_index=False)['value'].sum()

# Drop weak pairs, then keep each customer's N best-ranked articles. Dense ranking
# means tied scores share a rank, so a customer can end up with more than N rows.
tmp = tmp.loc[tmp['value'] > 100].copy()
tmp['rank'] = tmp.groupby('customer_id')['value'].rank(method='dense', ascending=False)
tmp = tmp.loc[tmp['rank'] <= N]

# One row per customer: article ids ordered by descending score and joined by spaces.
# str.join replaces the builtin-sum string concatenation, which is quadratic in the
# group size and relies on deprecated handling of builtin callables in agg.
purchase_df = (
    tmp.sort_values(['customer_id', 'value'], ascending=False)
       .assign(prediction=lambda frame: frame['article_id'].astype(str))
       .groupby('customer_id')['prediction']
       .agg(' '.join)
       .reset_index()
)


## Submission

In [19]:
# Only the customer ids are needed from the sample submission.
sub = pd.read_csv('../input/h-and-m-personalized-fashion-recommendations/sample_submission.csv',
                  usecols=['customer_id'],
                  dtype={'customer_id': 'string'})

# purchase_df carries customer ids truncated to their last 16 characters, so build
# the same key here for the join.
sub['customer_id2'] = sub['customer_id'].str[-16:]

sub = sub.merge(purchase_df, left_on='customer_id2', right_on='customer_id', how='left',
                suffixes=('', '_ignored'))

# Customers without personal predictions start from the general list; the general
# list is then appended for everyone so short personal lists get padded.
sub['prediction'] = sub['prediction'].fillna(general_pred_str) + ' ' + general_pred_str

# Keep the first N ids by splitting on whitespace instead of cutting the string at a
# hard-coded 131 characters, which is only correct for N = 12 and 10-character ids.
sub['prediction'] = sub['prediction'].str.split().str[:N].str.join(' ')

sub = sub[['customer_id', 'prediction']]
sub.to_csv('submission.csv.gz', compression="gzip", index=False)

In [20]:
# Display the final submission frame (columns customer_id and prediction).
sub

In [None]:
# NOTE(review): bare `git` shell call with no subcommand in a cell that was never run;
# looks like leftover scratch and does not feed the analysis. Confirm and remove.
!git