In [14]:
import pandas as pd

### 1. Load the data

In [15]:
# Read the transaction log; article_id is read as text (str) rather than
# letting pandas infer a numeric dtype for it.
transaction_df = pd.read_csv(
    'data/transactions_train.csv',
    dtype={'article_id': str},
)
transaction_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


### 2. Filter by recent purchases

In [16]:
# Keep only the rows whose t_dat is strictly after 2020-09-07.
# NOTE: this rebinds transaction_df, so the unfiltered frame is gone after
# this cell; re-run the load cell to get it back.
transaction_df = transaction_df.loc[
    transaction_df['t_dat'] > '2020-09-07'
]
transaction_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
31256357,2020-09-08,0002cca4cc68601e894ab62839428e5f0696417fe0f9e8...,910601002,0.042356,1
31256358,2020-09-08,0006bb0fad5c49341bd9cece264271e68e01a4e55f22ec...,708138021,0.06778,2
31256359,2020-09-08,0006bb0fad5c49341bd9cece264271e68e01a4e55f22ec...,728156006,0.050831,2
31256360,2020-09-08,0008804a45e7fbc8653ba8f5ce15880cb966ca220c52d2...,857271001,0.050831,2
31256361,2020-09-08,0008804a45e7fbc8653ba8f5ce15880cb966ca220c52d2...,857271001,0.050831,2


### 3. Customer purchases

In [4]:
# Which articles does each customer buy most often?
# One row per (customer_id, article_id) pair with the number of matching
# transactions in purchase_count.
cp = (
    transaction_df
    .groupby(["customer_id", "article_id"])["article_id"]
    .count()
    .rename("purchase_count")
    .reset_index()
)
cp.head(15)

Unnamed: 0,customer_id,article_id,purchase_count
0,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,794321007,1
1,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,448509014,1
2,0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d...,719530003,1
3,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,734592001,1
4,0002cca4cc68601e894ab62839428e5f0696417fe0f9e8...,910601002,1
5,00039306476aaf41a07fed942884f16b30abfa83a2a8be...,624486001,1
6,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,621381012,1
7,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,640021012,1
8,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,827487003,1
9,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,880017001,2


In [5]:
# Order each customer's articles by purchase_count (highest first), keep at
# most 10 rows per customer, then collapse them into one list of article ids
# per customer_id.
# NOTE: cp goes in as a DataFrame and comes out as a Series indexed by
# customer_id, so this cell cannot be re-run on its own output.
cp = (
    cp.sort_values(by=["customer_id", "purchase_count"], ascending=False)
    .groupby("customer_id")
    .head(10)
    .groupby("customer_id")["article_id"]
    .apply(list)
)
cp.head(10)

customer_id
000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318                                         [0794321007]
0000757967448a6cb83efb3ea7a3fb9d418ac7adf2379d8cd0c725276a467a2a                             [0448509014, 0719530003]
0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37e011580a479e80aa94                                         [0734592001]
0002cca4cc68601e894ab62839428e5f0696417fe0f9e84551c6827a7629d441                                         [0910601002]
00039306476aaf41a07fed942884f16b30abfa83a2a8bea972019098d6406793                                         [0624486001]
0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55    [0880017001, 0621381012, 0640021012, 082748700...
00040239317e877c77ac6e79df42eb2633ad38fcac09fc0094e549180ddc201c                             [0875272011, 0875272012]
000493dd9fc463df1acc2081450c9e75ef8e87d5dd17ed6396773839f6bf71a9    [0399136061, 0556255001, 0576897001, 064002101...
000525e3fe01600d717da8423643a8303390a055c578

In [6]:
# Spot-check: the article list stored for one specific customer.
cp.loc['0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf4672f30b3e622fec55']

['0880017001', '0621381012', '0640021012', '0827487003', '0889036004']

### 4. Popular purchases

In [7]:
# The 12 most frequent article ids in the filtered transactions, with counts.
transaction_df['article_id'].value_counts().head(12)

0909370001    1843
0924243001    1570
0918522001    1428
0865799006    1283
0751471001    1219
0448509014    1206
0762846027    1080
0918292001    1057
0923758001    1049
0924243002    1000
0915529003     989
0850917001     928
Name: article_id, dtype: int64

In [8]:
# pp: the 12 most frequent article ids, most frequent first, as a plain list.
pp = transaction_df['article_id'].value_counts().head(12).index.tolist()
print(pp)

['0909370001', '0924243001', '0918522001', '0865799006', '0751471001', '0448509014', '0762846027', '0918292001', '0923758001', '0924243002', '0915529003', '0850917001']


### 5. Prepare submission

In [9]:
# Load the sample submission; its customer_id column is the set of customers
# mapped to predictions below, and its prediction column is overwritten.
submission_df = pd.read_csv(
    'data/sample_submission.csv',
)
submission_df.head(n=5)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0706016001 0706016002 0372860001 0610776002 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0706016002 0372860001 0610776002 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0706016002 0372860001 0610776002 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0706016001 0706016002 0372860001 0610776002 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0706016001 0706016002 0372860001 0610776002 07...


In [10]:
def _as_article_list(value):
    """Return ``value`` unchanged if it is a list, otherwise an empty list."""
    return value if isinstance(value, list) else []


# Look up each customer's article list in cp; customers that are not in cp
# come back as a non-list value from .map and are replaced by [].
submission_df["prediction"] = (
    submission_df["customer_id"].map(cp).apply(_as_article_list)
)
# head(-10) selects every row except the last 10.
submission_df.head(-10)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,[]
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,[]
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,[0794321007]
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,[]
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,[]
...,...,...
1371965,ffff25c78688e1c34e48a4e34b9a953bde663cf937e715...,[]
1371966,ffff2d1849db66617499febae392fb5e9335ebf160de0e...,[]
1371967,ffff2f5a160e334d722d2f2c36be9907f6d097a141e49b...,[]
1371968,ffff4c4e8b57b633c1ddf8fbd53db16b962cf831baf9ed...,[]


In [11]:
def _fill_to_k(own_items, fallback, k=12):
    """Return up to ``k`` article ids: ``own_items`` first, then ``fallback``.

    Parameters
    ----------
    own_items : list of str
        The customer's own article ids, in priority order.
    fallback : list of str
        Article ids used to pad the result, in priority order.
    k : int
        Maximum length of the returned list.

    Returns
    -------
    list of str
        A new list of at most ``k`` ids. Fallback ids that are already
        present are skipped, so padding never repeats an article. Plain
        concatenation (``x[:12] + pp[:12-len(x)]``) repeated any article
        that was both purchased and popular, wasting one of the 12 slots.
        Counting the remaining budget explicitly also avoids a negative
        slice bound, which would select from the end of ``fallback``.
    """
    picks = own_items[:k]  # slicing copies, so the list held in the frame is not mutated
    seen = set(picks)
    for article in fallback:
        if len(picks) >= k:
            break
        if article not in seen:
            picks.append(article)
            seen.add(article)
    return picks


# Pad every customer's list with the popular articles in pp, up to 12 ids.
submission_df["prediction"] = submission_df["prediction"].apply(
    lambda x: _fill_to_k(x, pp)
)
submission_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,"[0909370001, 0924243001, 0918522001, 086579900..."
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,"[0909370001, 0924243001, 0918522001, 086579900..."
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,"[0794321007, 0909370001, 0924243001, 091852200..."
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,"[0909370001, 0924243001, 0918522001, 086579900..."
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,"[0909370001, 0924243001, 0918522001, 086579900..."


In [12]:
def _to_submission_string(prediction):
    """Return ``prediction`` as a single space-separated string.

    A list of article ids is joined with single spaces. A value that is
    already a string is returned unchanged, so re-running this cell does not
    apply ``" ".join`` to an already-joined string (which would insert a
    space between every character).
    """
    if isinstance(prediction, str):
        return prediction
    return " ".join(prediction)


# Convert each list of article ids into the space-separated text form.
submission_df["prediction"] = submission_df["prediction"].apply(_to_submission_string)
submission_df.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0909370001 0924243001 0918522001 0865799006 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0909370001 0924243001 0918522001 0865799006 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0909370001 0924243001 0918522001 08...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0909370001 0924243001 0918522001 0865799006 07...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0909370001 0924243001 0918522001 0865799006 07...


In [13]:
# Write the submission without the DataFrame index column.
submission_df.to_csv(path_or_buf='simple_submission.csv', index=False)