In [34]:
import sys
from pathlib import Path

import pandas as pd

sys.path.append(str(Path().cwd().parent))

from src.schema.config import Config

pd.options.display.max_columns = 100

In [9]:
def _read_filtered_csv(path, chunksize, row_mask, **read_csv_kwargs):
    """Read a CSV in chunks and keep only the rows selected by ``row_mask``.

    Reading chunk by chunk avoids materialising the whole file before filtering.

    Args:
        path: CSV file to read.
        chunksize: Number of rows per chunk, forwarded to ``pd.read_csv``.
        row_mask: Callable that takes one chunk (a DataFrame) and returns a
            boolean mask aligned with it; rows where the mask is True are kept.
        **read_csv_kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        A DataFrame with the kept rows of every chunk, concatenated in file
        order with the row index assigned by ``pd.read_csv`` preserved.

    Raises:
        ValueError: If no row of the file passes ``row_mask``.
    """
    kept = []
    for chunk in pd.read_csv(path, chunksize=chunksize, **read_csv_kwargs):
        selected = chunk.loc[row_mask(chunk)]
        if not selected.empty:
            kept.append(selected)
    if not kept:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # raise the same exception type with a message that names the file.
        raise ValueError(
            f"No rows in {path} matched the filter; "
            "check the configured paths and date range."
        )
    return pd.concat(kept)


# The notebook is expected to run from a sub-directory of the project, so the
# project root is the parent of the current working directory.
project_root = Path.cwd().parent
cfg = Config.load(project_root.joinpath("conf", "config.yaml"))

# Period boundaries from the config. Only past_start_date and test_end_date are
# used in this cell; the others are presumably consumed by later cells.
past_start_date = pd.to_datetime(cfg.data.past_start_date)
past_end_date = pd.to_datetime(cfg.data.past_end_date)
train_start_date = pd.to_datetime(cfg.data.train_start_date)
train_end_date = pd.to_datetime(cfg.data.train_end_date)
val_start_date = pd.to_datetime(cfg.data.val_start_date)
val_end_date = pd.to_datetime(cfg.data.val_end_date)
test_start_date = pd.to_datetime(cfg.data.test_start_date)
test_end_date = pd.to_datetime(cfg.data.test_end_date)

# Transactions: keep rows with past_start_date <= t_dat <= test_end_date
# (Series.between is inclusive on both ends by default).
transactions_path = project_root.joinpath(cfg.data.transactions_path)
trans_df = _read_filtered_csv(
    transactions_path,
    cfg.data.chunksize,
    lambda chunk: chunk["t_dat"].between(past_start_date, test_end_date),
    parse_dates=["t_dat"],
)
all_customers = trans_df["customer_id"].unique()
all_articles = trans_df["article_id"].unique()

# Customers: only those that appear in the selected transactions.
customers_path = project_root.joinpath(cfg.data.customers_path)
customer_df = _read_filtered_csv(
    customers_path,
    cfg.data.chunksize,
    lambda chunk: chunk["customer_id"].isin(all_customers),
)

# Articles: only those that appear in the selected transactions.
articles_path = project_root.joinpath(cfg.data.articles_path)
article_df = _read_filtered_csv(
    articles_path,
    cfg.data.chunksize,
    lambda chunk: chunk["article_id"].isin(all_articles),
)

# customer

In [13]:
customer_df.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
30,000172a9c322560c849754ffbdfdb2180d408aa7176b94...,,,ACTIVE,NONE,45.0,4ca377c955c160866d5662b33aa1af44d54d4342fb8bfa...
38,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,1.0,1.0,ACTIVE,Regularly,44.0,930b19ae7db8abb5a27f4da10217755a7305b4c452f5e0...
86,0003e867a930d0d6842f923d6ba7c9b77aba33fe2a0fbf...,1.0,1.0,ACTIVE,Regularly,33.0,d647e4ede3d0eb4ce0750440a110350b5f4c758165d89d...
121,0005340aa69bb5a28d98712a36d8f669024bce137e3c82...,,,ACTIVE,NONE,21.0,2c29ae653a9282cce4151bd87643c907644e09541abc28...


In [14]:
# Sanity check: the number of rows must equal the number of distinct customer_id values.
len(customer_df) == customer_df["customer_id"].nunique()

True

In [17]:
# dropna=False keeps the NaN bucket visible; in the saved output FN is either 1.0 or NaN.
# NOTE(review): NaN presumably means "flag not set" - confirm before filling it with 0.
customer_df["FN"].value_counts(dropna=False)

FN
NaN    31407
1.0    26241
Name: count, dtype: int64

In [18]:
# Count Active values including missing ones; the saved output only contains 1.0 and NaN.
# NOTE(review): NaN presumably stands for "not active" - confirm before encoding it as 0.
customer_df["Active"].value_counts(dropna=False)

Active
NaN    31785
1.0    25863
Name: count, dtype: int64

In [19]:
customer_df["fashion_news_frequency"].value_counts(dropna=False)

fashion_news_frequency
NONE         31188
Regularly    26293
NaN            154
Monthly         13
Name: count, dtype: int64

In [20]:
# Summary statistics for age. In the saved output the count (57390) is lower than
# len(customer_df) (57648), so some customers have no age recorded.
customer_df["age"].describe()

count    57390.000000
mean        35.713312
std         13.919815
min         16.000000
25%         24.000000
50%         31.000000
75%         48.000000
max         95.000000
Name: age, dtype: float64

In [26]:
# Frequency of each postal_code value. In the saved output a single value covers 1541
# customers while the next most common covers 12.
# NOTE(review): the dominant value is possibly a placeholder for a missing postal
# code - confirm before using this column as a feature.
customer_df["postal_code"].value_counts()

postal_code
2c29ae653a9282cce4151bd87643c907644e09541abc28ae87dea0d1f6603b1c    1541
9d5787501bf1c77592156ba51eab13f4a2670c807686431a9e22a69090b02358      12
7c1fa3b0ec1d37ce2c3f34f63bd792f3b4494f324b6be5d1e4ba6a75456b96a7      12
1f5bd429acc88fbbf24de844a59e438704aa8761bc7b99fd977cad297c50b74c      12
1cffd42ab4bd892f8688dff5b7e76946fc799bc8c18804e458a8d64f03407819       9
                                                                    ... 
2513b55ec9e613c786ed96342b8245c92c473b7d8d4359d51462c3284e46c7a2       1
fab3ea41b83def28f9af4943837bf2df9ba9b762d331568784486f0c56b7f06c       1
83775088d6017bf61875222c4129f4bdc32495f6aef2bb739345362a2348ca63       1
e903a625fa75fa15feb6e4f3a7a880b74c5c8056096518f9361cb1e864331f47       1
d124e4680386b47d0b969a2d1f7e4e808c233444dc515ce202b6d4b2332d38bf       1
Name: count, Length: 49320, dtype: int64

In [27]:
# Number of customers kept after filtering (row count of the frame).
customer_df.shape[0]

57648

# article

In [35]:
article_df.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name,department_no,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,3,Light,9,White,1676,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
6,111565001,111565,20 den 1p Stockings,304,Underwear Tights,Socks & Tights,1010016,Solid,9,Black,4,Dark,5,Black,3608,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny nylon stockings with a wide, reinfo..."
8,111586001,111586,Shape Up 30 den 1p Tights,273,Leggings/Tights,Garment Lower body,1010016,Solid,9,Black,4,Dark,5,Black,3608,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Tights with built-in support to lift the botto...
9,111593001,111593,Support 40 den 1p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,Black,4,Dark,5,Black,3608,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,"Semi shiny tights that shape the tummy, thighs..."
10,111609001,111609,200 den 1p Tights,304,Underwear Tights,Socks & Tights,1010016,Solid,9,Black,4,Dark,5,Black,3608,Tights basic,B,Lingeries/Tights,1,Ladieswear,62,"Womens Nightwear, Socks & Tigh",1021,Socks and Tights,Opaque matt tights. 200 denier.


In [2]:
# NOTE: the saved output of this cell is a NameError (execution count 2), i.e. it was
# last run on a kernel where article_df had not been loaded yet. Re-run it after the
# data-loading cell to get the actual counts.
article_df["product_group_name"].value_counts()

NameError: name 'article_df' is not defined

In [64]:
article_df["detail_desc"].value_counts()

detail_desc
Socks in a soft, jacquard-knit cotton blend with elasticated tops.                                                                                          70
Round-necked T-shirt in soft cotton jersey.                                                                                                                 31
Fully lined bikini bottoms with a mid waist and medium coverage at the back.                                                                                28
5-pocket jeans in washed, superstretch denim with a regular waist, zip fly and button, and skinny legs.                                                     26
Socks in a soft, fine-knit cotton blend with elasticated tops.                                                                                              26
                                                                                                                                                            ..
Short lace dress with a V-neck fro

In [63]:
article_df["garment_group_name"].value_counts()

garment_group_name
Jersey Fancy                     2999
Accessories                      1874
Jersey Basic                     1562
Trousers                         1441
Blouses                          1380
Under-, Nightwear                1353
Knitwear                         1117
Dresses Ladies                   1063
Trousers Denim                    751
Outdoor                           724
Swimwear                          641
Shoes                             601
Socks and Tights                  515
Skirts                            329
Unknown                           314
Shirts                            258
Shorts                            240
Dressed                           208
Woven/Jersey/Knitted mix Baby     164
Special Offers                     87
Dresses/Skirts girls               80
Name: count, dtype: int64

In [62]:
article_df["garment_group_no"].value_counts()

garment_group_no
1005    2999
1019    1874
1002    1562
1009    1441
1010    1380
1017    1353
1003    1117
1013    1063
1016     751
1007     724
1018     641
1020     601
1021     515
1012     329
1001     314
1011     258
1025     240
1008     208
1006     164
1023      87
1014      80
Name: count, dtype: int64

In [61]:
article_df["section_name"].value_counts()

section_name
Womens Everyday Collection        1853
Divided Collection                1575
Womens Tailoring                   973
Womens Small accessories           905
Womens Lingerie                    832
Womens Swimwear, beachwear         605
Men Underwear                      603
Womens Casual                      600
Womens Trend                       547
Womens Nightwear, Socks & Tigh     507
Divided Basics                     465
Baby Essentials & Complements      451
H&M+                               450
Ladies Denim                       426
Womens Shoes                       418
Mama                               412
Womens Everyday Basics             410
Ladies H&M Sport                   407
Divided Projects                   349
Womens Big accessories             324
Contemporary Smart                 313
Kids Girl                          276
Young Girl                         269
Divided Accessories                264
Kids Boy                           244
Girls Underw

In [60]:
article_df["section_no"].value_counts()

section_no
15    1853
53    1575
11     973
66     905
61     832
60     605
26     603
6      600
18     547
62     507
51     465
44     451
2      450
57     426
64     418
8      412
16     410
5      407
50     349
65     324
20     313
76     276
77     269
52     264
46     244
79     233
19     228
23     216
55     213
21     212
72     203
47     200
14     174
41     170
45     162
25     162
56     155
40     142
22     130
58     126
42     117
31      90
43      68
82      56
27      48
70      47
49      30
97      19
80      12
29       5
24       4
4        1
Name: count, dtype: int64

In [59]:
article_df["index_group_name"].value_counts()

index_group_name
Ladieswear       9381
Divided          3197
Baby/Children    2535
Menswear         2021
Sport             567
Name: count, dtype: int64

In [58]:
article_df["index_group_no"].value_counts()

index_group_no
1     9381
2     3197
4     2535
3     2021
26     567
Name: count, dtype: int64

In [57]:
article_df["index_code"].value_counts()

index_code
A    5916
D    3197
F    2021
B    1818
C    1647
H     817
G     803
I     732
S     567
J     183
Name: count, dtype: int64

In [56]:
article_df["index_name"].value_counts()

index_name
Ladieswear                        5916
Divided                           3197
Menswear                          2021
Lingeries/Tights                  1818
Ladies Accessories                1647
Children Sizes 92-140              817
Baby Sizes 50-98                   803
Children Sizes 134-170             732
Sport                              567
Children Accessories, Swimwear     183
Name: count, dtype: int64

In [55]:
article_df["department_no"].value_counts()

department_no
4242    605
1643    414
1676    401
4344    376
1636    368
       ... 
5959      1
5958      1
5952      1
4213      1
1727      1
Name: count, Length: 260, dtype: int64

In [54]:
article_df["department_name"].value_counts()

department_name
Jersey                        962
Trouser                       745
Blouse                        664
Swimwear                      631
Knitwear                      622
                             ... 
Shoes Other                     1
On Demand                       1
Nursing                         1
Divided+ inactive from s.1      1
Test Ladies                     1
Name: count, Length: 217, dtype: int64

In [53]:
article_df["perceived_colour_master_name"].value_counts()

perceived_colour_master_name
Black              4598
Blue               2378
White              2219
Beige              1411
Pink               1303
Grey               1274
Khaki green         610
Metal               550
Green               529
Red                 496
Yellow              492
Orange              487
Brown               455
Mole                325
Lilac Purple        227
Turquoise           174
Unknown             160
undefined            11
Yellowish Green       2
Name: count, dtype: int64

In [52]:
article_df["perceived_colour_master_id"].value_counts()

perceived_colour_master_id
 5     4598
 2     2378
 9     2219
 11    1411
 4     1303
 12    1274
 20     610
 15     550
 19     529
 18     496
 8      492
 3      487
 13     455
 1      325
 6      227
 7      174
-1      160
 14      11
 10       2
Name: count, dtype: int64

In [51]:
article_df["perceived_colour_value_name"].value_counts()

perceived_colour_value_name
Dark            7189
Dusty Light     3887
Medium Dusty    2620
Light           2500
Bright           920
Medium           574
Undefined         11
Name: count, dtype: int64

In [50]:
article_df["perceived_colour_value_id"].value_counts()

perceived_colour_value_id
4    7189
1    3887
2    2620
3    2500
5     920
7     574
6      11
Name: count, dtype: int64

In [36]:
# Sanity check: the number of rows must equal the number of distinct article_id values.
len(article_df) == article_df["article_id"].nunique()

True

In [40]:
article_df["article_id"].nunique()

17701

In [41]:
article_df["product_code"].value_counts()

product_code
783707    42
685816    26
562245    26
599580    23
706016    22
          ..
820016     1
819866     1
819836     1
819509     1
953450     1
Name: count, Length: 9658, dtype: int64

In [42]:
article_df["prod_name"].value_counts()

prod_name
1pk Fun                   40
Luna skinny RW            23
Timeless Midrise Brief    23
Shake it in Balconette    20
Brit Baby Tee             20
                          ..
ORVAR romper set           1
CNY EQ Celeste             1
CNY Demi fur               1
CNY Greta PU shirt         1
5pk regular Placement1     1
Name: count, Length: 9946, dtype: int64

In [43]:
article_df["product_type_no"].value_counts()

product_type_no
272    2367
265    1978
252    1320
255    1090
254     817
       ... 
762       1
303       1
515       1
492       1
491       1
Name: count, Length: 114, dtype: int64

In [44]:
article_df["product_type_name"].value_counts()

product_type_name
Trousers               2367
Dress                  1978
Sweater                1320
T-shirt                1090
Top                     817
                       ... 
Bra extender              1
Moccasins                 1
Toy                       1
Marker pen                1
Stain remover spray       1
Name: count, Length: 113, dtype: int64

In [46]:
article_df["graphical_appearance_no"].value_counts()

graphical_appearance_no
1010016    9812
1010001    2105
1010023    1266
1010010    1045
1010017     583
1010004     388
1010026     383
1010014     292
1010008     242
1010007     202
1010021     202
1010005     200
1010006     129
1010012     120
1010009     119
1010002     111
1010022      99
1010018      65
1010011      64
1010015      61
1010020      55
1010013      50
1010028      34
1010019      22
1010024      20
1010025      18
1010027      10
1010029       3
1010003       1
Name: count, dtype: int64

In [47]:
article_df["graphical_appearance_name"].value_counts()

graphical_appearance_name
Solid                  9812
All over pattern       2105
Denim                  1266
Melange                1045
Stripe                  583
Check                   388
Other structure         383
Placement print         292
Front print             242
Embroidery              202
Lace                    202
Colour blocking         200
Dot                     129
Mixed solid/pattern     120
Glittering/Metallic     119
Application/3D          111
Jacquard                 99
Treatment                65
Metallic                 64
Sequin                   61
Contrast                 55
Other pattern            50
Mesh                     34
Transparent              22
Chambray                 20
Slub                     18
Neps                     10
Hologram                  3
Argyle                    1
Name: count, dtype: int64

In [48]:
article_df["colour_group_code"].value_counts()

colour_group_code
9     4595
10    1644
73    1284
12     786
13     747
51     738
72     599
7      589
11     535
19     494
8      494
71     486
52     387
5      365
17     322
93     315
43     269
22     269
6      261
31     253
14     227
42     187
3      179
92     171
33     170
32     155
21     151
61     140
53     124
91     110
81     101
23      85
15      66
50      62
62      54
82      53
83      53
63      44
20      33
30      28
90      19
41      14
1       11
40      10
60       7
4        6
2        5
70       4
Name: count, dtype: int64

In [49]:
article_df["colour_group_name"].value_counts()

colour_group_name
Black              4595
White              1644
Dark Blue          1284
Light Beige         786
Beige               747
Light Pink          738
Blue                599
Grey                589
Off White           535
Greenish Khaki      494
Dark Grey           494
Light Blue          486
Pink                387
Gold                365
Yellowish Brown     322
Dark Green          315
Dark Red            269
Yellow              269
Light Grey          261
Light Orange        253
Dark Beige          227
Red                 187
Silver              179
Green               171
Dark Orange         170
Orange              155
Light Yellow        151
Light Purple        140
Dark Pink           124
Light Green         110
Light Turquoise     101
Dark Yellow          85
Greyish Beige        66
Other Pink           62
Purple               54
Turquoise            53
Dark Turquoise       53
Dark Purple          44
Other Yellow         33
Other Orange         28
Other Green          1