# H&M Personalized Fashion Recommendations
Provide product recommendations based on previous purchases

## 1. EDA
- articles
- customers
- transactions_train
- images

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [6]:
def unique(x):
    """Return the sorted distinct values of ``x`` as a NumPy array.

    Parameters
    ----------
    x : array-like
        Any input ``np.asarray`` accepts (list, tuple, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        The unique elements of ``x``, flattened and sorted in ascending order.
    """
    # ``np`` comes from the notebook's import cell; a second, function-local
    # import was redundant. ``np.unique`` works on its own copy, so the input
    # does not need to be copied first.
    return np.unique(np.asarray(x))

In [43]:
# Load the article (product) metadata, then report its size and column names.
art = pd.read_csv('articles.csv')

print('[articles]')
# art.shape is already a (rows, columns) tuple, so it can feed the format directly.
print('Articles data with %d rows and %d columns.' % art.shape)
print('\nColumns in articles:\n', art.columns.values)

[articles]
Articles data with 105542 rows and 25 columns.

Columns in articles:
 ['article_id' 'product_code' 'prod_name' 'product_type_no'
 'product_type_name' 'product_group_name' 'graphical_appearance_no'
 'graphical_appearance_name' 'colour_group_code' 'colour_group_name'
 'perceived_colour_value_id' 'perceived_colour_value_name'
 'perceived_colour_master_id' 'perceived_colour_master_name'
 'department_no' 'department_name' 'index_code' 'index_name'
 'index_group_no' 'index_group_name' 'section_no' 'section_name'
 'garment_group_no' 'garment_group_name' 'detail_desc']


In [44]:
# Preview the first five articles to see what each column holds.
art.head(n=5)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."



|**變數名稱**|**變數介紹**|
|:-|:-|
|article_id|商品id(key)|
|product_code, prod_name|商品的編碼及名稱(相同的編碼或名稱下有不一樣的特徵就屬於不同的產品)|
|product_type_no, product_type_name|商品類型的編號、名稱|
|product_group_name|商品分類名稱|
|graphical_appearance_no, graphical_appearance_name|圖形外觀的編號、名稱(one-to-one)|
|colour_group_code, colour_group_name|顏色的編號、名稱(one-to-one)|
|perceived_colour_value_id, perceived_colour_value_name|明亮度及色調的編碼、名稱|
|perceived_colour_master_id, perceived_colour_master_name|主顏色的編號、名稱|
|department_no, department_name|department編碼、名稱|
|index_code, index_name|index編碼、名稱|
|index_group_no, index_group_name|index種類編碼、名稱|
|section_no, section_name|section編碼、名稱|
|garment_group_no, garment_group_name|服裝類別的編號及名稱|
|detail_desc|商品敘述|

In [45]:
# Tally the missing entries in each column of the articles table.
print('Check missing values:')
art.isna().sum()

Check missing values:


article_id                        0
product_code                      0
prod_name                         0
product_type_no                   0
product_type_name                 0
product_group_name                0
graphical_appearance_no           0
graphical_appearance_name         0
colour_group_code                 0
colour_group_name                 0
perceived_colour_value_id         0
perceived_colour_value_name       0
perceived_colour_master_id        0
perceived_colour_master_name      0
department_no                     0
department_name                   0
index_code                        0
index_name                        0
index_group_no                    0
index_group_name                  0
section_no                        0
section_name                      0
garment_group_no                  0
garment_group_name                0
detail_desc                     416
dtype: int64

In [5]:
# Summarise the distinct values of every article column.
# detail_desc is reported separately from the other columns, so select the
# rest by name instead of relying on hard-coded positional slices.
summary_cols = art.columns.drop('detail_desc')

# Compute each column's unique values once and reuse them for both reports.
unique_by_col = {col: unique(art[col]) for col in summary_cols}

for col, values in unique_by_col.items():
    print('number of unique values in ', col, ' : ', len(values), sep = '')

# Series.unique() keeps first-appearance order, like the previous
# membership-test loop, but does so in a single pass instead of rescanning a
# growing list for every row. The result stays a plain list for later cells.
unique_desc = art['detail_desc'].unique().tolist()
print('number of unique values in detail_desc:', len(unique_desc))

print('')

for col, values in unique_by_col.items():
    print('unique values in ', col, ' -> ', values, sep = '')

number of unique values in article_id : 105542
number of unique values in product_code : 47224
number of unique values in prod_name : 45875
number of unique values in product_type_no : 132
number of unique values in product_type_name : 131
number of unique values in product_group_name : 19
number of unique values in graphical_appearance_no : 30
number of unique values in graphical_appearance_name : 30
number of unique values in colour_group_code : 50
number of unique values in colour_group_name : 50
number of unique values in perceived_colour_value_id : 8
number of unique values in perceived_colour_value_name : 8
number of unique values in perceived_colour_master_id : 20
number of unique values in perceived_colour_master_name : 20
number of unique values in department_no : 299
number of unique values in department_name : 250
number of unique values in index_code : 10
number of unique values in index_name : 10
number of unique values in index_group_no : 5
number of unique values in inde

unique values in index_name -> ['Baby Sizes 50-98' 'Children Accessories, Swimwear'
 'Children Sizes 134-170' 'Children Sizes 92-140' 'Divided'
 'Ladies Accessories' 'Ladieswear' 'Lingeries/Tights' 'Menswear' 'Sport']
unique values in index_group_no -> [ 1  2  3  4 26]
unique values in index_group_name -> ['Baby/Children' 'Divided' 'Ladieswear' 'Menswear' 'Sport']
unique values in section_no -> [ 2  4  5  6  8 11 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
 40 41 42 43 44 45 46 47 48 49 50 51 52 53 55 56 57 58 60 61 62 64 65 66
 70 71 72 76 77 79 80 82 97]
unique values in section_name -> ['Baby Boy' 'Baby Essentials & Complements' 'Baby Girl'
 'Boys Underwear & Basics' 'Collaborations' 'Contemporary Casual'
 'Contemporary Smart' 'Contemporary Street' 'Denim Men'
 'Divided Accessories' 'Divided Asia keys' 'Divided Basics'
 'Divided Collection' 'Divided Complements Other' 'Divided Projects'
 'Divided Selected' 'EQ Divided' 'Girls Underwear & Basics' 'H&M+'
 'Kids & Baby Shoes'

In [8]:
# Show only the first few descriptions: echoing the entire list floods the
# cell output and bloats the saved notebook.
print('unique values in detail_desc ->')
unique_desc[:10]

unique values in detail_desc ->


['Jersey top with narrow shoulder straps.',
 'Microfibre T-shirt bra with underwired, moulded, lightly padded cups that shape the bust and provide good support. Narrow adjustable shoulder straps and a narrow hook-and-eye fastening at the back. Without visible seams for greater comfort.',
 'Semi shiny nylon stockings with a wide, reinforced trim at the top. Use with a suspender belt. 20 denier.',
 'Tights with built-in support to lift the bottom. Black in 30 denier and light amber in 15 denier.',
 'Semi shiny tights that shape the tummy, thighs and calves while also encouraging blood circulation in the legs. Elasticated waist.',
 'Opaque matt tights. 200 denier.',
 'Sweatshirt in soft organic cotton with a  press-stud on one shoulder (sizes 12-18 months and 18-24 months without a press-stud). Brushed inside.',
 'Two soft bandeau bras in soft jersey with side support and a silicone trim at the top.',
 'Fitted top in soft stretch jersey with a wide neckline and long sleeves.',
 'Trousers 

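articles.csv中包含了商品的資訊，共有105,542個商品及25個變數，其中在detail_desc中有416個空值，即沒有商品介紹。另外根據上方的唯一值資訊以及變數之間的關係可以做些猜測，例如從資料可看出，一些編號與名稱變數的唯一值數量相同，應呈現1對1的關係，例如graphical_appearance_no與graphical_appearance_name、colour_group_code與colour_group_name及perceived_colour_value_id與perceived_colour_value_name等變數；但product_type_no(132個)與product_type_name(131個)、department_no(299個)與department_name(250個)的唯一值數量並不相同，並非1對1的關係。這些都可以作為未來決定要放入哪些特徵進入模型的考量。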

In [4]:
# Load the purchase history, then show a five-row sample and the table's dimensions.
trans = pd.read_csv('transactions_train.csv')

display(trans.head(n=5))
trans.shape

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


(31788324, 5)