In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Make seaborn's "colorblind" palette the default for the plots that follow.
sns.set_palette("colorblind")

# Personalized Fashion Recommendations

## Objectives:
- Predict which articles each customer will purchase in the future
- Propose outfit combinations based on previous purchases

## Dataset
https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/data

## Metric

The predicted purchases are compared with the actual purchases made in the 7 days following the end of the training period.

## Naive/Baseline model

- **Naive model**: Every customer buys a randomly chosen article.
- **Baseline model**: Each customer purchases the same articles they purchased in the past.

## Reading Data

In [2]:
# NOTE(review): loading of the customers and transactions tables is disabled here;
# re-enable these lines when those tables are needed.
# df_customers = pd.read_csv("../fashion_dataset/customers.csv.zip")
# df_transactions = pd.read_csv("../fashion_dataset/transactions_train.csv.zip")

In [3]:
# Load the article metadata table from the zipped CSV.
# NOTE(review): article_id is displayed as a plain integer in the outputs, so any
# leading zeros in the raw ids would be lost -- confirm this is intended.
articles_csv_path = "../fashion_dataset/articles.csv.zip"
df_articles = pd.read_csv(filepath_or_buffer=articles_csv_path)

In [140]:
# Inspect the first article as a Series (one entry per column).
df_articles.iloc[0]

article_id                                                    108775015
product_code                                                     108775
prod_name                                                     Strap top
product_type_no                                                     253
product_type_name                                              Vest top
product_group_name                                   Garment Upper body
graphical_appearance_no                                         1010016
graphical_appearance_name                                         Solid
colour_group_code                                                     9
colour_group_name                                                 Black
perceived_colour_value_id                                             4
perceived_colour_value_name                                        Dark
perceived_colour_master_id                                            5
perceived_colour_master_name                                    

In [5]:
# Peek at two randomly chosen articles. The fixed seed keeps the displayed rows
# identical every time the notebook is re-run from a fresh kernel.
df_articles.sample(2, random_state=42)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
94164,861036002,861036,Spritz top,258,Blouse,Garment Upper body,1010016,Solid,8,Dark Grey,...,Blouse & Dress,A,Ladieswear,1,Ladieswear,18,Womens Trend,1010,Blouses,Blouse in a crisp weave with gathered seams at...
27373,619878001,619878,Blueberry dress,265,Dress,Garment Full body,1010001,All over pattern,23,Dark Yellow,...,Blouse & Dress,A,Ladieswear,1,Ladieswear,18,Womens Trend,1010,Blouses,


In [6]:
# (number of rows, number of columns) of the articles table as loaded.
df_articles.shape

(105542, 25)

In [7]:
# Keep only the first occurrence of every fully identical row.
is_repeated_row = df_articles.duplicated()
df_articles = df_articles[~is_repeated_row]

In [9]:
# Shape after removing duplicate rows; an unchanged row count means there were none.
df_articles.shape

(105542, 25)

In [13]:
# Count the missing entries in every column.
missing_per_column = df_articles.isna().sum()
missing_per_column

article_id                        0
product_code                      0
prod_name                         0
product_type_no                   0
product_type_name                 0
product_group_name                0
graphical_appearance_no           0
graphical_appearance_name         0
colour_group_code                 0
colour_group_name                 0
perceived_colour_value_id         0
perceived_colour_value_name       0
perceived_colour_master_id        0
perceived_colour_master_name      0
department_no                     0
department_name                   0
index_code                        0
index_name                        0
index_group_no                    0
index_group_name                  0
section_no                        0
section_name                      0
garment_group_no                  0
garment_group_name                0
detail_desc                     416
dtype: int64

### article_id

In [19]:
# True when no article_id value occurs more than once, i.e. it identifies each row.
df_articles['article_id'].is_unique

True

### product_code

In [29]:
# Group the articles by product_code so rows sharing one code can be compared.
grouped = df_articles.groupby('product_code')

In [60]:
# Take the first (key, rows) pair of the group-by to inspect a single product.
# next(iter(...)) raises on an empty group-by instead of silently leaving `group`
# undefined or stale, and it avoids rebinding `_`, which IPython uses for the
# last cell output.
first_product_code, group = next(iter(grouped))

In [61]:
# Show only the columns that take more than one value within this product_code.
# Counting distinct values over all rows (NaN counted as a value) also works for
# single-row groups and catches columns that differ only in later rows, which a
# comparison of just the first two rows would miss.
varying_columns = group.nunique(dropna=False) > 1
group.loc[:, varying_columns]

Unnamed: 0,article_id,colour_group_code,colour_group_name,perceived_colour_value_id,perceived_colour_value_name,perceived_colour_master_id,perceived_colour_master_name
0,108775015,9,Black,4,Dark,5,Black
1,108775044,10,White,3,Light,9,White
2,108775051,11,Off White,1,Dusty Light,9,White


### Most common values of each feature

In [137]:
# Name columns to profile. The list is defined here so that this cell runs on a
# fresh kernel; otherwise `columns` only exists after a cell further down has run.
columns = ['prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name',
           'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name',
           'index_group_name', 'section_name', 'garment_group_name']

# Display the five most frequent values of each column, one table per column.
for feature in columns:
    display(df_articles[feature].value_counts().head(5).to_frame())

Unnamed: 0,prod_name
Dragonfly dress,98
Mike tee,72
Wow printed tee 6.99,70
1pk Fun,55
TP Paddington Sweater,54


Unnamed: 0,product_type_name
Trousers,11169
Dress,10362
Sweater,9302
T-shirt,7904
Top,4155


Unnamed: 0,product_group_name
Garment Upper body,42741
Garment Lower body,19812
Garment Full body,13292
Accessories,11158
Underwear,5490


Unnamed: 0,graphical_appearance_name
Solid,49747
All over pattern,17165
Melange,5938
Stripe,4990
Denim,4842


Unnamed: 0,colour_group_name
Black,22670
Dark Blue,12171
White,9542
Light Pink,5811
Grey,4487


Unnamed: 0,perceived_colour_value_name
Dark,42706
Dusty Light,22152
Light,15739
Medium Dusty,12630
Bright,6471


Unnamed: 0,perceived_colour_master_name
Black,22585
Blue,18469
White,12665
Pink,9403
Grey,8924


Unnamed: 0,department_name
Jersey,4604
Knitwear,3503
Trouser,2655
Blouse,2362
Dress,2087


Unnamed: 0,index_name
Ladieswear,26001
Divided,15149
Menswear,12553
Children Sizes 92-140,12007
Children Sizes 134-170,9214


Unnamed: 0,index_group_name
Ladieswear,39737
Baby/Children,34711
Divided,15149
Menswear,12553
Sport,3392


Unnamed: 0,section_name
Womens Everyday Collection,7295
Divided Collection,7124
Baby Essentials & Complements,4932
Kids Girl,4469
Young Girl,3899


Unnamed: 0,garment_group_name
Jersey Fancy,21445
Accessories,11519
Jersey Basic,8126
Knitwear,7490
"Under-, Nightwear",7441


## Summary of EDA

- The dataset does not contain duplicate rows
- No missing values, except `detail_desc`, which is missing for `416 out of 105542` articles
- `article_id` is unique for each sample
- Articles sharing a `product_code` are the same product in different colours

In [126]:
# Name columns whose cardinality is summarised below.
columns = [
    'prod_name',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'colour_group_name',
    'perceived_colour_value_name',
    'perceived_colour_master_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
]

# Number of distinct values per column, in the same order as `columns`.
nunique_count = df_articles[columns].nunique().tolist()

# One row per column name, with its distinct-value count.
pd.DataFrame(
    nunique_count,
    index=pd.Index(columns, name='Column name'),
    columns=['Number of unique values'],
)

Unnamed: 0_level_0,Number of unique values
Column name,Unnamed: 1_level_1
prod_name,45875
product_type_name,131
product_group_name,19
graphical_appearance_name,30
colour_group_name,50
perceived_colour_value_name,8
perceived_colour_master_name,20
department_name,250
index_name,10
index_group_name,5


In [139]:
# NOTE(review): version-control commands, currently disabled. Prefer running them
# from a terminal so the notebook stays re-runnable without side effects.
# !git branch
# !git add .
# !git commit -m "EDA of article.csv update"
# !git push origin kuzma

[kuzma e4466f9] EDA of article.csv update
 1 file changed, 1678 insertions(+), 24 deletions(-)
Enumerating objects: 5, done.
Counting objects: 100% (5/5), done.
Delta compression using up to 8 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (3/3), 7.62 KiB | 7.62 MiB/s, done.
Total 3 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/kuzmatsukanov/kuzma_omri_noa_data_project.git
   5612527..e4466f9  kuzma -> kuzma
