# H&M Personalized Fashion Recommendations

### Import packages

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

import plotly.express as px
import plotly.figure_factory as ff

import warnings
warnings.filterwarnings("ignore")

In [2]:
  def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` in place to reduce memory usage.

    Integer columns (signed or unsigned) are converted to the narrowest NumPy
    integer type of the same signedness whose range contains the column's
    minimum and maximum (bounds inclusive). Float columns are converted to
    float16 or float32 when their minimum and maximum lie strictly inside that
    type's finite range; only the range is checked, so values may lose
    precision. Datetime, object, boolean and every other non-integer /
    non-float column is left unchanged.

    Memory usage before and after, and the percentage saved, are printed.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    # Candidate target types, narrowest first, keyed by NumPy dtype kind.
    int_candidates = {
        "i": (np.int8, np.int16, np.int32, np.int64),
        "u": (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32)

    for col in df.select_dtypes(exclude=[np.datetime64]).columns:
        col_type = df[col].dtype
        # Only plain NumPy integer/float columns are downcast. Booleans,
        # strings, categoricals and other extension dtypes are skipped so they
        # are neither turned into floats nor compared against numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ("i", "u", "f"):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == "f":
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    # Guard against a zero-byte frame so the percentage never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Decreased by {:.1f}%".format(saved_pct))
    return df

### Import dataset

In [3]:
# Load the three competition tables. article_id is read as a string in both
# tables that carry it; it is used as a merge key further down.
transactions = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/transactions_train.csv",
    dtype={"article_id": "str"},
)
customers = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/customers.csv"
)
articles = pd.read_csv(
    "/kaggle/input/h-and-m-personalized-fashion-recommendations/articles.csv",
    dtype={"article_id": "str"},
)

In [4]:
# Shrink every table; reduce_mem_usage reassigns columns on the frame it is
# given and returns that same frame, so the result is not re-bound here.
files = [articles, customers, transactions]

for table in files:
    reduce_mem_usage(table)

## Articles


- **article_id** : A unique identifier of every article.
- **product_code, prod_name** : A unique identifier of every product and its name (not the same).
- **product_type, product_type_name** : The group of product_code and its name
- **graphical_appearance_no, graphical_appearance_name** : The group of graphics and its name
- **colour_group_code, colour_group_name** : The group of color and its name
- **perceived_colour_value_id, perceived_colour_value_name, perceived_colour_master_id, perceived_colour_master_name** : The added color info
- **department_no, department_name**: A unique identifier of every department and its name
- **index_code, index_name**: A unique identifier of every index and its name
- **index_group_no, index_group_name**: A group of indices and its name
- **section_no, section_name**: A unique identifier of every section and its name
- **garment_group_no, garment_group_name**: A unique identifier of every garment and its name
- **detail_desc**: Details

In [5]:
# Preview the first rows of the articles table.
articles.head()

In [6]:
# Horizontal histogram of how many article rows fall under each index_name.
f, ax = plt.subplots(figsize=(15, 10))
ax = sns.histplot(data=articles, y='index_name', color='green', ax=ax)
ax.set(xlabel='Count by index name', ylabel='Index name')
plt.show()


In [7]:
# Number of distinct product types inside each product group, largest first.
temp = articles.groupby(["product_group_name"])["product_type_name"].nunique()
df = (
    temp.rename_axis('Product Group')
    .reset_index(name='Product Types')
    .sort_values('Product Types', ascending=False)
)

# Plotly code
px.bar(
    df,
    x='Product Group',
    y='Product Types',
    title='Number of Product Types per each Product Group',
)

In [8]:
# Count of non-null article_id values for every (index group, section) pair.
articles.groupby(['index_group_name', 'section_name'])['article_id'].count()

In [9]:
# Report how many distinct (non-null) values each articles column holds.
for col, n_unique in articles.nunique().items():
    print(f'Number of unique values in {col}: {n_unique}')

## Customers

- **customer_id** : A unique identifier of every customer
- **FN** : 1 or missing
- **Active** : 1 or missing
- **club_member_status** : Status in club
- **fashion_news_frequency** : How often H&M may send news to customer
- **age** : The current age
- **postal_code** : Postal code of customer

In [10]:
import seaborn as sns
from matplotlib import pyplot as plt

# Customers per age value. groupby's default dropna=True leaves out customers
# whose age is missing.
temp = customers.groupby(["age"])["customer_id"].count()
df = (
    temp.rename_axis('Age')
    .reset_index(name='Customers')
    .sort_values('Age', ascending=False)
)

# Plotly code
px.bar(
    df,
    x='Age',
    y='Customers',
    title='Number of Customers per each Age',
)

In [11]:
# Customers per fashion_news_frequency value, most common first.
temp = customers.groupby(["fashion_news_frequency"])["customer_id"].count()
df = (
    temp.rename_axis('Fashion News Frequency')
    .reset_index(name='Customers')
    .sort_values('Customers', ascending=False)
)

# Plotly code
px.bar(
    df,
    x='Fashion News Frequency',
    y='Customers',
    title='Number of Customers per each Fashion News Frequency',
)

In [12]:
# Customers per club_member_status value, most common first.
temp = customers.groupby(["club_member_status"])["customer_id"].count()
df = (
    temp.rename_axis('Club Member Status')
    .reset_index(name='Customers')
    .sort_values('Customers', ascending=False)
)

# Plotly code
px.bar(
    df,
    x='Club Member Status',
    y='Customers',
    title='Number of Customers per each Club Member Status',
)

In [13]:
# Frequency of each FN value (value_counts leaves out missing values by default).
customers["FN"].value_counts()

- **sales_channel_id 1 - offline**
- **sales_channel_id 2 - online**

In [14]:
# Share of transaction rows per sales channel; channel 1 is reported as offline.
x = (transactions['sales_channel_id'] == 1).sum()
y = transactions['sales_channel_id'].count()
print(f'Percent of articles bought offline = {x/y* 100:.2f}%, and online {100 - x/y*100:.2f}%')

# Merge all 3 tables (transactions, customers, articles)

In [16]:
# Attach customer attributes to every transaction. The inner join keeps only
# transactions whose customer_id also appears in the customers table.
cus_tran = transactions.merge(customers, how='inner', on='customer_id')

In [17]:
# Add the product type name of each purchased article (left join keeps every
# transaction row), then parse the transaction date column into datetimes.
art = articles[['article_id', 'product_type_name']]
cus_tran_art = cus_tran.merge(art, how='left', on='article_id')
cus_tran_art['t_dat'] = pd.to_datetime(cus_tran_art['t_dat'])

In [18]:
# Parse t_dat into datetimes. The merge cell just before this one already runs
# the same conversion, so this is a repeat of that statement.
cus_tran_art.t_dat = pd.to_datetime(cus_tran_art['t_dat'])

In [11]:
# Extract the calendar month number from the transaction date. Only the month
# is kept, so the same month of different years shares one value.
cus_tran_art['month'] = cus_tran_art['t_dat'].dt.month

## Online vs offline

In [12]:
# Split data by sales channel id: 2 = online, 1 = offline.
channel = cus_tran_art['sales_channel_id']
df_online = cus_tran_art[channel == 2]
df_offline = cus_tran_art[channel == 1]

In [13]:
# Side-by-side age histograms (2-year bins) for online and offline purchases.
sns.set(rc={'figure.figsize': (24, 8)})
fig, ax = plt.subplots(1, 2)
panels = (
    (df_online, 'Age of customer bought online'),
    (df_offline, 'Age of customer bought offline'),
)
for axis, (frame, title) in zip(ax, panels):
    sns.histplot(data=frame, x='age', binwidth=2, ax=axis).set(title=title)
fig.show()

In [14]:
# Purchase counts per product type for each channel. value_counts sorts them
# most-frequent first, so the first ten entries are the ten most bought types.
top_10_online = df_online['product_type_name'].value_counts()
top10online = top_10_online.iloc[:10]
top_offline = df_offline['product_type_name'].value_counts()
top10offline = top_offline.iloc[:10]

In [15]:
# Side-by-side horizontal bar charts of the ten most bought product types
# online and offline.
sns.set(rc={'figure.figsize': (24, 8)})
fig, ax = plt.subplots(1, 2)
panels = (
    (top10online, 'Top 10 popular products bought online'),
    (top10offline, 'Top 10 popular products bought offline'),
)
for axis, (counts, title) in zip(ax, panels):
    sns.barplot(x=counts, y=counts.index, ax=axis).set(title=title)
fig.show()

In [16]:
# Mean customer age per sales channel, computed over transaction rows (a
# customer with several purchases contributes once per purchase).
x = cus_tran_art.groupby('sales_channel_id')['age'].mean()
x

In [17]:
# Mean customer age per club member status, computed over transaction rows.
cus_tran_art.groupby('club_member_status')['age'].mean()

### Monthly sales, trending products

In [21]:
# Rows per product type (counted as non-null customer_id values), keeping the
# ten product types with the most rows.
purchases_by_type = cus_tran_art.groupby('product_type_name', as_index=False)['customer_id'].count()
top10all = purchases_by_type.sort_values('customer_id', ascending=False).head(10)
top10all

In [26]:
# Bar chart of the ten most bought product types across all customers.
sns.set(rc={'figure.figsize': (24, 6)})
chart = sns.barplot(x='product_type_name', y='customer_id', data=top10all)
chart.set_title('Top 10 product types for all customers')
plt.show()

In [18]:
# Rows per (month, product type), counted as non-null customer_id values.
monthly = cus_tran_art.groupby(['month','product_type_name'],as_index = False)['customer_id'].count()

# Within each month, put the most bought product types first so that head(10)
# below yields that month's top ten.
monthly = monthly.sort_values(['month','customer_id'], ascending = [True, False])

#Top 10 product types for every month

sns.set(rc={'figure.figsize':(24,6)})
for i in range(1,13):
    plt.figure()
    sns.barplot(x = 'product_type_name', y = 'customer_id', data = monthly[monthly['month'] == i].head(10)).set_title(f'Top 10 product types for month number {i}')
    # Show the figure created just above. The earlier `fig.show()` pointed at a
    # `fig` object left over from a previous cell, not at this new figure.
    plt.show()

In [19]:
# One column per month number, holding that month's ten most bought product
# types in rank order (monthly is already sorted by month, then count).
top10month = pd.DataFrame()
for i in range(1, 13):
    in_month = monthly['month'] == i
    top10month[i] = monthly.loc[in_month, 'product_type_name'].head(10).reset_index(drop=True)

top10month

## Grouped by age

In [20]:
# Bucket customers into age groups. Missing ages are replaced by 0 first, so
# they fall into the first bin together with the youngest customers.
listBin = [-1, 19, 29, 39, 49, 59, 69, 119]
labels = ["-1:19","19:29", "29:39", "39:49", "49:59", "59:69", "69:119"]
cus_tran_art['age'] = cus_tran_art['age'].fillna(0)
cus_tran_art['age_bins'] = pd.cut(cus_tran_art['age'], bins=listBin, labels=labels)

In [21]:
# Display the merged frame, now including the month and age_bins columns.
cus_tran_art

In [51]:
# For every age group, draw a bar chart of its ten most bought product types.
labels = ["-1:19","19:29", "29:39", "39:49", "49:59", "59:69", "69:119"]
for label in labels:

    df = cus_tran_art[cus_tran_art['age_bins'] == label]
    # Rows per product type (non-null customer_id values), ten largest.
    top10 = pd.DataFrame(df.groupby('product_type_name').count()['customer_id'].sort_values(ascending = False).head(10))
    plt.figure()
    sns.barplot(x = top10.index,y = 'customer_id', data = top10).set_title(f'Top 10 product types for {label} age group')
    # Show the figure created just above. The earlier `fig.show()` pointed at a
    # `fig` object left over from a previous cell, not at this new figure.
    plt.show()

In [54]:
# Rows per (age group, product type), ordered by age group and then by
# descending count so the first ten rows of each group are its top ten.
ages = (
    cus_tran_art.groupby(['age_bins', 'product_type_name'], as_index=False)['customer_id']
    .count()
    .sort_values(['age_bins', 'customer_id'], ascending=[True, False])
)

# One column per age group, listing its ten most bought product types.
labels = ["-1:19","19:29", "29:39", "39:49", "49:59", "59:69", "69:119"]
top10age = pd.DataFrame()
for label in labels:
    in_group = ages['age_bins'] == label
    top10age[label] = ages.loc[in_group, 'product_type_name'].head(10).reset_index(drop=True)

top10age