# Recommendation System

In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./looker.csv', low_memory=False)

In [3]:
df.info(show_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2466231 entries, 0 to 2466230
Data columns (total 37 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   user_id                 165843 non-null   float64
 1   age                     165843 non-null   float64
 2   gender                  165843 non-null   object 
 3   account_state           165843 non-null   object 
 4   account_city            164274 non-null   object 
 5   account_country         165843 non-null   object 
 6   latitude                165843 non-null   float64
 7   longitude               165843 non-null   float64
 8   account_traffic_source  165843 non-null   object 
 9   account_created_at      165843 non-null   object 
 10  order_id                145887 non-null   float64
 11  order_items_id          145887 non-null   float64
 12  status                  145887 non-null   object 
 13  sale_price              145887 non-null   float64
 14  or

In [5]:
col_list = [
    'user_id',
    'order_items_id',
    'status',
    'sale_price',
    'order_created_at',
    'product_id',
    'category',
    'brand',
    'product_name',
    'session_id',
    'uri'
]

In [6]:
df = df[col_list].drop_duplicates()

In [7]:
df.info(show_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 1845929 entries, 0 to 2465726
Data columns (total 11 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   user_id           165843 non-null   float64
 1   order_items_id    145887 non-null   float64
 2   status            145887 non-null   object 
 3   sale_price        145887 non-null   float64
 4   order_created_at  145887 non-null   object 
 5   product_id        174933 non-null   float64
 6   category          174933 non-null   object 
 7   brand             174790 non-null   object 
 8   product_name      174923 non-null   object 
 9   session_id        1796927 non-null  object 
 10  uri               1796927 non-null  object 
dtypes: float64(4), object(7)
memory usage: 680.3 MB


In [8]:
df.describe(include='all')

Unnamed: 0,user_id,order_items_id,status,sale_price,order_created_at,product_id,category,brand,product_name,session_id,uri
count,165843.0,145887.0,145887,145887.0,145887,174933.0,174933,174790,174923,1796927,1796927
unique,,,5,,145726,,26,2752,27236,645887,35530
top,,,Shipped,,2021-04-09 23:41:48+00:00,,Intimates,Allegra K,Wrangler Men's Premium Performance Cowboy Cut ...,2dba35be-4936-4196-b003-b03327e34338,/cart
freq,,,38042,,2,,13179,6124,59,5,396274
mean,49964.691166,90882.658098,,59.53611,,15140.276569,,,,,
std,28843.590039,52479.390587,,66.43774,,8410.230687,,,,,
min,1.0,1.0,,0.02,,1.0,,,,,
25%,25005.0,45443.5,,24.5,,7862.0,,,,,
50%,49932.0,90854.0,,39.990002,,15728.0,,,,,
75%,74904.0,136377.5,,69.949997,,22388.0,,,,,


In [9]:
df = df.dropna(how='all')

In [10]:
def convert_type_dict(df):
    ideal_dtypes = dict()
    
    for column in df.columns:
        dtype = df[column].dtype
        
        if ('_id' in column) or (dtype == object):
            n_unique = df[column].nunique()
            
            # 값의 종류가 n개 미만일 경우에만 category 형식으로 최적화
            if n_unique > 10:
                ideal_dtypes[column] = 'object'
            else:
                ideal_dtypes[column] = 'category'
        elif dtype != object:
            c_min = df[column].min()
            c_max = df[column].max()

            # 숫자형 데이터 형식 최적화
            if 'price' in column:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    ideal_dtypes[column] = 'float16'
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    ideal_dtypes[column] = 'float32'
                else:
                    ideal_dtypes[column] = 'float64'
            else:
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    ideal_dtypes[column] = 'int8'
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    ideal_dtypes[column] = 'uint8'
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    ideal_dtypes[column] = 'int16'
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    ideal_dtypes[column] = 'uint16'
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    ideal_dtypes[column] = 'int32'
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    ideal_dtypes[column] = 'uint32'
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    ideal_dtypes[column] = 'int64'
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    ideal_dtypes[column] = 'uint64'
            
    return ideal_dtypes

In [11]:
data_types = convert_type_dict(df)
data_types

{'user_id': 'object',
 'order_items_id': 'object',
 'status': 'category',
 'sale_price': 'float16',
 'order_created_at': 'object',
 'product_id': 'object',
 'category': 'object',
 'brand': 'object',
 'product_name': 'object',
 'session_id': 'object',
 'uri': 'object'}

In [12]:
df = df.astype(data_types)

In [13]:
df.info(show_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 1845929 entries, 0 to 2465726
Data columns (total 11 columns):
 #   Column            Non-Null Count    Dtype   
---  ------            --------------    -----   
 0   user_id           165843 non-null   object  
 1   order_items_id    145887 non-null   object  
 2   status            145887 non-null   category
 3   sale_price        145887 non-null   float16 
 4   order_created_at  145887 non-null   object  
 5   product_id        174933 non-null   object  
 6   category          174933 non-null   object  
 7   brand             174790 non-null   object  
 8   product_name      174923 non-null   object  
 9   session_id        1796927 non-null  object  
 10  uri               1796927 non-null  object  
dtypes: category(1), float16(1), object(9)
memory usage: 737.2 MB


In [14]:
product_name_table = pd.read_csv('./Looker Ecommerce BigQuery Dataset/products.csv')

In [15]:
product_name_table = product_name_table.drop(columns=['sku', 'distribution_center_id', 'cost'])
product_name_table = product_name_table.drop_duplicates()
product_name_table['name'] = product_name_table['name'].fillna('')
product_name_table['brand'] = product_name_table['brand'].fillna('')
product_name_table = product_name_table.set_index('id')

In [16]:
product_name_table

Unnamed: 0_level_0,category,name,brand,retail_price,department
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13842,Accessories,Low Profile Dyed Cotton Twill Cap - Navy W39S55D,MG,6.25,Women
13928,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women
14115,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women
14157,Accessories,Enzyme Regular Solid Army Caps-Olive W35S45D (...,MG,10.99,Women
14273,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women
...,...,...,...,...,...
5676,Pants & Capris,WHAT GOES AROUND COMES AROUND Women's Colette ...,What Goes Around Comes Around,24.17,Women
6538,Shorts,WHAT GOES AROUND COMES AROUND Women's Mimi Shorts,What Goes Around Comes Around,25.00,Women
6712,Shorts,WHAT GOES AROUND COMES AROUND Women's Fifi Short,What Goes Around Comes Around,25.00,Women
6821,Shorts,WHAT GOES AROUND COMES AROUND Women's Ferguson...,What Goes Around Comes Around,25.00,Women


## 컨텐츠 기반 필터링

##### 세션마다 어떤 상품을 봤는지 추출

In [17]:
import re
def extract_product(x):
    try:
        return re.findall('(?<=/product/)([0-9]*)', x)[0]
    except:
        return np.nan
    
def extract_department(x):
    try:
        return re.findall('(?<=/department/)([^/]*)', x)[0]
    except:
        return np.nan
    
def extract_category(x):
    try:
        return re.findall('(?<=/category/)([^/]*)', x)[0]
    except:
        return np.nan
    
def extract_brand(x):
    try:
        return re.findall('(?<=/brand/)([^/]*)', x)[0]
    except:
        return np.nan

In [26]:
sess_product = df[['user_id', 'session_id', 'uri']].dropna(subset='uri')

sess_product = sess_product.assign(clicked_product=sess_product['uri'].apply(extract_product))
sess_product = sess_product.assign(clicked_department=sess_product['uri'].apply(extract_department))
sess_product = sess_product.assign(clicked_category=sess_product['uri'].apply(extract_category))
sess_product = sess_product.assign(clicked_brand=sess_product['uri'].apply(extract_brand))

In [27]:
sess_product = sess_product.dropna(subset=['clicked_product', 'clicked_department', 'clicked_category', 'clicked_brand'], how='all')
sess_product = sess_product.drop(columns='uri')

In [28]:
sess_user_id = df[['session_id', 'user_id']].dropna(how='all').drop_duplicates()
sess_product = sess_user_id.merge(sess_product, on=['session_id', 'user_id'], how='outer')

In [29]:
sess_product

Unnamed: 0,session_id,user_id,clicked_product,clicked_department,clicked_category,clicked_brand
0,00000763-a855-4ad0-a95c-b160e749b272,,16381,,,
1,00000763-a855-4ad0-a95c-b160e749b272,,,men,tops&tees,winniefashion
2,0000364a-ce41-46f1-89d6-3f8704af77db,26551.0,,,,
3,0000364a-ce41-46f1-89d6-3f8704af77db,,,women,intimates,gemsli
4,0000364a-ce41-46f1-89d6-3f8704af77db,,11501,,,
...,...,...,...,...,...,...
1207328,,99987.0,,,,
1207329,,99991.0,,,,
1207330,,99994.0,,,,
1207331,,99995.0,,,,


In [30]:
cat = product_name_table.category.unique()
cat.sort()
cat

array(['Accessories', 'Active', 'Blazers & Jackets', 'Clothing Sets',
       'Dresses', 'Fashion Hoodies & Sweatshirts', 'Intimates', 'Jeans',
       'Jumpsuits & Rompers', 'Leggings', 'Maternity',
       'Outerwear & Coats', 'Pants', 'Pants & Capris', 'Plus', 'Shorts',
       'Skirts', 'Sleep & Lounge', 'Socks', 'Socks & Hosiery', 'Suits',
       'Suits & Sport Coats', 'Sweaters', 'Swim', 'Tops & Tees',
       'Underwear'], dtype=object)

In [31]:
sess_cat = sess_product['clicked_category'].dropna().unique()
sess_cat.sort()
sess_cat

array(['accessories', 'active', 'blazers&jackets', 'clothingsets',
       'dresses', 'fashionhoodies&sweatshirts', 'intimates', 'jeans',
       'jumpsuits&rompers', 'leggings', 'maternity', 'outerwear&coats',
       'pants', 'pants&capris', 'plus', 'shorts', 'skirts',
       'sleep&lounge', 'socks', 'socks&hosiery', 'suits',
       'suits&sportcoats', 'sweaters', 'swim', 'tops&tees', 'underwear'],
      dtype=object)

In [32]:
cat_match = pd.DataFrame(cat, sess_cat).reset_index()
cat_match.columns = ['category_uri', 'category']
cat_match.head()

Unnamed: 0,category_uri,category
0,accessories,Accessories
1,active,Active
2,blazers&jackets,Blazers & Jackets
3,clothingsets,Clothing Sets
4,dresses,Dresses


In [33]:
sess_product = sess_product.merge(cat_match, left_on='clicked_category', right_on='category_uri', how='outer')
sess_product = sess_product.assign(clicked_category=sess_product['category'])
sess_product = sess_product.drop(columns=['category', 'category_uri'])

In [34]:
product_name_table.department.unique()

array(['Women', 'Men'], dtype=object)

In [35]:
sess_product.loc[sess_product.clicked_department=='women','clicked_department'] = 'Women'
sess_product.loc[sess_product.clicked_department=='men','clicked_department'] = 'Men'

In [36]:
sess_product

Unnamed: 0,session_id,user_id,clicked_product,clicked_department,clicked_category,clicked_brand
0,00052841-2c5c-41ab-916a-c83fd3944faf,,,Men,Accessories,oakley
1,0005aa38-9d7a-4d7d-9cf6-d711458ac93a,,,Women,Accessories,ray-ban
2,00074e3f-0e66-42be-ae55-8817b6d689de,,,Women,Accessories,mg
3,000773b6-7b15-4e65-96ce-8bbc2e282631,,,Men,Accessories,nautica
4,000b0dd5-862b-4003-a542-7d6e60e6bcc8,,,Women,Accessories,pashmina
...,...,...,...,...,...,...
1207328,,99987.0,,,,
1207329,,99991.0,,,,
1207330,,99994.0,,,,
1207331,,99995.0,,,,


In [37]:
brand = product_name_table['brand'].drop_duplicates().dropna().to_frame()
brand = brand.assign(chk_uri=brand['brand'].apply(lambda x: x.lower().replace(' ', '')))

In [38]:
brand = brand.merge(sess_product.clicked_brand.drop_duplicates().dropna().to_frame(), left_on='chk_uri', right_on='clicked_brand', how='outer')
brand = brand.dropna(subset='brand')

In [39]:
brand

Unnamed: 0,brand,chk_uri,clicked_brand
0,,,
1,!it Jeans,!itjeans,!itjeans
2,'47 Brand,'47brand,'47brand
3,007Lingerie,007lingerie,007lingerie
4,106Shades,106shades,106shades
...,...,...,...
2757,Zoot,zoot,zoot
2758,ZOO YORK,zooyork,zooyork
2759,Zorrel,zorrel,zorrel
2760,Zulu LAX,zululax,zululax


In [40]:
brand[brand.clicked_brand=='oakley']

Unnamed: 0,brand,chk_uri,clicked_brand
1806,Oakley,oakley,oakley


In [41]:
# 매칭되지 않은 브랜드 찾기
brand[(brand.chk_uri.isna()) | (brand.clicked_brand.isna())]

Unnamed: 0,brand,chk_uri,clicked_brand
739,DPC/Scalar,dpc/scalar,
866,F/X FUSION,f/xfusion,
1976,Pulse/Iceburg,pulse/iceburg,
2005,Rasta/NYE,rasta/nye,
2568,Underboss/ Undergirl,underboss/undergirl,
2670,Westmoor Mfg P/s,westmoormfgp/s,


In [42]:
brand = brand.dropna()

In [52]:
sess_product = sess_product.dropna(subset='session_id')

In [44]:
brand_ = brand.copy()

In [45]:
brand_ = brand_.map(str.lower).drop_duplicates()

In [46]:
brand.clicked_brand.nunique()

2704

In [47]:
brand_.clicked_brand.nunique()

2704

In [48]:
brand = brand.loc[brand_.index]

In [53]:
sess_product = sess_product.merge(brand, on='clicked_brand', how='outer')
sess_product = sess_product.assign(clicked_brand=sess_product['brand'])
sess_product = sess_product.drop(columns=['chk_uri', 'brand'])

In [54]:
sess_product.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1188547 entries, 0 to 1188546
Data columns (total 6 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   session_id          1188547 non-null  object
 1   user_id             145887 non-null   object
 2   clicked_product     645887 non-null   object
 3   clicked_department  396773 non-null   object
 4   clicked_category    396773 non-null   object
 5   clicked_brand       396650 non-null   object
dtypes: object(6)
memory usage: 54.4+ MB


In [55]:
sess_product.session_id.nunique()

645887

In [56]:
sess_product

Unnamed: 0,session_id,user_id,clicked_product,clicked_department,clicked_category,clicked_brand
0,07796f62-cebf-432b-b682-1a971d2dcd42,,,Women,Accessories,
1,097b0056-2843-4a33-a118-e16e8007b3d7,,,Women,Accessories,
2,1201530e-6421-4b97-adc1-883777695c54,,,Women,Accessories,
3,1922d827-692c-41a5-a17f-c32aa78de606,,,Women,Accessories,
4,1e3f4b37-7cff-4bd6-9173-c049b53d4f97,,,Women,Accessories,
...,...,...,...,...,...,...
1188542,ffff532c-4e82-41fd-8c7d-ddefa871b5eb,,7694,,,
1188543,ffff7645-26f7-410f-8498-0c601d74c67a,,4235,,,
1188544,ffff8086-da2c-4045-94ba-453437cd3289,,4212,,,
1188545,ffffa9aa-4345-4c2a-b389-8fd58844472e,,5053,,,


In [58]:
sess_product_list = sess_product.groupby('session_id').agg(pd.Series.to_list)
sess_product_list = sess_product_list.drop(columns='user_id')
sess_product_list = sess_product[['session_id', 'user_id']].merge(sess_product_list, on='session_id')

In [59]:
user_product_list = sess_product_list.groupby('user_id').agg(sum)
user_product_list = user_product_list.reset_index()
user_product_list = user_product_list.drop(columns='session_id')

  user_product_list = sess_product_list.groupby('user_id').agg(sum)


In [60]:
sess_product_list = pd.concat([user_product_list, sess_product_list[sess_product_list.user_id.isna()][user_product_list.columns]])
sess_product_list

  sess_product_list = pd.concat([user_product_list, sess_product_list[sess_product_list.user_id.isna()][user_product_list.columns]])


Unnamed: 0,user_id,clicked_product,clicked_department,clicked_category,clicked_brand
0,1.0,"[nan, nan, 7656, nan, nan, 2953, nan, nan, 4731]","[Women, nan, nan, Women, nan, nan, Women, nan,...","[Blazers & Jackets, nan, nan, Active, nan, nan...","[eVogues Apparel, nan, nan, Tommy Hilfiger, na..."
1,2.0,"[nan, nan, 25774]","[Men, nan, nan]","[Underwear, nan, nan]","[Tommy Bahama, nan, nan]"
2,3.0,"[nan, nan, 21364, nan, nan, 26696, nan, nan, 2...","[Men, nan, nan, Men, nan, nan, Men, nan, nan, ...","[Jeans, nan, nan, Sleep & Lounge, nan, nan, Pa...","[Marc Ecko Cut & Sew, nan, nan, Tommy Bahama, ..."
3,4.0,"[nan, nan, 1035]","[Women, nan, nan]","[Sweaters, nan, nan]","[Lilly Pulitzer, nan, nan]"
4,5.0,"[nan, nan, 6998, nan, nan, 1488, nan, nan, 7173]","[Women, nan, nan, Women, nan, nan, Women, nan,...","[Shorts, nan, nan, Sweaters, nan, nan, Skirts,...","[Grane, nan, nan, LookbookStore, nan, nan, Wom..."
...,...,...,...,...,...
1188542,,"[nan, 7694]","[Women, nan]","[Blazers & Jackets, nan]","[Sutton Studio, nan]"
1188543,,[4235],[nan],[nan],[nan]
1188544,,[4212],[nan],[nan],[nan]
1188545,,"[nan, 5053]","[Women, nan]","[Jeans, nan]","[7 For All Mankind, nan]"


In [61]:
for col in sess_product_list:
    if 'clicked' in col:
        sess_product_list[col] = sess_product_list[col].apply(lambda clicked_list: [x for x in clicked_list if pd.notna(x)])

In [62]:
sess_product_list

Unnamed: 0,user_id,clicked_product,clicked_department,clicked_category,clicked_brand
0,1.0,"[7656, 2953, 4731]","[Women, Women, Women]","[Blazers & Jackets, Active, Jeans]","[eVogues Apparel, Tommy Hilfiger, Joe's Jeans]"
1,2.0,[25774],[Men],[Underwear],[Tommy Bahama]
2,3.0,"[21364, 26696, 22308, 18177]","[Men, Men, Men, Men]","[Jeans, Sleep & Lounge, Pants, Active]","[Marc Ecko Cut & Sew, Tommy Bahama, Dockers, S..."
3,4.0,[1035],[Women],[Sweaters],[Lilly Pulitzer]
4,5.0,"[6998, 1488, 7173]","[Women, Women, Women]","[Shorts, Sweaters, Skirts]","[Grane, LookbookStore, Woman Within]"
...,...,...,...,...,...
1188542,,[7694],[Women],[Blazers & Jackets],[Sutton Studio]
1188543,,[4235],[],[],[]
1188544,,[4212],[],[],[]
1188545,,[5053],[Women],[Jeans],[7 For All Mankind]


In [63]:
from collections import Counter
sess_product = sess_product_list.set_index('user_id').map(Counter)

In [72]:
# sess_product.to_csv('./sess_clicked.csv')

In [73]:
sess_product = sess_product.reset_index().dropna(subset='user_id').set_index('user_id')

In [74]:
# sess_product.to_csv('./user_clicked.csv')

#### 구매내역과 비교

In [75]:
sess_product

Unnamed: 0_level_0,clicked_product,clicked_department,clicked_category,clicked_brand
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,"{'7656': 1, '2953': 1, '4731': 1}",{'Women': 3},"{'Blazers & Jackets': 1, 'Active': 1, 'Jeans': 1}","{'eVogues Apparel': 1, 'Tommy Hilfiger': 1, 'J..."
2.0,{'25774': 1},{'Men': 1},{'Underwear': 1},{'Tommy Bahama': 1}
3.0,"{'21364': 1, '26696': 1, '22308': 1, '18177': 1}",{'Men': 4},"{'Jeans': 1, 'Sleep & Lounge': 1, 'Pants': 1, ...","{'Marc Ecko Cut & Sew': 1, 'Tommy Bahama': 1, ..."
4.0,{'1035': 1},{'Women': 1},{'Sweaters': 1},{'Lilly Pulitzer': 1}
5.0,"{'6998': 1, '1488': 1, '7173': 1}",{'Women': 3},"{'Shorts': 1, 'Sweaters': 1, 'Skirts': 1}","{'Grane': 1, 'LookbookStore': 1, 'Woman Within..."
...,...,...,...,...
99993.0,{'9826': 1},{'Women': 1},{'Sleep & Lounge': 1},{'Paul Frank': 1}
99996.0,"{'27368': 1, '25132': 1, '18161': 1, '28478': 1}",{'Men': 4},"{'Sleep & Lounge': 1, 'Socks': 1, 'Active': 1,...","{'Tommy Hilfiger': 1, 'Pearl iZUMi': 1, 'Gilda..."
99998.0,"{'21923': 1, '16938': 1, '19164': 1}",{'Men': 3},"{'Pants': 1, 'Tops & Tees': 1, 'Sweaters': 1}","{'Allegra K': 1, 'Woolrich': 1, 'Original Peng..."
99999.0,"{'20828': 1, '25959': 1}",{'Men': 2},"{'Jeans': 1, 'Underwear': 1}","{'Wrangler': 1, 'Hanes': 1}"


In [80]:
sess_clicked_num = sess_product.clicked_product.apply(len)
sess_clicked_num

user_id
1.0         3
2.0         1
3.0         4
4.0         1
5.0         3
           ..
99993.0     1
99996.0     4
99998.0     3
99999.0     2
100000.0    2
Name: clicked_product, Length: 76918, dtype: int64

In [101]:
order_product = df[['user_id', 'order_items_id', 'product_id']].dropna()
order_product_num = order_product.groupby('user_id').order_items_id.nunique()
order_product_num

user_id
1.0         3
2.0         1
3.0         4
4.0         1
5.0         3
           ..
99993.0     1
99996.0     4
99998.0     3
99999.0     2
100000.0    2
Name: order_items_id, Length: 76918, dtype: int64

In [102]:
all(sess_clicked_num == order_product_num)

False

In [103]:
sess_clicked_num[sess_clicked_num != order_product_num]

user_id
21319.0    3
35593.0    3
47363.0    4
49725.0    1
61640.0    4
70798.0    3
78293.0    3
Name: clicked_product, dtype: int64

In [104]:
order_product_num[sess_clicked_num != order_product_num]

user_id
21319.0    4
35593.0    4
47363.0    5
49725.0    2
61640.0    5
70798.0    4
78293.0    4
Name: order_items_id, dtype: int64

In [105]:
sess_product.loc[21319.0]

clicked_product                      {'4310': 2, '13239': 1, '7412': 1}
clicked_department                                         {'Women': 4}
clicked_category                   {'Jeans': 2, 'Swim': 1, 'Skirts': 1}
clicked_brand         {'Wallflower': 2, 'ViX': 1, 'Living Dead Souls...
Name: 21319.0, dtype: object

In [106]:
order_product[order_product.user_id==21319]

Unnamed: 0,user_id,order_items_id,product_id
338936,21319.0,38758.0,4310.0
774221,21319.0,38755.0,7412.0
1098994,21319.0,38757.0,4310.0
2230927,21319.0,38756.0,13239.0


같은 제품을 여러 번 구매한 유저 존재

In [107]:
order_product_nuniq = order_product.groupby('user_id').product_id.nunique()
order_product_nuniq

user_id
1.0         3
2.0         1
3.0         4
4.0         1
5.0         3
           ..
99993.0     1
99996.0     4
99998.0     3
99999.0     2
100000.0    2
Name: product_id, Length: 76918, dtype: int64

In [108]:
all(sess_clicked_num == order_product_nuniq)

True

클릭한 제품이 전부 구매로 이어짐 (환불/취소 포함)

## 연관 추천 (A priori)
- 컨텐츠 기반 추천(contents-based recommendation)의 기본이 되는 방법론
1. 희소행렬(sparse matrix) 생성
2. Item set: 조건절(Antecedent) ‘만일 A를 샀다면’ + 결과절(Consequent) 'B를 산다'
    - 상호배반(mutually exclusive)이어야 함 (A를 산다면 A를 산다 -> 안됨)
3. 지표
    - 지지도(support): 빈발 아이템 집합을 판별에 사용, 조건절(𝐴)이 일어날 확률 
        - 𝐹𝑜𝑟 𝑡ℎ𝑒 𝑟𝑢𝑙𝑒 𝐴 → 𝐵, 𝑠𝑢𝑝𝑝𝑜𝑟𝑡(𝐴) = 𝑃(𝐴)
    - 신뢰도(confidence): 아이템 집합 간의 연관성 강도를 측정, 조건절(𝐴)이 주어졌을 때 결과절(𝐵)이 일어날 조건부확률 
        - 𝑐𝑜𝑛𝑓𝑖𝑑𝑒𝑛𝑐𝑒(𝐴→𝐵) = 𝑃(𝐴,𝐵) / 𝑃(𝐴)
    - 향상도(lift): 생성된 규칙이 실제 효용가치가 있는지를 판별, 조건절과 결과절이 서로 독립일 때와 비교해 두 사건이 동시에 얼마나 발생하는지 비율 
        - 𝑙𝑖𝑓𝑡(𝐴→𝐵) = 𝑃(𝐴,𝐵) / 𝑃(𝐴)⋅𝑃(𝐵)
        - 향상도 1: 조건절과 결과절은 서로 독립임 -> 규칙 사이에 유의미한 연관성이 없다
        - 향상도 2: 두 사건이 독립이라는 걸 가정했을 때 대비 2배로 긍정적인 연관관계
    - **세 지표 모두, 보다 클 경우**에 임의의 규칙1이 규칙2보다 효과적인 규칙이다고 평가할 수 있음
4. 규칙 생성
    - 아이템이 𝑛 개일 때 탐색해야할 모든 경우의 수: 𝑛∗(𝑛−1) => **빈발 집합(frequent item sets)** 으로 해결!
        - 지지도 𝑃(𝐴)가 0.1이면 아이템 집합 {𝐴,𝐵}의 지지도는 아무리 높아도 0.1을 넘지 못함 => **최소지지도 요건을 만족하지 못하는 아이템집합의 규칙들은 애당초 계산할 필요가 없다!**
            - 𝐴가 단독으로 등장할 확률인 𝑃(𝐴)는 𝐴 와 𝐵가 동시에 나타날 확률인 𝑃(𝐴,𝐵)보다는 크거나 같을 것이기 때문
                - {𝐴,𝐵}는 {𝐴}, {𝐵}의 초월집합(superset)
            - 아이템 집합 {𝐴,𝐵}의 지지도가 사용자가 정한 최소 지지도 요건을 충족시키지 못했을 경우 {𝐴,𝐵}는 물론 {𝐴,𝐵}의 초월집합인 {𝐴,𝐵,𝐶}, {𝐴,𝐵,𝐷} 등 8가지 경우의 수를 계산에서 제외

[참고](https://ratsgo.github.io/machine%20learning/2017/04/08/apriori/)
<img src="image.png" width="650" height="400" />

In [None]:
#%pip install mlxtend

In [109]:
from mlxtend.frequent_patterns import apriori, association_rules

In [110]:
user_product = pd.read_csv('./user_clicked.csv', index_col=0)
user_product

Unnamed: 0_level_0,clicked_product,clicked_department,clicked_category,clicked_brand
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,"Counter({'7656': 1, '2953': 1, '4731': 1})",Counter({'Women': 3}),"Counter({'Blazers & Jackets': 1, 'Active': 1, ...","Counter({'eVogues Apparel': 1, 'Tommy Hilfiger..."
2.0,Counter({'25774': 1}),Counter({'Men': 1}),Counter({'Underwear': 1}),Counter({'Tommy Bahama': 1})
3.0,"Counter({'21364': 1, '26696': 1, '22308': 1, '...",Counter({'Men': 4}),"Counter({'Jeans': 1, 'Sleep & Lounge': 1, 'Pan...","Counter({'Marc Ecko Cut & Sew': 1, 'Tommy Baha..."
4.0,Counter({'1035': 1}),Counter({'Women': 1}),Counter({'Sweaters': 1}),Counter({'Lilly Pulitzer': 1})
5.0,"Counter({'6998': 1, '1488': 1, '7173': 1})",Counter({'Women': 3}),"Counter({'Shorts': 1, 'Sweaters': 1, 'Skirts':...","Counter({'Grane': 1, 'LookbookStore': 1, 'Woma..."
...,...,...,...,...
99993.0,Counter({'9826': 1}),Counter({'Women': 1}),Counter({'Sleep & Lounge': 1}),Counter({'Paul Frank': 1})
99996.0,"Counter({'27368': 1, '25132': 1, '18161': 1, '...",Counter({'Men': 4}),"Counter({'Sleep & Lounge': 1, 'Socks': 1, 'Act...","Counter({'Tommy Hilfiger': 1, 'Pearl iZUMi': 1..."
99998.0,"Counter({'21923': 1, '16938': 1, '19164': 1})",Counter({'Men': 3}),"Counter({'Pants': 1, 'Tops & Tees': 1, 'Sweate...","Counter({'Allegra K': 1, 'Woolrich': 1, 'Origi..."
99999.0,"Counter({'20828': 1, '25959': 1})",Counter({'Men': 2}),"Counter({'Jeans': 1, 'Underwear': 1})","Counter({'Wrangler': 1, 'Hanes': 1})"


In [111]:
user_product.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 76918 entries, 1.0 to 100000.0
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   clicked_product     76918 non-null  object
 1   clicked_department  76918 non-null  object
 2   clicked_category    76918 non-null  object
 3   clicked_brand       76918 non-null  object
dtypes: object(4)
memory usage: 27.0 MB


In [112]:
from collections import Counter
user_product = user_product.assign(clicked_brand=user_product['clicked_brand'].apply(lambda x: eval(x)))
user_product = user_product.assign(clicked_department=user_product['clicked_department'].apply(lambda x: eval(x)))
user_product = user_product.assign(clicked_category=user_product['clicked_category'].apply(lambda x: eval(x)))
user_product = user_product.assign(clicked_product=user_product['clicked_product'].apply(lambda x: eval(x)))

In [113]:
# Convert the quantities into 0/1 (0: not in the basket, 1: in the basket)
def encode_units(x):
    if x <= 0:
        return 0
    if x >= 1:
        return 1

#### 상품 id -> 메모리 이슈..ㅜ

In [114]:
user_clicked_product = pd.DataFrame(user_product['clicked_product'].to_list(), index=user_product.index).fillna(0)
user_clicked_product

In [9]:
user_clicked_product_sets = user_clicked_product.applymap(encode_units)

  user_clicked_product_sets = user_clicked_product.applymap(encode_units)


In [115]:
del user_clicked_product

In [11]:
user_clicked_product_sets

Unnamed: 0_level_0,7656,2953,4731,25774,21364,26696,22308,18177,1035,6998,...,14198,13269,2113,28482,2311,4823,7910,13372,26981,2698
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2.0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3.0,0,0,0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4.0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5.0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99993.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99996.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99998.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99999.0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Use the Apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(user_clicked_product_sets, min_support=1.1e-4, use_colnames=True)
frequent_itemsets



: 

In [88]:
del user_clicked_product_sets

현업에는 상품이 더 많을텐데... 도대체 메모리가 얼마나 필요한건지...?

#### 브랜드 -> 그냥 인기있는 브랜드 조합인듯...

In [6]:
user_clicked_brand = pd.DataFrame(user_product['clicked_brand'].to_list(), index=user_product.index).fillna(0)

In [7]:
user_clicked_brand.shape

(76918, 2699)

In [8]:
user_clicked_brand_sets = user_clicked_brand.applymap(encode_units)
del user_clicked_brand

  user_clicked_brand_sets = user_clicked_brand.applymap(encode_units)


In [9]:
# 최소 지지도가 1.2e-3보다 내려가면 커널 터짐...ㅜ
frequent_brandsets = apriori(user_clicked_brand_sets, min_support=1.2e-3, use_colnames=True)
frequent_brandsets



Unnamed: 0,support,itemsets
0,0.004693,(eVogues Apparel)
1,0.017239,(Tommy Hilfiger)
2,0.006786,(Joe's Jeans)
3,0.005109,(Tommy Bahama)
4,0.002431,(Marc Ecko Cut & Sew)
...,...,...
336,0.001820,(Ripe Maternity)
337,0.001443,"(Carhartt, Allegra K)"
338,0.001703,"(Calvin Klein, Allegra K)"
339,0.001742,"(Le Suit, Lesuit)"


In [12]:
# Generate association rules
rules = association_rules(frequent_brandsets, metric="lift", min_threshold=0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Carhartt),(Allegra K),0.027016,0.064055,0.001443,0.053417,0.833917,-0.000287,0.988761,-0.169911
1,(Allegra K),(Carhartt),0.064055,0.027016,0.001443,0.022529,0.833917,-0.000287,0.99541,-0.175455
2,(Calvin Klein),(Allegra K),0.03349,0.064055,0.001703,0.050854,0.793909,-0.000442,0.986092,-0.21172
3,(Allegra K),(Calvin Klein),0.064055,0.03349,0.001703,0.026588,0.793909,-0.000442,0.992909,-0.217133
4,(Le Suit),(Lesuit),0.001742,0.001742,0.001742,1.0,574.014925,0.001739,inf,1.0
5,(Lesuit),(Le Suit),0.001742,0.001742,0.001742,1.0,574.014925,0.001739,inf,1.0
6,(Retrofit),(Retro Fit),0.002197,0.002197,0.002197,1.0,455.136095,0.002192,inf,1.0
7,(Retro Fit),(Retrofit),0.002197,0.002197,0.002197,1.0,455.136095,0.002192,inf,1.0


#### 카테고리...!

In [116]:
user_clicked_category = pd.DataFrame(user_product['clicked_category'].to_list(), index=user_product.index).fillna(0)

In [117]:
user_clicked_category.shape

(76918, 26)

In [118]:
user_clicked_category_sets = user_clicked_category.applymap(encode_units)

  user_clicked_category_sets = user_clicked_category.applymap(encode_units)


In [119]:
frequent_clicked_category_sets = apriori(user_clicked_category_sets, min_support=3.0e-4, use_colnames=True)
frequent_clicked_category_sets



Unnamed: 0,support,itemsets
0,0.032476,(Blazers & Jackets)
1,0.090850,(Active)
2,0.125055,(Jeans)
3,0.072233,(Underwear)
4,0.107868,(Sleep & Lounge)
...,...,...
1118,0.000312,"(Fashion Hoodies & Sweatshirts, Sleep & Lounge..."
1119,0.000312,"(Fashion Hoodies & Sweatshirts, Socks, Sleep &..."
1120,0.000364,"(Sweaters, Pants, Shorts, Tops & Tees)"
1121,0.000325,"(Swim, Shorts, Sweaters, Tops & Tees)"


In [120]:
# Generate association rules
rules = association_rules(frequent_clicked_category_sets, metric="lift", min_threshold=0.5)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Blazers & Jackets),(Active),0.032476,0.090850,0.002106,0.064852,0.713835,-0.000844,0.972199,-0.292957
1,(Active),(Blazers & Jackets),0.090850,0.032476,0.002106,0.023183,0.713835,-0.000844,0.990486,-0.306011
2,(Blazers & Jackets),(Jeans),0.032476,0.125055,0.003146,0.096878,0.774678,-0.000915,0.968800,-0.231137
3,(Jeans),(Blazers & Jackets),0.125055,0.032476,0.003146,0.025159,0.774678,-0.000915,0.992494,-0.249493
4,(Blazers & Jackets),(Sleep & Lounge),0.032476,0.107868,0.002613,0.080464,0.745951,-0.000890,0.970198,-0.260356
...,...,...,...,...,...,...,...,...,...,...
5789,"(Sweaters, Tops & Tees)","(Outerwear & Coats, Swim)",0.012390,0.009231,0.000312,0.025184,2.728274,0.000198,1.016365,0.641415
5790,(Outerwear & Coats),"(Swim, Sweaters, Tops & Tees)",0.088549,0.001729,0.000312,0.003524,2.037871,0.000159,1.001801,0.558770
5791,(Swim),"(Outerwear & Coats, Sweaters, Tops & Tees)",0.113185,0.001547,0.000312,0.002757,1.781860,0.000137,1.001213,0.494792
5792,(Sweaters),"(Outerwear & Coats, Swim, Tops & Tees)",0.111001,0.001677,0.000312,0.002811,1.676075,0.000126,1.001137,0.453733


In [121]:
rules.describe()

Unnamed: 0,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
count,5794.0,5794.0,5794.0,5794.0,5794.0,5794.0,5794.0,5794.0
mean,0.049406,0.049406,0.001181,0.068284,1.548969,0.00016,1.02285,0.249467
std,0.046461,0.046461,0.001686,0.067259,0.729184,0.000487,0.047156,0.292592
min,0.000923,0.000923,0.000312,0.002495,0.512203,-0.004744,0.944049,-0.501478
25%,0.007736,0.007736,0.000403,0.010163,1.035227,1.9e-05,1.000388,0.036203
50%,0.032502,0.032502,0.000845,0.038583,1.309848,0.000194,1.004823,0.249896
75%,0.107868,0.107868,0.001209,0.116127,1.98657,0.000324,1.031909,0.514639
max,0.125055,0.125055,0.013937,0.397436,5.960856,0.004553,1.453072,0.839266


In [122]:
# Filter rules by a minimum lift and confidence
filtered_rules = rules[(rules['lift'] >= 3.0) & (rules['confidence'] >= 0.05)]
filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

Unnamed: 0,antecedents,consequents,support,confidence,lift
708,"(Outerwear & Coats, Intimates)",(Blazers & Jackets),0.000702,0.100372,3.090630
796,"(Blazers & Jackets, Maternity)",(Pants & Capris),0.000338,0.107438,3.103236
797,"(Blazers & Jackets, Pants & Capris)",(Maternity),0.000338,0.175676,3.443583
798,"(Maternity, Pants & Capris)",(Blazers & Jackets),0.000338,0.103175,3.176935
802,"(Blazers & Jackets, Leggings)",(Maternity),0.000338,0.184397,3.614542
...,...,...,...,...,...
5712,"(Socks, Accessories, Tops & Tees)",(Underwear),0.000351,0.264706,3.664623
5713,"(Underwear, Tops & Tees, Accessories)",(Socks),0.000351,0.270000,4.323935
5715,"(Socks, Accessories)","(Underwear, Tops & Tees)",0.000351,0.053892,5.662953
5740,"(Fashion Hoodies & Sweatshirts, Sleep & Lounge...",(Socks),0.000312,0.193548,3.099595


In [123]:
filtered_rules_sort = filtered_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

from sklearn.preprocessing import minmax_scale
filtered_rules_sort = filtered_rules_sort.assign(score=(minmax_scale(filtered_rules_sort.support) \
                                                       + minmax_scale(filtered_rules_sort.confidence) \
                                                        + minmax_scale(filtered_rules_sort.lift)) / 3)

*Item set: 조건절(Antecedent) ‘만일 A를 샀다면’ + 결과절(Consequent) 'B를 산다'*  
(1) Underwear 와 Sleep & Lounge 를 클릭하면 Socks도 클릭한다  
(2) Tops & Tees, Underwear 와 Accessories 를 클릭하면 Socks도 클릭한다  
(3) Suits 와 Jeans 를 클릭하면 Intimates (보정 속옷)도 클릭한다  
(15) Fashion Hoodies & Sweatshirts 와 Leggings'를 클릭하면 Plus (플러스 사이즈)도 클릭한다  
(21) Fashion Hoodies & Sweatshirts' 와 'Leggings를 클릭하면 Maternity (임부복)도 클릭한다

In [124]:
filtered_rules_sort.sort_values('score', ascending=False).style.background_gradient()

Unnamed: 0,antecedents,consequents,support,confidence,lift,score
2388,"frozenset({'Sleep & Lounge', 'Underwear'})",frozenset({'Socks'}),0.00143,0.191638,3.068995,0.475896
5713,"frozenset({'Underwear', 'Tops & Tees', 'Accessories'})",frozenset({'Socks'}),0.000351,0.27,4.323935,0.387229
2285,"frozenset({'Suits', 'Jeans'})",frozenset({'Intimates'}),0.000403,0.397436,3.194021,0.384538
5391,"frozenset({'Sleep & Lounge', 'Underwear', 'Jeans'})",frozenset({'Socks'}),0.000338,0.268041,4.292566,0.377528
5615,"frozenset({'Sleep & Lounge', 'Underwear', 'Tops & Tees'})",frozenset({'Socks'}),0.000325,0.265957,4.259195,0.367455
4823,"frozenset({'Fashion Hoodies & Sweatshirts', 'Intimates'})",frozenset({'Socks & Hosiery'}),0.001326,0.116041,3.002233,0.36336
5715,"frozenset({'Socks', 'Accessories'})","frozenset({'Underwear', 'Tops & Tees'})",0.000351,0.053892,5.662953,0.345795
5389,"frozenset({'Socks', 'Sleep & Lounge', 'Jeans'})",frozenset({'Underwear'}),0.000338,0.282609,3.912472,0.344016
5473,"frozenset({'Swim', 'Socks', 'Jeans'})",frozenset({'Underwear'}),0.000364,0.277228,3.837977,0.337229
5476,"frozenset({'Swim', 'Socks'})","frozenset({'Underwear', 'Jeans'})",0.000364,0.05303,5.549639,0.334643


### FP-Growth

- category

In [125]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import fpgrowth

In [127]:
data = user_product['clicked_category'].apply(lambda x: x.keys()).to_list()
data

[dict_keys(['Blazers & Jackets', 'Active', 'Jeans']),
 dict_keys(['Underwear']),
 dict_keys(['Jeans', 'Sleep & Lounge', 'Pants', 'Active']),
 dict_keys(['Sweaters']),
 dict_keys(['Shorts', 'Sweaters', 'Skirts']),
 dict_keys(['Suits & Sport Coats']),
 dict_keys(['Tops & Tees', 'Active']),
 dict_keys(['Fashion Hoodies & Sweatshirts', 'Outerwear & Coats', 'Shorts', 'Socks', 'Suits & Sport Coats', 'Swim']),
 dict_keys(['Swim', 'Dresses']),
 dict_keys(['Jumpsuits & Rompers', 'Jeans', 'Sleep & Lounge', 'Tops & Tees']),
 dict_keys(['Intimates', 'Maternity']),
 dict_keys(['Sweaters']),
 dict_keys(['Socks']),
 dict_keys(['Active', 'Accessories', 'Tops & Tees', 'Shorts', 'Outerwear & Coats']),
 dict_keys(['Pants', 'Tops & Tees']),
 dict_keys(['Fashion Hoodies & Sweatshirts']),
 dict_keys(['Jeans', 'Underwear']),
 dict_keys(['Sleep & Lounge', 'Pants & Capris']),
 dict_keys(['Socks']),
 dict_keys(['Tops & Tees']),
 dict_keys(['Sleep & Lounge']),
 dict_keys(['Tops & Tees', 'Leggings']),
 dict_keys(

In [128]:
te = TransactionEncoder()
te_ary = te.fit_transform(data)
te_user_clicked_cat = pd.DataFrame(te_ary, columns=te.columns_)

min_support_per = 3.0e-6
min_trust_per = 0.5
result = fpgrowth(te_user_clicked_cat,min_support=min_support_per, use_colnames=True)
result_chart = association_rules(result, metric="lift", min_threshold=min_trust_per)

In [129]:
result_chart

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(Active),(Jeans),0.090850,0.125055,0.010518,0.115770,0.925750,-0.000844,0.989499,-0.081068
1,(Jeans),(Active),0.125055,0.090850,0.010518,0.084104,0.925750,-0.000844,0.992635,-0.083971
2,(Active),(Sleep & Lounge),0.090850,0.107868,0.008906,0.098025,0.908750,-0.000894,0.989087,-0.099461
3,(Sleep & Lounge),(Active),0.107868,0.090850,0.008906,0.082560,0.908750,-0.000894,0.990964,-0.101167
4,(Active),(Tops & Tees),0.090850,0.117593,0.009660,0.106325,0.904181,-0.001024,0.987392,-0.104395
...,...,...,...,...,...,...,...,...,...,...
513623,"(Clothing Sets, Tops & Tees)","(Blazers & Jackets, Fashion Hoodies & Sweatshi...",0.000234,0.003016,0.000013,0.055556,18.419061,0.000012,1.055630,0.945930
513624,(Blazers & Jackets),"(Fashion Hoodies & Sweatshirts, Clothing Sets,...",0.032476,0.000091,0.000013,0.000400,4.398833,0.000010,1.000309,0.798603
513625,(Fashion Hoodies & Sweatshirts),"(Blazers & Jackets, Clothing Sets, Tops & Tees)",0.117216,0.000013,0.000013,0.000111,8.531278,0.000011,1.000098,1.000000
513626,(Clothing Sets),"(Blazers & Jackets, Tops & Tees, Fashion Hoodi...",0.002275,0.000364,0.000013,0.005714,15.697551,0.000012,1.005381,0.938431


In [130]:
result_chart_sort = result_chart[['antecedents', 'consequents', 'support', 'confidence', 'lift']]
result_chart_sort = result_chart_sort.assign(score=(minmax_scale(result_chart_sort.support) \
                                                       + minmax_scale(result_chart_sort.confidence) \
                                                        + minmax_scale(result_chart_sort.lift)) / 3)

In [131]:
result_chart_sort.sort_values('score', ascending=False)[:20].style.background_gradient()

Unnamed: 0,antecedents,consequents,support,confidence,lift,score
311620,"frozenset({'Outerwear & Coats', 'Jumpsuits & Rompers', 'Shorts', 'Fashion Hoodies & Sweatshirts'})","frozenset({'Blazers & Jackets', 'Active', 'Sweaters', 'Tops & Tees'})",1.3e-05,1.0,76918.0,0.666667
330257,"frozenset({'Plus', 'Skirts', 'Jumpsuits & Rompers', 'Jeans'})","frozenset({'Dresses', 'Suits', 'Maternity', 'Tops & Tees'})",1.3e-05,1.0,76918.0,0.666667
333008,"frozenset({'Swim', 'Suits', 'Skirts', 'Maternity'})","frozenset({'Dresses', 'Jumpsuits & Rompers', 'Plus', 'Tops & Tees'})",1.3e-05,1.0,76918.0,0.666667
313259,"frozenset({'Outerwear & Coats', 'Shorts', 'Blazers & Jackets', 'Intimates', 'Sweaters'})","frozenset({'Fashion Hoodies & Sweatshirts', 'Active', 'Jumpsuits & Rompers', 'Tops & Tees'})",1.3e-05,1.0,76918.0,0.666667
333707,"frozenset({'Maternity', 'Tops & Tees', 'Swim', 'Dresses', 'Plus', 'Suits'})","frozenset({'Skirts', 'Jumpsuits & Rompers', 'Jeans'})",1.3e-05,1.0,76918.0,0.666667
500296,"frozenset({'Swim', 'Plus', 'Skirts', 'Tops & Tees'})","frozenset({'Dresses', 'Suits', 'Maternity', 'Jeans'})",1.3e-05,1.0,76918.0,0.666667
30755,"frozenset({'Swim', 'Plus', 'Pants & Capris', 'Tops & Tees'})","frozenset({'Fashion Hoodies & Sweatshirts', 'Dresses', 'Blazers & Jackets', 'Leggings'})",1.3e-05,1.0,76918.0,0.666667
332999,"frozenset({'Swim', 'Dresses', 'Jumpsuits & Rompers', 'Maternity'})","frozenset({'Plus', 'Skirts', 'Suits', 'Tops & Tees'})",1.3e-05,1.0,76918.0,0.666667
333699,"frozenset({'Maternity', 'Tops & Tees', 'Swim', 'Dresses', 'Jumpsuits & Rompers', 'Jeans'})","frozenset({'Plus', 'Skirts', 'Suits'})",1.3e-05,1.0,76918.0,0.666667
333009,"frozenset({'Dresses', 'Jumpsuits & Rompers', 'Maternity', 'Plus'})","frozenset({'Swim', 'Skirts', 'Suits', 'Tops & Tees'})",1.3e-05,1.0,76918.0,0.666667


In [132]:
len(data)
# 지지도가 유저 수랑 같은 거는...?
# 𝑃(𝐴,𝐵) / 𝑃(𝐴)⋅𝑃(𝐵)

76918

- product_id

In [133]:
data = user_product['clicked_product'].apply(lambda x: x.keys()).to_list()
data

[dict_keys(['7656', '2953', '4731']),
 dict_keys(['25774']),
 dict_keys(['21364', '26696', '22308', '18177']),
 dict_keys(['1035']),
 dict_keys(['6998', '1488', '7173']),
 dict_keys(['20018']),
 dict_keys(['16199', '18243']),
 dict_keys(['17577', '24194', '22961', '24745', '20051', '27507']),
 dict_keys(['13549', '3682']),
 dict_keys(['4130', '4690', '10039', '186']),
 dict_keys(['11464', '15053']),
 dict_keys(['19737']),
 dict_keys(['24573']),
 dict_keys(['18612', '29058', '16381', '23584', '23645']),
 dict_keys(['22153', '16318']),
 dict_keys(['1801']),
 dict_keys(['21427', '26055']),
 dict_keys(['9700', '5119']),
 dict_keys(['24822']),
 dict_keys(['668']),
 dict_keys(['26462']),
 dict_keys(['796', '5916']),
 dict_keys(['15295', '191', '7708', '14357']),
 dict_keys(['807']),
 dict_keys(['20850']),
 dict_keys(['9336', '463', '6606']),
 dict_keys(['13876']),
 dict_keys(['4168']),
 dict_keys(['11550']),
 dict_keys(['26390', '28626']),
 dict_keys(['26856', '19633']),
 dict_keys(['20036']

In [134]:
te = TransactionEncoder()
te_ary = te.fit_transform(data)
te_user_clicked_cat = pd.DataFrame(te_ary, columns=te.columns_)

min_support_per = 3.0e-6
min_trust_per = 0.5
result = fpgrowth(te_user_clicked_cat,min_support=min_support_per, use_colnames=True)

In [135]:
result

Unnamed: 0,support,itemsets
0,0.000091,(2953)
1,0.000078,(7656)
2,0.000065,(4731)
3,0.000065,(25774)
4,0.000104,(26696)
...,...,...
380452,0.000013,"(17924, 19800, 26981)"
380453,0.000013,"(19800, 17316, 26981)"
380454,0.000013,"(17924, 17316, 26981)"
380455,0.000013,"(17924, 17316, 19800, 26981)"


In [136]:
result_chart = association_rules(result, metric="lift", min_threshold=min_trust_per)

In [137]:
result_chart

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(2953),(11056),0.000091,0.000104,0.000013,0.142857,1373.535714,0.000013,1.166545,0.999363
1,(11056),(2953),0.000104,0.000091,0.000013,0.125000,1373.535714,0.000013,1.142753,0.999376
2,(4976),(2953),0.000117,0.000091,0.000013,0.111111,1220.920635,0.000013,1.124898,0.999298
3,(2953),(4976),0.000091,0.000117,0.000013,0.142857,1220.920635,0.000013,1.166530,0.999272
4,(9577),(2953),0.000117,0.000091,0.000013,0.111111,1220.920635,0.000013,1.124898,0.999298
...,...,...,...,...,...,...,...,...,...,...
5790231,(17316),"(17924, 19800, 26981)",0.000130,0.000013,0.000013,0.100000,7691.800000,0.000013,1.111097,1.000000
5790232,(19800),"(17924, 17316, 26981)",0.000026,0.000013,0.000013,0.500000,38459.000000,0.000013,1.999974,1.000000
5790233,(26981),"(17924, 17316, 19800)",0.000013,0.000013,0.000013,1.000000,76918.000000,0.000013,inf,1.000000
5790234,(1435),(2698),0.000065,0.000013,0.000013,0.200000,15383.600000,0.000013,1.249984,1.000000


In [138]:
result_chart_sort = result_chart[['antecedents', 'consequents', 'support', 'confidence', 'lift']]

result_chart_sort = result_chart_sort.assign(score=(minmax_scale(result_chart_sort.support) \
                                                       + minmax_scale(result_chart_sort.confidence) \
                                                        + minmax_scale(result_chart_sort.lift)) / 3)

result_chart_sort.sort_values('score', ascending=False)[:20].style.background_gradient()

Unnamed: 0,antecedents,consequents,support,confidence,lift,score
5045126,"frozenset({'10277', '5762'})","frozenset({'9162', '15127', '9569'})",1.3e-05,1.0,76918.0,0.666667
4413212,"frozenset({'21443', '22734', '20529', '17861'})","frozenset({'21446', '21012', '26306', '27407', '25250', '25186'})",1.3e-05,1.0,76918.0,0.666667
4413221,"frozenset({'26306', '27407', '20529', '17861'})","frozenset({'21446', '21012', '21443', '25250', '22734', '25186'})",1.3e-05,1.0,76918.0,0.666667
4413220,"frozenset({'25186', '26306', '27407', '17861'})","frozenset({'21446', '21012', '21443', '25250', '22734', '20529'})",1.3e-05,1.0,76918.0,0.666667
4413219,"frozenset({'26306', '27407', '22734', '17861'})","frozenset({'21446', '21012', '21443', '25250', '25186', '20529'})",1.3e-05,1.0,76918.0,0.666667
4413218,"frozenset({'26306', '27407', '25250', '17861'})","frozenset({'21446', '21012', '21443', '22734', '25186', '20529'})",1.3e-05,1.0,76918.0,0.666667
4413217,"frozenset({'25186', '21443', '20529', '22734'})","frozenset({'21446', '21012', '26306', '27407', '25250', '17861'})",1.3e-05,1.0,76918.0,0.666667
4413216,"frozenset({'25186', '21443', '20529', '25250'})","frozenset({'21446', '21012', '26306', '27407', '17861', '22734'})",1.3e-05,1.0,76918.0,0.666667
5374285,"frozenset({'13821', '4534'})","frozenset({'13442', '13560'})",1.3e-05,1.0,76918.0,0.666667
4413215,"frozenset({'21443', '20529', '25250', '22734'})","frozenset({'21446', '21012', '26306', '27407', '17861', '25186'})",1.3e-05,1.0,76918.0,0.666667
