# Recommendation System

In [3]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score

In [4]:
# Load the raw export; the path is relative to the notebook's working directory.
# low_memory=False has pandas infer column dtypes from the whole file at once.
df = pd.read_csv('./looker.csv', low_memory=False)

In [5]:
# Column overview: non-null counts, dtypes and the deep (actual) memory footprint.
df.info(show_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2466231 entries, 0 to 2466230
Data columns (total 37 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   user_id                 165843 non-null   float64
 1   age                     165843 non-null   float64
 2   gender                  165843 non-null   object 
 3   account_state           165843 non-null   object 
 4   account_city            164274 non-null   object 
 5   account_country         165843 non-null   object 
 6   latitude                165843 non-null   float64
 7   longitude               165843 non-null   float64
 8   account_traffic_source  165843 non-null   object 
 9   account_created_at      165843 non-null   object 
 10  order_id                145887 non-null   float64
 11  order_items_id          145887 non-null   float64
 12  status                  145887 non-null   object 
 13  sale_price              145887 non-null   float64
 14  or

In [6]:
# Keep an untouched copy of the raw frame before df is overwritten below.
# NOTE(review): `check` is not referenced again in the visible cells and
# duplicates the full frame in memory — confirm it is still needed.
check = df.copy()

In [7]:
# The data needs to be in shopping-basket form, so only the user, order and
# product columns listed here are kept.
col_list = [
    'user_id',
    'age',
    'gender',
    'account_city',
    'account_country',
    'account_traffic_source',
    'account_created_at',
    'order_id',
    'order_items_id',
    'status',
    'sale_price',
    'order_created_at',
    'sess_traffic_source',
    'product_id',
    'category',
    'brand',
    'product_name'
]

In [8]:
# Restrict to the selected columns, then drop rows that are exact duplicates
# on those columns. df is overwritten, so this cell depends on the load cell.
df = df[col_list].drop_duplicates()

In [9]:
# Re-inspect non-null counts and memory use after the column subset / dedupe.
df.info(show_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 194894 entries, 0 to 2465726
Data columns (total 17 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   user_id                 165843 non-null  float64
 1   age                     165843 non-null  float64
 2   gender                  165843 non-null  object 
 3   account_city            164274 non-null  object 
 4   account_country         165843 non-null  object 
 5   account_traffic_source  165843 non-null  object 
 6   account_created_at      165843 non-null  object 
 7   order_id                145887 non-null  float64
 8   order_items_id          145887 non-null  float64
 9   status                  145887 non-null  object 
 10  sale_price              145887 non-null  float64
 11  order_created_at        145887 non-null  object 
 12  sess_traffic_source     145892 non-null  object 
 13  product_id              174933 non-null  float64
 14  category                

In [10]:
# Summary statistics for every column (numeric and non-numeric alike).
df.describe(include='all')

Unnamed: 0,user_id,age,gender,account_city,account_country,account_traffic_source,account_created_at,order_id,order_items_id,status,sale_price,order_created_at,sess_traffic_source,product_id,category,brand,product_name
count,165843.0,165843.0,165843,164274,165843,165843,165843,145887.0,145887.0,145887,145887.0,145887,145892,174933.0,174933,174790,174923
unique,,,2,7841,16,5,1841,,,5,,145726,5,,26,2752,27236
top,,,F,Shanghai,China,Search,2024-01-14,,,Shipped,,2021-04-09 23:41:48+00:00,Email,,Intimates,Allegra K,Wrangler Men's Premium Performance Cowboy Cut ...
freq,,,83076,4191,56968,116248,812,,,38042,,2,65585,,13179,6124,59
mean,49964.691166,41.065701,,,,,,62581.780755,90882.658098,,59.53611,,,15140.276569,,,
std,28843.590039,17.040903,,,,,,36132.860037,52479.390587,,66.43774,,,8410.230687,,,
min,1.0,12.0,,,,,,1.0,1.0,,0.02,,,1.0,,,
25%,25005.0,26.0,,,,,,31268.5,45443.5,,24.5,,,7862.0,,,
50%,49932.0,41.0,,,,,,62512.0,90854.0,,39.990002,,,15728.0,,,
75%,74904.0,56.0,,,,,,93887.5,136377.5,,69.949997,,,22388.0,,,


In [11]:
# Drop the free-text product title column.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
df = df.drop(columns='product_name', errors='ignore')

In [12]:
def convert_type_dict(df, max_categories=10):
    """Suggest a memory-friendly dtype for each column of ``df``.

    The result is a ``{column: dtype name}`` mapping meant to be passed to
    ``DataFrame.astype``. Columns for which no suggestion can be made are
    left out of the mapping, so ``astype`` keeps their current dtype.

    Rules:
      * Columns whose name contains ``'_id'`` and object/string columns are
        treated as labels: ``'category'`` when they hold at most
        ``max_categories`` distinct values, ``'object'`` otherwise.
      * Numeric columns whose name contains ``'price'`` get the smallest of
        ``float16`` / ``float32`` / ``float64`` whose range covers the data.
      * Other numeric columns holding only whole numbers get the first of
        int8, uint8, int16, uint16, int32, uint32, int64, uint64 whose
        (inclusive) range covers the data.
      * Other numeric columns holding fractional or infinite values keep
        their current dtype rather than being truncated to an integer.
      * Any other dtype (datetime, categorical, ...) is skipped.

    Missing values are ignored when the range is measured, so an integer
    dtype can be suggested for a column that still contains NaN; drop or
    fill those rows before calling ``astype``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. It is not modified.
    max_categories : int, default 10
        Largest number of distinct values for which a label column is
        still stored as ``'category'``.

    Returns
    -------
    dict
        Mapping from column label to dtype name.
    """
    # Order matters: the first type whose range covers the data wins.
    integer_types = (
        np.int8, np.uint8, np.int16, np.uint16,
        np.int32, np.uint32, np.int64, np.uint64,
    )
    ideal_dtypes = dict()

    for column in df.columns:
        series = df[column]
        dtype = series.dtype
        column_name = str(column)  # column labels are not guaranteed to be str

        is_label = (
            '_id' in column_name
            or pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if is_label:
            # Use 'category' only while the number of distinct values is small.
            if series.nunique() > max_categories:
                ideal_dtypes[column] = 'object'
            else:
                ideal_dtypes[column] = 'category'
            continue

        if not pd.api.types.is_numeric_dtype(dtype):
            # Datetimes, categoricals, ... cannot be range-checked against
            # numeric limits; leave them untouched.
            continue

        # Plain Python scalars make the comparisons against the integer
        # limits below exact, even close to the 64-bit boundaries.
        c_min, c_max = (
            value.item() if hasattr(value, 'item') else value
            for value in (series.min(), series.max())
        )

        if 'price' in column_name:
            # NOTE(review): float16 carries roughly three significant decimal
            # digits, so cent-level prices are rounded — confirm that is
            # acceptable for downstream use.
            if np.finfo(np.float16).min <= c_min and c_max <= np.finfo(np.float16).max:
                ideal_dtypes[column] = 'float16'
            elif np.finfo(np.float32).min <= c_min and c_max <= np.finfo(np.float32).max:
                ideal_dtypes[column] = 'float32'
            else:
                # Also reached for an all-null column (NaN fails every comparison).
                ideal_dtypes[column] = 'float64'
            continue

        if pd.isna(c_min) or pd.isna(c_max):
            # Column without any value: nothing to base a choice on.
            continue

        if not pd.api.types.is_integer_dtype(dtype):
            values = series.dropna().to_numpy(dtype='float64')
            whole_numbers = bool(
                np.isfinite(values).all() and (values == np.floor(values)).all()
            )
            if not whole_numbers:
                # An integer dtype would silently truncate the fractions.
                ideal_dtypes[column] = str(dtype)
                continue

        for int_type in integer_types:
            limits = np.iinfo(int_type)
            if limits.min <= c_min and c_max <= limits.max:
                ideal_dtypes[column] = np.dtype(int_type).name
                break

    return ideal_dtypes

In [13]:
# Compute the suggested dtype per column and display the mapping.
# Note this runs before the rows without user_id are dropped.
data_types = convert_type_dict(df)
data_types

{'user_id': 'object',
 'age': 'int8',
 'gender': 'category',
 'account_city': 'object',
 'account_country': 'object',
 'account_traffic_source': 'category',
 'account_created_at': 'object',
 'order_id': 'object',
 'order_items_id': 'object',
 'status': 'category',
 'sale_price': 'float16',
 'order_created_at': 'object',
 'sess_traffic_source': 'category',
 'product_id': 'object',
 'category': 'object',
 'brand': 'object'}

In [14]:
# Keep only rows that carry a user_id (original note: "target users who purchased").
# NOTE(review): rows without an order are still kept here — order_id remains
# partly null afterwards — so this is "known users", not strictly purchasers.
df = df.dropna(subset='user_id')

In [15]:
# Apply the suggested dtypes. This has to run after the dropna above: integer
# targets such as int8 cannot hold missing values.
df = df.astype(data_types)

In [16]:
# Check the resulting dtypes and the memory footprint after conversion.
df.info(show_counts=True, memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Index: 165843 entries, 7 to 2462749
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   user_id                 165843 non-null  object  
 1   age                     165843 non-null  int8    
 2   gender                  165843 non-null  category
 3   account_city            164274 non-null  object  
 4   account_country         165843 non-null  object  
 5   account_traffic_source  165843 non-null  category
 6   account_created_at      165843 non-null  object  
 7   order_id                145887 non-null  object  
 8   order_items_id          145887 non-null  object  
 9   status                  145887 non-null  category
 10  sale_price              145887 non-null  float16 
 11  order_created_at        145887 non-null  object  
 12  sess_traffic_source     145887 non-null  category
 13  product_id              145887 non-null  object  
 14  category

In [29]:
# Load the product catalogue (id, name, brand, category, ...) used to turn
# product ids into readable names. The path is relative to the working directory.
product_name_table = pd.read_csv('./Looker Ecommerce BigQuery Dataset/products.csv')

In [30]:
# Preview the raw catalogue (pandas truncates the display to head and tail rows).
product_name_table

Unnamed: 0,id,cost,category,name,brand,retail_price,department,sku,distribution_center_id
0,13842,2.51875,Accessories,Low Profile Dyed Cotton Twill Cap - Navy W39S55D,MG,6.25,Women,EBD58B8A3F1D72F4206201DA62FB1204,1
1,13928,2.33835,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women,2EAC42424D12436BDD6A5B8A88480CC3,1
2,14115,4.87956,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women,EE364229B2791D1EF9355708EFF0BA34,1
3,14157,4.64877,Accessories,Enzyme Regular Solid Army Caps-Olive W35S45D (...,MG,10.99,Women,00BD13095D06C20B11A2993CA419D16B,1
4,14273,6.50793,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women,F531DC20FDE20B7ADF3A73F52B71D0AF,1
...,...,...,...,...,...,...,...,...,...
29115,5676,12.42338,Pants & Capris,WHAT GOES AROUND COMES AROUND Women's Colette ...,What Goes Around Comes Around,24.17,Women,3A01FC0853EBEBA94FDE4D1CC6FB842A,10
29116,6538,13.67500,Shorts,WHAT GOES AROUND COMES AROUND Women's Mimi Shorts,What Goes Around Comes Around,25.00,Women,741A0099C9AC04C7BFC822CAF7C7459F,10
29117,6712,12.40000,Shorts,WHAT GOES AROUND COMES AROUND Women's Fifi Short,What Goes Around Comes Around,25.00,Women,5726DAF2C9EE0F955ECA58291C26D2F3,10
29118,6821,13.47500,Shorts,WHAT GOES AROUND COMES AROUND Women's Ferguson...,What Goes Around Comes Around,25.00,Women,BA0A4D6ECEA3E9E126DD3B6D77291C97,10


In [31]:
# Tidy the catalogue in one pass: remove columns that are not needed, drop
# exact duplicate rows, blank out missing names/brands and index by product id.
product_name_table = (
    product_name_table
    .drop(columns=['sku', 'distribution_center_id', 'cost'])
    .drop_duplicates()
    .fillna({'name': '', 'brand': ''})
    .set_index('id')
)

In [32]:
# Preview the cleaned catalogue, now indexed by product id.
product_name_table

Unnamed: 0_level_0,category,name,brand,retail_price,department
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
13842,Accessories,Low Profile Dyed Cotton Twill Cap - Navy W39S55D,MG,6.25,Women
13928,Accessories,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,MG,5.95,Women
14115,Accessories,Enzyme Regular Solid Army Caps-Black W35S45D,MG,10.99,Women
14157,Accessories,Enzyme Regular Solid Army Caps-Olive W35S45D (...,MG,10.99,Women
14273,Accessories,Washed Canvas Ivy Cap - Black W11S64C,MG,15.99,Women
...,...,...,...,...,...
5676,Pants & Capris,WHAT GOES AROUND COMES AROUND Women's Colette ...,What Goes Around Comes Around,24.17,Women
6538,Shorts,WHAT GOES AROUND COMES AROUND Women's Mimi Shorts,What Goes Around Comes Around,25.00,Women
6712,Shorts,WHAT GOES AROUND COMES AROUND Women's Fifi Short,What Goes Around Comes Around,25.00,Women
6821,Shorts,WHAT GOES AROUND COMES AROUND Women's Ferguson...,What Goes Around Comes Around,25.00,Women


## 클래식 추천 알고리즘
1. 랜덤 기반
2. 인기도 기반

In [90]:
# All users: number of distinct user ids in the frame.
df.user_id.nunique()

96874

In [91]:
# Users who bought something: distinct user ids among rows that have a product_id.
df[df.product_id.notna()].user_id.nunique()

76918

In [94]:
# Number of products: distinct ids in the catalogue index.
product_name_table.index.nunique()

29118

In [95]:
# Number of products sold: distinct product ids among rows that have an order item.
df[df.order_items_id.notna()].product_id.nunique()

28899

In [96]:
# One row per (user, product) pair; order_cnt is the number of distinct
# order items the user has for that product.
order_product = (
    df.groupby(['user_id', 'product_id'])['order_items_id']
    .nunique()
    .rename('order_cnt')
    .reset_index()
)
order_product.head()

Unnamed: 0,user_id,product_id,order_cnt
0,1.0,2953.0,1
1,1.0,4731.0,1
2,1.0,7656.0,1
3,2.0,25774.0,1
4,3.0,18177.0,1


In [97]:
# Number of (user, product) pairs and columns.
order_product.shape

(145880, 3)

In [None]:
# Pivot into a user x product matrix of purchase counts; pairs that never
# occurred become 0. Each (user, product) pair appears once in order_product,
# so pivot_table's default mean aggregation leaves the counts unchanged.
# NOTE(review): this is a dense frame; with tens of thousands of users and
# products it can need many gigabytes — consider a sparse representation.
order_product_matrix = order_product.pivot_table(index='user_id', columns='product_id', values='order_cnt').fillna(0)

In [None]:
# Preview the matrix (rows: user_id, columns: product_id).
order_product_matrix

product_id,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,29111.0,29112.0,29113.0,29114.0,29115.0,29116.0,29117.0,29118.0,29119.0,29120.0
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99993.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99996.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99998.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
99999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# (number of users, number of products) in the matrix.
order_product_matrix.shape

(76918, 28899)

In [38]:
# Split users into train / test by row position in order_product_matrix
# (80 % / 20 %, fixed seed so the split is repeatable).
n_users = len(order_product_matrix)
user_indices = np.arange(n_users)
train_user_indices, test_user_indices = train_test_split(
    user_indices,
    random_state=42,
    test_size=0.2,
)

In [69]:
# 1. Random baseline
def random_recommendation(order_product, n=10, random_state=None):
    """Pick up to ``n`` distinct products uniformly at random.

    Parameters
    ----------
    order_product : pandas.DataFrame
        Frame with a ``product_id`` column; its distinct values form the
        pool of candidates.
    n : int, default 10
        Number of products wanted. If fewer distinct products exist, all of
        them are returned (in random order) instead of raising.
    random_state : int or None, default None
        Seed for a dedicated generator, making the draw repeatable. With
        ``None`` the global ``np.random`` state is used, as before.

    Returns
    -------
    numpy.ndarray
        The selected product ids, without repeats.
    """
    unique_products = order_product['product_id'].unique()
    # Sampling without replacement cannot return more items than exist.
    n = min(n, len(unique_products))
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    # replace=False: a recommendation list must not contain the same product twice.
    random_products = rng.choice(unique_products, size=n, replace=False)
    return random_products

# Show ten randomly chosen products by name.
random_products = random_recommendation(order_product, n=10)
print("Random purchase_info-Based Recommendations:")
for i, product in enumerate(random_products, 1):
    # product_name_table is indexed by product id and keeps the title in the
    # 'name' column, so match on the index rather than on a 'product_id' column.
    print(f"{i}. {product_name_table.loc[product_name_table.index == product, 'name'].values[0]}")

Random purchase_info-Based Recommendations:
1. American Apparel Stretch Twill 5 Pocket Pant
2. Tommy Bahama Men's Paradise Palm Tree Boxer Short
3. Men's Polo Ralph Lauren Tyler Plaid Red Gray Blue Green Shorts 1396441 BR
4. Vanity Fair Women's Modern Coverage Look Lifted Under Wire
5. Blue Marlin Men's Ny Curb Hoodie
6. Louis Raphael Men's Toatl Comfort Tic Pattern Flat Front Dress Pant
7. Intimo Women's Microfiber Camisole with Contrast Lace
8. Pendleton The Portland Collection Women's Wallowa Cardigan
9. Red Engine Women's Vega Short
10. Steinbock Tyrolean Sport Coat


In [70]:
# 2. Popularity baseline
def popularity_recommendation(order_product, n=10):
    """Return the ids of the ``n`` products with the highest total ``order_cnt``.

    ``order_product`` must have ``product_id`` and ``order_cnt`` columns. The
    result is a pandas Index of product ids, most purchased first.
    """
    totals = order_product.groupby('product_id')['order_cnt'].sum()
    ranked = totals.sort_values(ascending=False)
    return ranked.index[:n]

# Show the ten most purchased products by name.
popular_products = popularity_recommendation(order_product, n=10)
print("\nPopularity-Based Recommendations:")
for i, product in enumerate(popular_products, 1):
    # product_name_table is indexed by product id and keeps the title in the
    # 'name' column, so match on the index rather than on a 'product_id' column.
    print(f"{i}. {product_name_table.loc[product_name_table.index == product, 'name'].values[0]}")


Popularity-Based Recommendations:
1. Elegant Men's Two button Three piece Strip Suit
2. Life is Good Men's King Of The Grill Short Sleeve Tee
3. Mens Print Hot Body Boxer Swimsuit Gary Majdell Sport
4. Knothe - Mens Waffle Weave Robe Tan 29649
5. Buffalo by David Bitton  Men's Six Dust Wash Jean
6. Quiksilver Waterman Men's Coopers Beach Hybrid Trunk
7. Harley Davidson Vintage Classic Mens T-Shirt
8. Wrangler Rugged Wear Men's Unlined Denim Jacket Antique Navy
9. Haggar Men's Tonal Stria Pleat Front Cuff Dress Pant
10. Quiksilver Men's Ante Up Hoodie Sweatshirt Full Zip 110201-Gray


In [71]:
def recommend_product(user_index, strategy, k=10):
    """Return ``k`` recommended product ids using the named baseline.

    ``strategy`` is ``"random"`` or ``"popularity"``; anything else raises
    ``ValueError``. ``user_index`` is accepted but not used: both baselines
    are computed from the global ``order_product`` frame and do not depend
    on the user.
    """
    if strategy == "random":
        return random_recommendation(order_product, n=k)
    if strategy == "popularity":
        return popularity_recommendation(order_product, n=k)
    raise ValueError("Invalid recommendation strategy")

In [72]:
def evaluate_model(strategy, k=10):
    """Score a baseline on the test users with pooled precision and recall.

    For every position in the global ``test_user_indices`` the user's
    purchased products (positive entries in that row of
    ``order_product_matrix``) are compared with the ``k`` products returned
    by ``recommend_product``. Hits and misses are summed over all test users
    before the ratios are taken.

    Parameters
    ----------
    strategy : str
        Passed through to ``recommend_product``.
    k : int, default 10
        Number of recommendations requested per user.

    Returns
    -------
    tuple of float
        ``(precision, recall)``. A ratio whose denominator is zero (no
        recommendations made, or no purchases among the test users) is
        reported as 0.0 instead of raising ``ZeroDivisionError``.

    NOTE(review): ``recommend_product`` draws on the full ``order_product``
    frame, so the test users' own purchases influence the popularity
    ranking — confirm this is intended.
    """
    true_positive = 0
    false_positive = 0
    false_negative = 0

    for user_index in test_user_indices:
        # Fetch the user's row once; it is a wide slice of the matrix.
        purchases = order_product_matrix.iloc[user_index]
        true_products = set(purchases[purchases > 0].index)
        recommend_products = set(recommend_product(user_index, strategy, k))

        true_positive += len(true_products & recommend_products)
        false_positive += len(recommend_products - true_products)
        false_negative += len(true_products - recommend_products)

    recommended_total = true_positive + false_positive
    purchased_total = true_positive + false_negative

    precision = true_positive / recommended_total if recommended_total else 0.0
    recall = true_positive / purchased_total if purchased_total else 0.0

    return precision, recall

- precision: 추천한 상품 중 유저가 실제로 구매한 상품의 비율 (TP / (TP + FP))
- recall: 유저가 실제로 구매한 상품 중 추천 목록에 포함된 상품의 비율 (TP / (TP + FN))

In [73]:
# Evaluate the random score-based recommendation model
random_precision, random_recall = evaluate_model(strategy="random")
print(f"Random Score-Based: Precision = {random_precision:.4f}, Recall = {random_recall:.4f}")

Random Score-Based: Precision = 0.0001, Recall = 0.0004


In [74]:
# Evaluate the popularity-based recommendation model
popularity_precision, popularity_recall = evaluate_model(strategy="popularity")
print(f"Popularity-Based: Precision = {popularity_precision:.4f}, Recall = {popularity_recall:.4f}")

Popularity-Based: Precision = 0.0002, Recall = 0.0012
