<a href="https://colab.research.google.com/github/hyuna0926/RecommendSystem/blob/main/test/class_recommend.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. 필요 라이브러리 및 데이터 로드

In [1]:
! pip install implicit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting implicit
  Downloading implicit-0.6.2-cp38-cp38-manylinux2014_x86_64.whl (18.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.6/18.6 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: implicit
Successfully installed implicit-0.6.2


In [2]:
import pandas as pd
from google.colab import drive
from sklearn.model_selection import train_test_split
from scipy import sparse
from scipy.sparse import csr_matrix
from tqdm.notebook import tqdm
import numpy as np
from datetime import datetime

import warnings
# 경고 제거
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from implicit.als import AlternatingLeastSquares as ALS
import implicit
import random
import os

In [3]:
# implicit 라이브러리에서 권장하고 있는 부분입니다. 
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [11]:
path = '/content/drive/MyDrive/CP2_Phase2/'
'''
cart_purchase: 구매이력
product : 상품 정보(31121)
customer_info : 회원 정보(0~100,000)
'''
cart_purchase = pd.read_parquet(path + 'cart_purchase.parquet')
product = pd.read_parquet(path + 'product.parquet')
customer_info = pd.read_parquet(path + 'final_customer_info.parquet')

# 데이터 만들기

In [12]:
class Data:
  '''
  c_idx를 입력했을 때 원하는 데이터 나올 수 있게 하기?~!
  1. 아예 없는 경우
  2. 구매하지 않은 회원(customer_info.parquet)
  3. 20개 미만 산 회원(cart_purchase)
  4. 20개 이상 산 회원(cart_purchase)
  '''
  def __init__(self):
    pass


  def customer_info(self, c_idx):
    # 비회원
    if c_idx not in customer_info['c_idx']:
      print(c_idx, '님은 회원정보가 없습니다.')

    # 구매하지 않는 회원
    elif c_idx not in cart_purchase['c_idx']:
      print(c_idx, '님은 구매이력이 없습니다.')

    # 20개 이상 산 회원
    elif c_idx in self.customer_data()[2]['c_idx'].values: # upper
      print(c_idx, '님은 20개 이상 구매한 회원입니다. ALS 진행')
      
    elif c_idx in self.customer_data()[3]['c_idx'].values: #lower
      print(c_idx, '님은 20개 미만 구매한 회원입니다. CB 진행')

    return c_idx

  
  def customer_data(self):
    '''
    ALS와 CB 데이터 만들기

    return 
    train, test : ALS
    upper, lower : CB(20개 기준으로 나눔)
    '''
    df = cart_purchase.copy()
    df['values'] = 1  # 구매, 장바구니니까 1로 implicit 데이터

    df_group = df.groupby(['c_idx'], as_index=False).count()


    # ALS(20개 이상)
    upper = df[df['c_idx'].isin(df_group.query('values>=20').c_idx)] # 20개 이상
    test = upper.groupby('c_idx').sample(frac=0.2, random_state=42) # als_test
    train = upper.drop(test.index) # als_train
    
    # CB(20개 미만)
    lower = df[df['c_idx'].isin(df_group.query('values<20').c_idx)] # 20개 미만

    return train, test, upper, lower
  


  def no_purchase_data(self):
    '''
    회원정보는 있지만 구매하지 않은 사람을 위한 나이별 데이터 만들기
    customer_info와 cart_purchase 합치기

    return
    youth : 20대 이하 구매 이력
    age_20 : 20대 구매 이력
    age_30 : 30대 이상 구매 이력
    '''
    customer_c = customer_info.copy()
    customer_c.birthdate = customer_c.birthdate.apply(lambda x: datetime.strptime(x, '%Y-%m-%d'))
    customer_c['age']=customer_c.birthdate.apply(lambda x: datetime.today().year - x.year)

    cus_pur = pd.merge(cart_purchase, customer_c, on=['c_idx','customer_id'], how='left')

    customer_nobuy = customer_c[~customer_c['c_idx'].isin(cus_pur['c_idx'])]

    #데이터 분리
    youth = cus_pur.query("age<20")  # 20대 이하 구매 이력
    age_20 = cus_pur.query('age>=20 and age<30')  # 20대 구매 이력
    age_30 = cus_pur.query('age>=30')  # 30대 이상 구매 이력

    youth_nobuy = customer_nobuy.query("age<20")  # 20대 이하 구매 이력
    age_20_nobuy = customer_nobuy.query('age>=20 and age<30')  # 20대 구매 이력
    age_30_nobuy = customer_nobuy.query('age>=30')  # 30대 이상 구매 이력

    return youth, age_20, age_30 , youth_nobuy, age_20_nobuy, age_30_nobuy


# CB

In [13]:
class CB_recommend():
  '''
  20개 미만 구매한 고객에게 성별별로 추천하기
  tf-idf를 이용한 컨텐츠 기반 추천시스템
  '''
  def __init__(self):
    pass
  
  def CB_data(self):
    '''
    CB data 만들어주기
    성별 나눠줌
    - 남자 ['Men','Boys','Unisex'] 17712개
    - 여자 ['Women','Girls','Unisex'] 15157개
    - Unisex는 모두 포함

    return 
    Men : 남성 상품 정보
    Women : 여성 상품 정보
    product_c : 중복값을 제거한 전체 상품 정보
    '''

    # product 중복값 제거
    product_c = product.drop_duplicates(subset=['productDisplayName'], keep='first',ignore_index=True)
    # 결측값 제거
    df = product_c.copy()
    df.dropna(subset=['productDisplayName'],inplace=True)
    col = ['baseColour','season','usage']
    df[col]=df[col].astype('object')
    df[col] = df[col].fillna("unknown") #결측값 채워주기
    df[col]=df[col].astype('category')
    
    #tf-idf를 위해 컬럼 만들어주기
    df['features'] = df[['gender','articleType','baseColour','season','usage']].apply(' '.join, axis=1)

    # 성별 나눠주기
    men = ['Men','Boys','Unisex']
    women = ['Women','Girls','Unisex']
    Men = df[df['gender'].isin(men)]
    Women = df[df['gender'].isin(women)]

    # 보기 좋게 product 정보 만들어주기
    product_c[['gender','masterCategory','subCategory','articleType','baseColour']] = product_c[['gender','masterCategory','subCategory','articleType','baseColour']].astype('str')
    product_c['info'] = product_c['gender'] + '/' + product_c['masterCategory'] + '/' + product_c['subCategory'] + '/' + product_c['articleType'] + '/' + product_c['baseColour']
    product_info = product_c[['product_id','productDisplayName','info','p_idx']]

    return Men, Women, product_info

  def tfidf(self):
    '''
    tf-idf을 이용한 컨텐츠 기반 추천시스템
    남녀별로 코사인 유사도 진행 
    
    return cosine_men, cosine_women
    '''
    Men, Women, product_c = self.CB_data()

    tfidf_m = TfidfVectorizer()
    tfidf_men = tfidf_m.fit_transform(Men['features'])
    tfidf_w = TfidfVectorizer()
    tfidf_women = tfidf_w.fit_transform(Women['features'])


    #코사인 유사도
    cosine_men = pd.DataFrame(cosine_similarity(tfidf_men,tfidf_men),index = Men.p_idx, columns=Men.p_idx)
    cosine_women = pd.DataFrame(cosine_similarity(tfidf_women,tfidf_women),index = Women.p_idx, columns=Women.p_idx)

    return cosine_men, cosine_women


  def recommend(self, c_idx, k=25):
    '''
    Data클래스에 있는 데이터 들고오기
    남자/여자 나눠서 추천
    최근에 산 제품과 유사한 상품 k개 추천

    return recommend(유사한 상품 k개)
    '''
    data = Data()
    lower = data.customer_data()[3] # 20개 이하 구매한 사람

    Men, Women, product_info = self.CB_data()
    cosine_men, cosine_women = self.tfidf()

    # 고객이 최근에 산 상품
    buy_recent = lower[lower['c_idx']==c_idx].sort_values(by='created_at', ascending=False)[:1]  
    p_idx = buy_recent.p_idx.values[0]


    if p_idx in Men['p_idx']: #상품 p_idx가 Men에 들어가있으면
      men_sim = cosine_men[p_idx].sort_values(ascending=False).index
      recommend = product_info[product_info['p_idx'].isin(men_sim)][:k]
    
    else: # Women에 들어가있으면
      women_sim = cosine_women[p_idx].sort_values(ascending=False).index
      recommend = product_info[product_info['p_idx'].isin(women_sim)][:k]

    return recommend
    
   

In [14]:
cb_recommend = CB_recommend()
cb_recommend.recommend(6934)

Unnamed: 0,product_id,productDisplayName,info,p_idx
0,15970,Turtle Check Men Navy Blue Shirt,Men/Apparel/Topwear/Shirts/Navy Blue,27781
1,39386,Peter England Men Party Blue Jeans,Men/Apparel/Bottomwear/Jeans/Blue,20698
3,21379,Manchester United Men Solid Black Track Pants,Men/Apparel/Bottomwear/Track Pants/Black,16433
4,53759,Puma Men Grey T-shirt,Men/Apparel/Topwear/Tshirts/Grey,22200
5,1855,Inkfruit Mens Chain Reaction T-shirt,Men/Apparel/Topwear/Tshirts/Grey,12874
6,30805,Fabindia Men Striped Green Shirt,Men/Apparel/Topwear/Shirts/Green,8473
8,29114,Puma Men Pack of 3 Socks,Men/Accessories/Socks/Socks/Navy Blue,22387
9,30039,Skagen Men Black Watch,Men/Accessories/Watches/Watches/Black,26021
10,9204,Puma Men Future Cat Remix SF Black Casual Shoes,Men/Footwear/Shoes/Casual Shoes/Black,22136
12,18653,Fila Men Cush Flex Black Slippers,Men/Footwear/Flip Flops/Flip Flops/Black,9695


# Nobuy_recommend

In [15]:
class Nobuy_recommend:
  def __init__(self):
    pass
  
  def nobuy_recommend(self,c_idx, k=25):
    '''
    아무 정보 없는 사람과 구매 이력이 없는 사람에게 추천
    '''
    data = Data()
    youth, age_20, age_30, youth_nobuy, age_20_nobuy, age_30_nobuy  = data.no_purchase_data()

    # 보기 좋게 product info 만들어주기
    product_c = product.drop_duplicates(subset=['productDisplayName'], keep='first',ignore_index=True)
    product_c[['gender','masterCategory','subCategory','articleType','baseColour']] = product_c[['gender','masterCategory','subCategory','articleType','baseColour']].astype('str')
    product_c['info'] = product_c['gender'] + '/' + product_c['masterCategory'] + '/' + product_c['subCategory'] + '/' + product_c['articleType'] + '/' + product_c['baseColour']
    product_info = product_c[['product_id','productDisplayName','info','p_idx']]

    if c_idx not in customer_info['c_idx'].values:  # 회원정보가 없는
      item = cart_purchase.groupby('p_idx').count()['created_at'].sort_values().tail(k).index
      recommend = product_info[product_info['p_idx'].isin(item)]
      print('회원정보가 없어 가장 많이 팔린 제품을 추천합니다')

    elif c_idx in youth_nobuy['c_idx'].values: #20세 미만
      item = youth.groupby('p_idx').count().sort_values(by='created_at').tail(k).index
      recommend= product_info[product_info['p_idx'].isin(item)]
      print('회원님! 20대 미만인 고객님들이 가장 많이 구매한 상품을 추천합니다.')
  
    elif c_idx in age_20_nobuy['c_idx'].values: #20대
      item = age_20.groupby('p_idx').count().sort_values(by='created_at').tail(k).index
      recommend= product_info[product_info['p_idx'].isin(item)]
      print('20대 회원님! 20대인 고객님들이 가장 많이 구매한 상품들을 추천합니다.')
    
    else: #30대 이상
      item = age_30.groupby('p_idx').count().sort_values(by='created_at').tail(k).index
      recommend= product_info[product_info['p_idx'].isin(item)]
      print('30대 이상인 회원님! 30대 이상인 고객님들이 가장 많이 구매한 상품들을 추천합니다.')

    return recommend


In [16]:
nobuy = Nobuy_recommend()

In [17]:
nobuy.nobuy_recommend(60000)

20대 회원님! 20대인 고객님들이 가장 많이 구매한 상품들을 추천합니다.


Unnamed: 0,product_id,productDisplayName,info,p_idx
54,59435,Arrow Men Black Formal Shoes,Men/Footwear/Shoes/Formal Shoes/Black,2580
76,54118,Rocia Women Black Flats,Women/Footwear/Shoes/Heels/Black,25016
85,48781,Lucera Women Silver Pendant,Women/Accessories/Jewellery/Pendant/Silver,16317
810,47191,Franco Leone Men Black Formal Shoes,Men/Footwear/Shoes/Formal Shoes/Black,10691
948,7390,Red Tape Men Brown Shoes,Men/Footwear/Shoes/Formal Shoes/Brown,24194
1110,3398,Murcia Women Black Handbag,Women/Accessories/Bags/Handbags/Black,17273
1136,45407,Rocia Women Brown Sandals,Women/Footwear/Shoes/Flats/Brown,25049
1249,30692,Catwalk Women Brown Heels,Women/Footwear/Shoes/Heels/Brown,5280
1370,48728,Lucera Women Silver Earrings,Women/Accessories/Jewellery/Earrings/Silver,16314
1424,47794,Murcia Women Brown Handbag,Women/Accessories/Bags/Handbags/Brown,17291


# ALS(implicit 라이브러리)

In [18]:
class ALS_library:
  '''
  implicit 라이브러리를 이용한 ALS 모델
  '''

  def __init__(self,model):
    self.model=model
  
  def als_data(self):
    data = Data()
    train, test, upper, lower = data.customer_data()
    
    # csr matrix 만들기
    csr_train = sparse.csr_matrix((train['values'],(train['c_idx'], train['p_idx'])))
    csr_test = sparse.csr_matrix((test['values'],(test['c_idx'], test['p_idx'])))

    # 검증셋 만들기
    test_df = test.groupby('c_idx')['p_idx'].unique().to_frame().reset_index()

    #중복값 제거 및 보기 좋게 만들기
    product_c = product.drop_duplicates(subset=['productDisplayName'], keep='first',ignore_index=True)
    product_c[['gender','masterCategory','subCategory','articleType','baseColour']] = product_c[['gender','masterCategory','subCategory','articleType','baseColour']].astype('str')
    product_c['info'] = product_c['gender'] + '/' + product_c['masterCategory'] + '/' + product_c['subCategory'] + '/' + product_c['articleType'] + '/' + product_c['baseColour']
    product_info = product_c[['product_id','productDisplayName','info','p_idx']]
    
    return csr_train, csr_test, test_df, product_info  # 0,1,2,3


  def fit(self):
    # als 학습
    self.model.fit(self.als_data()[0])


  def recommendation(self, c_idx, k=25):
    # 추천하기  
    csr_train, csr_test, test_df, product_info = self.als_data()
    item = self.model.recommend(c_idx, csr_train[c_idx], k)[0]
    recommend = product_info[product_info['p_idx'].isin(item)]

    return recommend
  

  def mean_precisin_hit(self, user,k=25):
    csr_train, csr_test, test_df, product_c = self.als_data()

    hit = 0
    precision = 0
    users = 0
    for c_idx in range(user):
      if c_idx in test_df['c_idx'].values:
        hit_count = 0  # for문 돌 때마다 리셋
        users += 1
        recommend = self.model.recommend(c_idx, csr_train[c_idx],k)[0]
        buy_test = test_df[test_df['c_idx']==c_idx].p_idx.values[0]

        for i in buy_test:
          for j in recommend:
            if i==j:
              hit_count+=1
              precision+=1

        if hit_count >= 1: # count가 1 이상이면 hit한 것이니까 전체에 1 추가
          hit+=1

    hit_rate = hit/users
    mean_precision = (precision/25)/users

    return hit_rate, mean_precision




In [45]:
factor=180;alpha=1;regularization=32;iteration=20
ALS = implicit.als.AlternatingLeastSquares(factors=factor, alpha=alpha, regularization=regularization,
                                            iterations=iteration, random_state=42,calculate_training_loss=True)

In [46]:
als = ALS_library(ALS)
als.fit()

  0%|          | 0/20 [00:00<?, ?it/s]

In [47]:
als.recommendation(800,10)

Unnamed: 0,product_id,productDisplayName,info,p_idx
54,59435,Arrow Men Black Formal Shoes,Men/Footwear/Shoes/Formal Shoes/Black,2580
76,54118,Rocia Women Black Flats,Women/Footwear/Shoes/Heels/Black,25016
948,7390,Red Tape Men Brown Shoes,Men/Footwear/Shoes/Formal Shoes/Brown,24194
1110,3398,Murcia Women Black Handbag,Women/Accessories/Bags/Handbags/Black,17273
1249,30692,Catwalk Women Brown Heels,Women/Footwear/Shoes/Heels/Brown,5280
1606,11922,Franco Leone Men Formal Black Formal Shoes,Men/Footwear/Shoes/Formal Shoes/Black,10714
2082,54510,Lino Perros Women Brown Handbag,Women/Accessories/Bags/Handbags/Brown,15197
2186,43934,Catwalk Women Black Heels,Women/Footwear/Shoes/Heels/Black,5246
2785,45406,Rocia Women Black Sandals,Women/Footwear/Shoes/Flats/Black,25021
5127,22846,Woodland Men Khaki Casual Shoes,Men/Footwear/Shoes/Casual Shoes/Khaki,30254


In [57]:
pur_800=cart_purchase[cart_purchase['c_idx']==800].p_idx.values

In [61]:
product[product.p_idx.isin(pur_800)]

Unnamed: 0,product_id,productDisplayName,info,p_idx
1,39386,Peter England Men Party Blue Jeans,Men/Apparel/Bottomwear/Jeans/Blue,20698
1064,57109,Elle Women Blue Tunic,Women/Apparel/Topwear/Tunics/Blue,7601
3826,30261,Red Tape Men Brown Formal Shoes,Men/Footwear/Shoes/Formal Shoes/Brown,24189
4211,43935,Catwalk Women White Heels,Women/Footwear/Shoes/Heels/White,5501
4310,31382,Locomotive Men Printed White T-shirt,Men/Apparel/Topwear/Tshirts/White,15601
4509,5839,Puma Men's Motorsport Better Blue T-shirt,Men/Apparel/Topwear/Tshirts/Blue,22881
4615,35173,Wildcraft Unisex Black & Red Duffel Bag,Unisex/Accessories/Bags/Duffel Bag/Red,29964
4670,36957,Force 10 Men White Sports Shoes,Men/Footwear/Shoes/Sports Shoes/White,10353
4972,23500,FILA Men Hostile White Sports Shoes,Men/Footwear/Shoes/Sports Shoes/White,8190
5000,12081,W Women Printed Purple Kurtas,Women/Apparel/Topwear/Kurtas/Purple,29738


In [48]:
als.mean_precisin_hit(50000)

(0.28252014121480945, 0.015765366162759118)

# tunning


In [None]:
class Tunning:
  def __init__(self):
    pass
  
  def random_search(self,params, n_iters=10):
      results =[]
      for _ in tqdm(range(n_iters)):
        factor = np.random.choice(params['factor'])
        alpha = np.random.choice(params['alpha'])
        regularization = np.random.choice(params['regularization'])
        iteration = np.random.choice(params['iteration'])

        ALS = implicit.als.AlternatingLeastSquares(factors=factor, alpha=alpha, regularization=regularization, iterations=iteration,
                  random_state=42, calculate_training_loss=True)

        als = ALS_library(ALS)
        als.fit()
        hit_rate, mean_precision = als.mean_precisin_hit(5000)

        
        result = [factor, alpha, regularization, iteration, hit_rate, mean_precision]
        results.append(result)
      frame = pd.DataFrame(results, columns=['factor','alpha','regularization','iteration','hit_rate','mean_precision'])
      frame = frame.sort_values('hit_rate', ascending=False)

      return frame
  
  def grid_search(self, params):
    results =[]
    for factor in tqdm(params['factor']):
      for alpha in params['alpha']:
        for regularization in params['regularization']:
          ALS = implicit.als.AlternatingLeastSquares(factors=factor, alpha=alpha, regularization=regularization, iterations=20,
                random_state=42, calculate_training_loss=True)
          als = ALS_library(ALS)
          als.fit()
          hit_rate, mean_precision = als.mean_precisin_hit(5000)

          
          result = [factor, alpha, regularization, iteration, hit_rate, mean_precision]
          results.append(result)
        frame = pd.DataFrame(results, columns=['factor','alpha','regularization','iteration','hit_rate','mean_precision'])
        frame = frame.sort_values('hit_rate', ascending=False)
    
    return frame

In [None]:
params_grid = {
    'factor': [160,180,200],
    'alpha' : [1,3,5],
    'regularization' : [28,30,32],
    'iteration' : np.arange(15,31,5)
}
tunner = Tunning()
tunner.grid_search(params_grid)

In [None]:
params_random = {
    'factor': np.arange(90,210,10),
    'alpha' : np.arange(1,21,4),
    'regularization' : np.arange(1,30,5),
    'iteration' : np.arange(15,31,5)
}
tunner = Tunning()
tunner.random_search(params_random,n_iters=3)

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,factor,alpha,regularization,iteration,hit_rate,mean_precision
2,160,5,21,20,0.168237,0.008057
1,120,9,11,15,0.159713,0.007609
0,160,13,6,25,0.152086,0.00725
