In [28]:
import pandas as pd, numpy as np
import pymysql,  os, pickle
from scipy import sparse
from tqdm import tqdm
from collections import Counter

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.max_columns', None)

import warnings
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

pd.set_option('mode.chained_assignment',  None) # Setting With Copy Warning

from implicit.als import AlternatingLeastSquares

# implicit 라이브러리에서 권장사항입니다.
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
os.environ['MKL_NUM_THREADS'] = '1'

def live_db_conn():
    conn = pymysql.connect(host='host', user='user', password='password', autocommit=True, cursorclass=pymysql.cursors.DictCursor, db="db")
    return conn

def make_dic(something_list):

    name2idx = {}
    idx2name = {}

    for i in range(len(something_list)):

        name2idx[something_list[i]] = i
        idx2name[i] = something_list[i]

    return name2idx, idx2name

def load_token(path, x, y):

    x_name2idx = pickle.load(open(path + '{}_name2idx.pkl'.format(x), 'rb'))
    x_idx2name = pickle.load(open(path + '{}_idx2name.pkl'.format(x), 'rb'))
    y_name2idx = pickle.load(open(path + '{}_name2idx.pkl'.format(y), 'rb'))
    y_idx2name = pickle.load(open(path + '{}_idx2name.pkl'.format(y), 'rb'))

    return x_name2idx, x_idx2name, y_name2idx, y_idx2name

def load_cf(path, user_based_model_name, item_based_model_name):

    ub_model = pickle.load(open(path + user_based_model_name, 'rb'))
    ib_model = pickle.load(open(path + item_based_model_name, 'rb'))
 
    return ub_model, ib_model

def tag_and_type(product_id_list):

    conn = live_db_conn()
    curs = conn.cursor()

    if len(product_id_list) > 1:
        sql = """
        SELECT p.store_id, p.id as product_id, group_concat(ptm.id separator ' ') as ptm_ids, group_concat(pt.type separator ' ') as tag_type
        FROM product p
        INNER JOIN product_tag pt
        INNER JOIN product_tag_master ptm
        on pt.product_id=p.id
        and pt.product_tag_master_id = ptm.id
        WHERE p.id in {}
        """.format(tuple(product_id_list))
    
    else:

        sql = """
        SELECT p.store_id, p.id as product_id, group_concat(ptm.id separator ' ') as ptm_ids, group_concat(pt.type separator ' ') as tag_type
        FROM product p
        INNER JOIN product_tag pt
        INNER JOIN product_tag_master ptm
        on pt.product_id=p.id
        and pt.product_tag_master_id = ptm.id
        WHERE p.id = {}
        """.format(product_id_list[0])

    curs.execute(sql)
    df = pd.DataFrame(curs.fetchall())

    curs.close()
    conn.close()
    
    return df


def rec_product_tag(product_list):

    conn = live_db_conn()
    curs = conn.cursor()

    if len(product_list) > 1:
        sql = """
        SELECT p.store_id as store_id, p.id as product_id, group_concat(ptm.id separator ' ') as ptm_ids, group_concat(pt.type separator ' ') as tag_type
        FROM product p
        INNER JOIN product_tag pt
        INNER JOIN product_tag_master ptm
        on pt.product_id=p.id
        and pt.product_tag_master_id = ptm.id
        WHERE p.id in {} and pt.type != 'option'
        group by p.id
        """.format(tuple(product_list))

    else:
        sql = """
        SELECT p.store_id as store_id, p.id as product_id, group_concat(ptm.id separator ' ') as ptm_ids, group_concat(pt.type separator ' ') as tag_type
        FROM product p
        INNER JOIN product_tag pt
        INNER JOIN product_tag_master ptm
        on pt.product_id=p.id
        and pt.product_tag_master_id = ptm.id
        WHERE p.id = {} and pt.type != 'option'
        group by p.id
        """.format(product_list[0])

    curs.execute(sql)
    df = pd.DataFrame(curs.fetchall())

    curs.close()
    conn.close()

    return df

def get_sim_df(input_df, rec_df):    

    vectorizer = CountVectorizer(min_df = 1)

    full_text = input_df['ptm_ids'].tolist() + rec_df['ptm_ids'].tolist()
    X = vectorizer.fit_transform(full_text)
    index_list = input_df['product_id'].tolist() + rec_df['product_id'].tolist()

    df = pd.DataFrame(
                            data= X.todense(),
                            index = index_list,
                            columns = vectorizer.get_feature_names_out()
                        )

    input_product = input_df['product_id'][0]

    input_vector = df[df.index==input_product]

    sim_value = []

    for i in range(len(df)):
        v = np.linalg.norm(input_vector-df.iloc[i])
        sim_value.append(round(v,2))
  
    df['CB 유사도'] =  sim_value

    result = df[['CB 유사도']]
    result = result.reset_index().rename(columns={'index':'product_id'})
    result.sort_values(by='CB 유사도',ascending=True, inplace=True)
    result.reset_index(inplace=True)
    result.drop('index',axis=1,inplace=True)

    return result

def type_weight(ptm, t_type):

    weight_list = []
    weighted_dic = {'ingredient': 7, 'sauce':5, 'cooking':4, 'option':3}
    
    for i in range(len(ptm)):
        for j in range(weighted_dic[t_type[i]]):
            weight_list.append(ptm[i])

    return weight_list

def precision_k(gt, predict, k=10):

    intersection = list(set(gt) & set(predict[:k]))

    return len(intersection) / k

def rel_k(gt, predict, k=10):

    return 1 if predict[k-1] in gt else 0

def evaluate_map(gt, predict, k=10):
    
    ap = 0
    for i in range(1, min(k, len(gt)) + 1):
        
        ap += precision_k(gt, predict, i) * rel_k(gt, predict, i)

    return ap / min(k, len(gt))

def evaluate_recall(gt, predict, k=10):

    intersection = list(set(gt) & set(predict))

    return min(len(intersection) / min(k, len(gt)),1.0)

def product_tag_set(x):

    key_list = []
    label_count_dic = Counter(x)

    for k,v in label_count_dic.items():

        if v > 1:

            key_list.append(k)

    conn = live_db_conn()
    curs = conn.cursor()

    sql = """
    SELECT product_id, group_concat(product_tag_master_id separator ',') as tag_set  FROM product_tag
    WHERE product_id in {} and type != 'option'
    group by product_id
    """.format(tuple(x))
    curs.execute(sql)

    df = pd.DataFrame(curs.fetchall())

    curs.close()
    conn.close()
    
    df['tag_set'] = df['tag_set'].apply(lambda x:set(x.split(',')))
    
    for key in key_list:
        ki = label_count_dic[key]-1
        for i in range(label_count_dic[key]-1):
            if len(df[df['product_id'] == key]) != 0:
                tmp = pd.DataFrame([df[df['product_id'] == key].iloc[0]])
                df = pd.concat([df, tmp],axis=0)

    tag_set_list = [df[df['product_id'] == i]['tag_set'].values[0] for i in x]
    tag_set_list = list(map(tuple, tag_set_list))
    
    return tag_set_list

def tag_based_precision_k(gt, predict, k=10):
    # [1,1,2,3] [1,2,3,4]
    count = 0 
    for pred in predict[:k]:

        if pred in set(gt):

            count += 1

    return count / k

def tag_based_rel_k(gt, predict, k=10):

    return 1 if predict[k-1] in gt else 0

def tag_based_evaluate_map(gt, predict, k=10):
    
    ap = 0
    for i in range(1, min(k, len(gt)) + 1):
        
        ap += tag_based_precision_k(gt, predict, i) * tag_based_rel_k(gt, predict, i)

    return ap / min(k, len(gt))

def tag_based_evaluate_recall(gt, predict, k=10):

    count = 0

    for pred in predict:

        if pred in set(gt):

            count += 1

    return min(count / min(k, len(gt)),1.0)

def load_data():

    conn = live_db_conn()
    curs = conn.cursor()

    sql = """
    SELECT o.id, o.user_id as reviewer_name, o.store_id as live_store_id, o.status, op.product_id, op.status as reviewer_stars, op.updated_at, v.reserved_at
    FROM `order` o INNER JOIN order_product op on o.id = op.order_id
                INNER JOIN product p on op.product_id = p.id
                INNER JOIN voucher v on o.id = v.order_id
    where p.status = 'normal' and p.sale_status = 'sale'
    """
    curs.execute(sql)

    df = pd.DataFrame(curs.fetchall())

    curs.close()
    conn.close()
    status_dic = {'normal':5,'cancel':1}

    df['reviewer_stars'] = df['reviewer_stars'].apply(lambda x:status_dic[x])

    df = df[['id','live_store_id','product_id','reviewer_name','reviewer_stars', 'updated_at', 'reserved_at']]
    df.dropna(subset='updated_at', inplace=True)

    return df

#### 1. Data Loading & Cleansing

1. 데이터 불러오기

In [46]:
total_review = load_data()
print('리뷰 개수:',len(total_review))
print('유저 수:',len(total_review['reviewer_name'].unique().tolist()))
print('매장 수:',len(total_review['live_store_id'].unique().tolist()))
print('상품 수:',len(total_review['product_id'].unique().tolist()))

리뷰 개수: 55002
유저 수: 11321
매장 수: 431
상품 수: 4576


2. 학습 데이터 정제: tag_type이 존재하지 않는, 학습에 적절하지 않은 상품들에 대해 학습 데이터에서 제외

In [31]:
product_in_review = total_review['product_id'].unique().tolist()

pir_tag_type = rec_product_tag(product_in_review)

pir_tag_type.dropna(subset='tag_type', inplace=True)

final_product = pir_tag_type['product_id'].tolist()

total_review = total_review[total_review['product_id'].isin(final_product)]

In [32]:
total_review.sort_values(by=['id','reviewer_name','product_id','updated_at'], ascending=[True,False,False,False], inplace=True)
total_review = total_review.drop_duplicates(subset=['id','reviewer_name','product_id'], keep='first')

In [33]:
total_review = total_review.groupby(['live_store_id','product_id','reviewer_name','reserved_at'])['reviewer_stars'].mean().reset_index()

3. 학습 / 테스트 데이터 분할

In [34]:
user_Count = total_review['reviewer_name'].value_counts().reset_index()

test_target_review = total_review[total_review['reviewer_name'].isin(user_Count[user_Count['count'] >= 13]['reviewer_name'].tolist())]
test_target_review.reset_index(inplace=True)

test_review = test_target_review.sort_values(by=['reviewer_name','reserved_at'], ascending=[False,False]).groupby('reviewer_name').head(10)
add_train_review = test_target_review[~test_target_review['index'].isin(test_review['index'].tolist())]
train_review = total_review[~total_review['reviewer_name'].isin(test_target_review['reviewer_name'].unique().tolist())]

user_Count_2 = train_review['reviewer_name'].value_counts().reset_index()

final_train_review = train_review[train_review['reviewer_name'].isin(user_Count_2[user_Count_2['count'] >= 3]['reviewer_name'].tolist())]

train = pd.concat([final_train_review, add_train_review], axis=0)
test = test_review

print('학습 데이터 개수:', len(train))
print('테스트 데이터 개수:', len(test))

학습 데이터 개수: 13771
테스트 데이터 개수: 880


#### 2. Data Preprocessing

1. Reviewer-Store Dictionary 생성 및 저장

In [35]:
reviewer_list = train['reviewer_name'].unique().tolist()
product_list = train['product_id'].unique().tolist()

reviewer_name2idx, reviewer_idx2name = make_dic(reviewer_list)
product_name2idx, product_idx2name = make_dic(product_list)

In [36]:
# 저장
pickle.dump(reviewer_name2idx, open('../PICKLE/reviewer_name2idx.pkl', 'wb'))
pickle.dump(reviewer_idx2name, open('../PICKLE/reviewer_idx2name.pkl', 'wb'))
pickle.dump(product_name2idx, open('../PICKLE/product_name2idx.pkl', 'wb'))
pickle.dump(product_idx2name, open('../PICKLE/product_idx2name.pkl', 'wb'))

# 불러오기
reviewer_name2idx = pickle.load(open('../PICKLE/reviewer_name2idx.pkl', 'rb'))
reviewer_idx2name = pickle.load(open('../PICKLE/reviewer_idx2name.pkl', 'rb'))
product_name2idx = pickle.load(open('../PICKLE/product_name2idx.pkl', 'rb'))
product_idx2name = pickle.load(open('../PICKLE/product_idx2name.pkl', 'rb'))

2. Collaborative Filtering - Alternating Least Squares 모델 학습을 위한 전처리

In [37]:
train['reviewer_name'] = train['reviewer_name'].apply(lambda x:reviewer_name2idx[x])
train['product_id'] = train['product_id'].apply(lambda x:product_name2idx[x])

train[['product_id', 'reviewer_name', 'reviewer_stars']] = train[['product_id', 'reviewer_name', 'reviewer_stars']].astype(float)
train[['product_id', 'reviewer_name', 'reviewer_stars']] = train[['product_id', 'reviewer_name', 'reviewer_stars']].astype(int)
train.reset_index(inplace=True)

train.drop('index',axis=1,inplace=True)

In [38]:
num_reviewer = train['reviewer_name'].nunique()
num_product = train['product_id'].nunique()

#### 3. Modeling

1. User-Based Als_Model 학습 및 저장

In [39]:
reviewer_product = csr_matrix((train['reviewer_stars'].values, (train.reviewer_name, train.product_id)), shape= (num_reviewer, num_product))

# Implicit Alternating Least Squares 모델 선언
ub_als = AlternatingLeastSquares(factors=256, regularization=0.01, use_gpu=False, iterations=10, dtype=np.float32)

# 모델 학습
ub_als.fit(reviewer_product)

# 모델 저장
pickle.dump(ub_als, open('../MODEL/ALS_USER_BASED', 'wb'))

  0%|          | 0/10 [00:00<?, ?it/s]

2. Item-Based Als_Model 학습 및 저장

In [40]:
product_reviewer = csr_matrix((train['reviewer_stars'].values, (train.product_id, train.reviewer_name)), shape= (num_product, num_reviewer))

# Implicit Alternating Least Squares 모델 선언
ib_als = AlternatingLeastSquares(factors=256, regularization=0.01, use_gpu=False, iterations=10, dtype=np.float32)

# 모델 훈련
ib_als.fit(product_reviewer)

# 모델 저장
pickle.dump(ib_als, open('../MODEL/ALS_ITEM_BASED', 'wb'))

  0%|          | 0/10 [00:00<?, ?it/s]

#### 4. TEST

In [41]:
test_id = test['reviewer_name'].unique().tolist()

In [42]:
# 필요 딕셔너리, 모델 
reviewer_name2idx, reviewer_idx2name, product_name2idx, product_idx2name = load_token(path='../PICKLE/', x='reviewer', y='product')
als_ub, als_ib = load_cf(path='../MODEL/', user_based_model_name='ALS_USER_BASED', item_based_model_name='ALS_ITEM_BASED')

# 성능 확인을 위한 리스트 
result_list = []

for reviewer_name in tqdm(test_id):

    reviewer_idx = reviewer_name2idx[reviewer_name]
    
    # User-Based Alternating Least Squares Model => User Clustering 
    similar_users = als_ub.similar_users(reviewer_idx, len(als_ub.user_factors))[0].tolist()
    
    # 비슷한 유저가 구매한 상품이(이후 추천 후보 상품) 10개 이상이 되는 최소 유저의 수를 역으로 계산
    product_unique = []
    store_unique = []

    for idx, user_idx in enumerate(similar_users):

        target_df = train[train['reviewer_name'] == user_idx]
        target_product_list = target_df['product_id'].unique().tolist()
        target_store_list = target_df['live_store_id'].unique().tolist()
        product_unique += target_product_list
        store_unique += target_store_list
        if len(set(store_unique)) >= 10: break
    
    # 추천 후보 상품의 TAG
    target_user_list = similar_users[:idx+1]
    target_df = train[train['reviewer_name'].isin(target_user_list)]
    target_product_list = target_df['product_id'].unique().tolist()
    target_product_list = [product_idx2name[i] for i in target_product_list]
    target_tag_df = rec_product_tag(target_product_list)

    # 추천 타겟 유저가 구매한 상품들의 TAG
    input_df = train[train['reviewer_name']==reviewer_idx]
    input_product_list = input_df['product_id'].unique().tolist()
    input_product_list = [product_idx2name[i] for i in input_product_list]
    input_tag_df = tag_and_type(input_product_list)
    input_tag_df['store_id'] = 0
    input_tag_df['product_id'] = 0

    # 추천 타겟 유저가 구매한 상품이 없거나, 그 상품들의 태그가 없거나, 추천 후보 상품이 없는 예외 케이스
    if len(input_tag_df) == 0 or input_tag_df.iloc[0]['ptm_ids'] == None or len(target_tag_df) == 0: continue

    # 추천 타겟 유저가 구매한 상품들의 TAG TYPE 가중치 반영
    input_ptm = input_tag_df.iloc[0]['ptm_ids'].split(' ')
    input_type = input_tag_df.iloc[0]['tag_type'].split(' ')

    input_type_weight_list = type_weight(input_ptm, input_type)

    input_tag_df.at[0,'ptm_ids'] = ' '.join(input_type_weight_list)

    # 추천 후보 상품의 TAG TYPE 가중치 반영
    for k in range(len(target_tag_df)):
        target_ptm = target_tag_df.iloc[k]['ptm_ids'].split(' ')
        target_type = target_tag_df.iloc[k]['tag_type'].split(' ')

        target_type_weight_list = type_weight(target_ptm, target_type)
        
        target_tag_df.at[k,'ptm_ids'] = ' '.join(target_type_weight_list)

    # 추천 타겟 유저 구매 상품과 추천 후보 상품의 유사도 계산 => 낮을 수록 비슷한 상품  
    result = get_sim_df(input_tag_df, target_tag_df)
    
    product_store_dic = {}
    
    target_df['product_id'] = target_df['product_id'].apply(lambda x:product_idx2name[x])
    for vals in target_df.values:
        product_store_dic[vals[2]] = vals[1]

    result = result[result['product_id'] != 0]
    result['store_id'] = result['product_id'].apply(lambda x:product_store_dic[x])

    # result.drop_duplicates(subset='store_id', inplace=True)
    result.sort_values(by='CB 유사도', ascending=True, inplace=True)
    
    pred = result.head(10)['product_id'].values.tolist()
    label = test[test['reviewer_name']==reviewer_name]['product_id'].values.tolist()
    result_list.append([reviewer_idx,label,pred])    

  0%|          | 0/88 [00:00<?, ?it/s]

100%|██████████| 88/88 [00:06<00:00, 13.61it/s]


In [102]:
final_result = pd.DataFrame(result_list,columns=['user_id','label','pred'])


final_result['PRODUCT_BASED:mAP@10'] = final_result.apply(lambda x: evaluate_map(x['label'],x['pred'],10), axis = 1)
final_result['PRODUCT_BASED:Recall@10'] = final_result.apply(lambda x: evaluate_recall(x['label'],x['pred'],10), axis = 1)


final_result['label_tag_set'] = final_result['label'].apply(lambda x:product_tag_set(x))
final_result['pred_tag_set'] = final_result['pred'].apply(lambda x:product_tag_set(x))

final_result['TAG_BASED:mAP@10'] = final_result.apply(lambda x: tag_based_evaluate_map(x['label_tag_set'],x['pred_tag_set'],10), axis = 1)
final_result['TAG_BASED:Recall@10'] = final_result.apply(lambda x: tag_based_evaluate_recall(x['label_tag_set'],x['pred_tag_set'],10), axis = 1)

product_based_result = [
    ['10',final_result['PRODUCT_BASED:mAP@10'].mean(),final_result['PRODUCT_BASED:Recall@10'].mean()]
                 ]
tag_based_result = [
    ['10',final_result['TAG_BASED:mAP@10'].mean(),final_result['TAG_BASED:Recall@10'].mean()]
                 ]

Performace_List = tag_based_result[0]

k = Performace_List[0]
mAP = Performace_List[1]
Recall = Performace_List[2]

print('k: {}'.format(k))
print('mAP@10: {}'.format(mAP))
print('Recall@10: {}'.format(Recall)) 

k: 10
mAP@10: 0.25702065295815296
Recall@10: 0.36250000000000004


In [93]:
def sample_check(user_id):
    
    label_list = final_result[final_result['user_id'] == user_id]['label'].tolist()[0]
    pred_list = final_result[final_result['user_id'] == user_id]['pred'].tolist()[0]

    conn = live_db_conn()
    curs = conn.cursor()

    sql = """
    SELECT id, name from product where id in {}
    """.format(tuple(label_list))
    curs.execute(sql)

    label_df = pd.DataFrame(curs.fetchall())

    sql = """
    SELECT id, name from product where id in {}
    """.format(tuple(pred_list))
    curs.execute(sql)

    pred_df = pd.DataFrame(curs.fetchall())

    curs.close()
    conn.close()


    label_name = label_df['name'].unique().tolist()
    pred_name = pred_df['name'].unique().tolist()

    print('[유저가 먹은 음식]')

    for i in label_name:
        print(i)

    print('\n')
    print('[추천 받은 음식]')
    for j in pred_name:
        print(j)

In [98]:
sample_check(2090)

[유저가 먹은 음식]
수육(大)
간장 갈비찜(大)
나주곰탕
갈비탕
나주육전


[추천 받은 음식]
한우나주곰탕
한우 불고기쌈밥
돼지 불고기쌈밥
매운 갈비찜(中)
나주곰탕
나주곰탕(특)
갈비탕
나주육전
특곰탕
