In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# 추천방식 (Content-Based Model)

In [None]:
# Load the purchase table; later cells read its user_id, product_name and
# product_id columns (among others) from this frame.
data = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/foodrec/category_data_test.csv')

In [None]:
# Count how many rows of `data` mention each product name.
#
# `rename_axis` + `reset_index(name=...)` pins the resulting columns to exactly
# ('product_name', 'count') on every pandas version. The previous rename-by-dict
# relied on the pandas < 2.0 column names ('index', 'product_name'); on
# pandas >= 2.0 `value_counts()` already produces 'product_name' / 'count', so
# that mapping would turn them into 'count' / 'category' and the later merge on
# 'product_name' would use the wrong column.
product_count = (
    data['product_name']
    .value_counts()
    .rename_axis('product_name')
    .reset_index(name='count')
)
product_count

Unnamed: 0,product_name,count
0,Annie Peely Fruit 4.5 Z,155
1,Organic Brown Rice Cake Salt-Free,155
2,Hampshire 100% Natural Sour Cream,155
3,Gluten Free Sliced Mountain White Bread,155
4,Grassmilk 2% Reduced Fat Milk,155
...,...,...
13612,Organic Red Wine Vinegar,7
13613,Peanut Butter Chocolate Candy,7
13614,Corn Oil,7
13615,Imitation Crab Flakes,7


In [None]:
# Attach the per-product purchase count to every row of `data`.
change_data = data.merge(product_count, on="product_name")

# Columns removed before de-duplicating; what remains is shown in the cell output.
columns_to_discard = [
    'add_to_cart_order', 'reordered', 'aisle_id', 'department_id', 'order_id', 'user_id',
    'eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order',
    'aisle', 'department',
]
df = change_data.drop(columns=columns_to_discard).drop_duplicates()

# Highest purchase count first. The original row labels are kept, so later cells
# address products by their position in this frame rather than by its index.
final_df = df.sort_values(by='count', ascending=False)
final_df

Unnamed: 0,product_id,product_name,category,count
175621,20169,Annie Peely Fruit 4.5 Z,snacks,155
159022,32052,Organic Brown Rice Cake Salt-Free,snacks,155
18792,3800,Hampshire 100% Natural Sour Cream,cheese,155
374929,28058,Gluten Free Sliced Mountain White Bread,bakery,155
37232,19820,Grassmilk 2% Reduced Fat Milk,milk,155
...,...,...,...,...
327475,31267,Organic Red Wine Vinegar,pantry,7
199308,21815,Peanut Butter Chocolate Candy,snacks,7
327539,6138,Corn Oil,pantry,7
403980,49436,Imitation Crab Flakes,packaged seafood,7


In [None]:
# Popularity baseline: the first 500 rows of the count-sorted product table.
popular_rec_model = final_df.iloc[:500]
popular_rec_model.head(n=20)

Unnamed: 0,product_id,product_name,category,count
175621,20169,Annie Peely Fruit 4.5 Z,snacks,155
159022,32052,Organic Brown Rice Cake Salt-Free,snacks,155
18792,3800,Hampshire 100% Natural Sour Cream,cheese,155
374929,28058,Gluten Free Sliced Mountain White Bread,bakery,155
37232,19820,Grassmilk 2% Reduced Fat Milk,milk,155
122816,39055,Mild Salsa Roja,deli,155
247693,49273,Light and Lean Quinoa Black Beans with Buttern...,frozen meals,155
208634,43568,Traditional Italian Sauce,pasta sauce,155
158152,7361,Herb Crackers,snacks,155
417149,25630,Healthy Grains Fiber Cinnamon Oat Clusters,breakfast,155


In [None]:
# Load the train / test customer tables (both carry a user_id column, which
# the next cell joins on).
train, test = map(
    pd.read_csv,
    (
        '/content/drive/MyDrive/foodrec/train_customer.csv',
        '/content/drive/MyDrive/foodrec/test_customer.csv',
    ),
)

In [None]:
# Left-join each customer table with the products found in `data` for the same
# user_id; customers without any matching row keep NaN product columns.
# NOTE(review): this cell rebinds `train` / `test`, so running it a second time
# joins the product columns again - re-run the loading cell first.
user_products = data[['user_id', 'product_name', 'product_id']]
train = train.merge(user_products, how='left', on='user_id')
test = test.merge(user_products, how='left', on='user_id')

In [None]:
# Ground truth for evaluation: user_id -> list of distinct product names found
# for that user in `test`.
#
# `sol` keeps its original layout (columns 'user_id' and 'unique'). The dict is
# built in a single pass over `sol`; the previous loop re-filtered the whole
# frame once per user, which is quadratic in the number of users.
sol = (
    test.groupby('user_id')['product_name']
    .unique()
    .reset_index(name='unique')
)
gt = {user: list(names) for user, names in zip(sol['user_id'], sol['unique'])}

  0%|          | 0/110227 [00:00<?, ?it/s]

In [None]:
# One row per distinct training user.
rec_df = pd.DataFrame({'user_id': train['user_id'].unique()})

## TF-IDF 이용

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorise every product name with TF-IDF (English stop words removed).
# Row k of the matrix corresponds to the k-th row of `final_df`.
product_name_corpus = final_df['product_name']
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(product_name_corpus)
print(tfidf_matrix.shape)

(13617, 4062)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all product-name vectors; with a single
# argument, scikit-learn compares the matrix against itself.
cosine_matrix = cosine_similarity(tfidf_matrix)
cosine_matrix.shape

(13617, 13617)

In [None]:
# product2id: row position in `final_df` (0-based, same order as the rows of
# `cosine_matrix`) -> product name. Despite its name, the key is the position.
product2id = dict(enumerate(final_df['product_name']))

# id2product: product name -> row position (the inverse of the mapping above;
# if a name occurs more than once, the last position wins).
id2product = {name: position for position, name in product2id.items()}

# proid2pro: product name -> product_id.
proid2pro = dict(zip(final_df['product_name'].values, final_df['product_id'].values))

In [None]:
# Peek at the first product names, in the order used for the position mappings.
final_df['product_name'].head()

175621                    Annie Peely Fruit 4.5 Z
159022          Organic Brown Rice Cake Salt-Free
18792           Hampshire 100% Natural Sour Cream
374929    Gluten Free Sliced Mountain White Bread
37232               Grassmilk 2% Reduced Fat Milk
Name: product_name, dtype: object

In [None]:
# Example: the ten products whose names are most similar to 'Corn Oil'.
idx = id2product['Corn Oil']
sim_scores = sorted(
    (
        (product2id[position], score)
        for position, score in enumerate(cosine_matrix[idx])
        if position != idx  # leave out the query product itself
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
sim_scores[:10]

[('100% Pure Corn Oil', 0.7092861311433707),
 ('Coconut Oil', 0.5204696399867038),
 ('Peanut Oil', 0.513638365921127),
 ('Vegetable Oil', 0.5033589972360326),
 ('Whole Baby Corn', 0.48702193703222874),
 ('Organic Coconut Oil', 0.4832825929896025),
 ('Organic Peanut Oil', 0.4777986375195055),
 ('Organic Sweet Corn', 0.46776906901215715),
 ('Popped Corn Just The Cheese Corn', 0.4672216806596919),
 ('Organic Fire Roasted Corn', 0.4600845045577137)]

In [None]:
# Keep only the training rows that actually carry a product name, then record
# each product's row position in the similarity matrix.
has_product_name = train['product_name'].notna()
tf_train = train.loc[has_product_name].reset_index(drop=True)
tf_train['idx2product'] = tf_train['product_name'].map(lambda name: id2product[name])
tf_train.head()

Unnamed: 0,user_id,customer_chracter,product_name,product_id,idx2product
0,1,breakfast,Pistachios,10258,744
1,1,breakfast,Aged White Cheddar Popcorn,26088,233
2,1,breakfast,Cinnamon Toast Crunch,13032,24
3,2,snacks,Asian Pears,13640,115
4,2,snacks,Organic Cashew Carrot Ginger Soup,1757,1218


In [None]:
# Similarity-matrix row position -> product_id, for products that occur in tf_train.
idx2id2product = dict(zip(tf_train['idx2product'].values, tf_train['product_id'].values))

In [None]:
# 1. Collect, per user, the distinct products they bought (as matrix positions).
user = 1
bought_list = tf_train.groupby('user_id')['idx2product'].agg(['unique']).reset_index()
is_selected_user = bought_list['user_id'] == user
bought = bought_list.loc[is_selected_user, 'unique'].values[0]
bought

array([744, 233,  24])

In [None]:
# 2. Inspect the similarity row of one purchased product (position 744 is the
#    first entry of `bought` in the output above).
cosine_matrix[744]

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# 3. Add up the similarity rows of every purchased product, giving one combined
#    score per product in the catalogue.
total_cosine_sim = np.zeros(len(product2id))
for bought_position in bought:
    np.add(total_cosine_sim, cosine_matrix[bought_position], out=total_cosine_sim)
np.max(total_cosine_sim)  # sanity check

1.0

In [None]:
# 4. Rank the products by combined similarity, highest first, skipping the
#    positions the user already bought.
sim_scores = sorted(
    (
        (position, score)
        for position, score in enumerate(total_cosine_sim)
        if position not in bought
    ),
    key=lambda pair: pair[1],
    reverse=True,
)
sim_scores[:5]

[(1587, 0.8862463382783108),
 (11468, 0.7734999368370705),
 (13357, 0.758208053005512),
 (8675, 0.7527555167202933),
 (3448, 0.7391598414436177)]

In [None]:
# Name of the top-ranked position from the previous cell (1587): the closest
# match that surfaces when 'Aged White Cheddar Popcorn' is among the purchases.
print(product2id[1587])

Vegan Aged White Cheddar  Popcorn


In [None]:
# Users that have at least one named product, i.e. those eligible for the
# TF-IDF based recommendation.
tf_train['user_id'].unique()

array([     1,      2,      5, ..., 206203, 206205, 206209])

In [None]:
# Build the final recommendation list for every training user.
#
# What changed compared with the previous version of this cell:
#   * Each recommendation is a product NAME, the same kind of value the
#     ground-truth dict `gt` holds. Previously every entry was a
#     (product_id, score) tuple, which can never equal a ground-truth name.
#   * The popularity fallback walks over product names. Iterating a DataFrame
#     directly yields its column labels, not its rows.
#   * Per-user lookups use a dict built once, instead of re-filtering DataFrames
#     and recomputing `unique()` inside the loop.
#   * A name is never recommended twice, even if it occupies several positions.
#
# NOTE(review): `train` and `test` are both joined against the same `data`, and
# already-bought products are excluded here. If a user's ground truth consists
# only of products counted as "bought", the metrics stay at 0 regardless -
# confirm how the train/test purchase split is meant to work.
N_CANDIDATES = 100  # how many top-ranked candidates are considered per user
N_RECOMMEND = 9     # length of the stored list (the original kept rec_list[0:9])

total_rec_list = {}

# user_id -> array of similarity-matrix positions of the products they bought.
bought_positions_by_user = tf_train.groupby('user_id')['idx2product'].unique().to_dict()

# Fallback candidates: most frequently purchased product names, best first.
popular_names = popular_rec_model['product_name'].iloc[:N_CANDIDATES].tolist()

for user in tqdm(train['user_id'].unique()):
    bought = bought_positions_by_user.get(user)

    if bought is not None:
        # TF-IDF path: sum the similarity rows of everything the user bought.
        bought_names = {product2id[int(position)] for position in bought}
        total_cosine_sim = cosine_matrix[bought].sum(axis=0)  # new array; cosine_matrix is untouched
        total_cosine_sim[bought] = -np.inf  # purchased products sort last
        # Stable sort keeps the lower position first among equal scores.
        ranked = np.argsort(-total_cosine_sim, kind='stable')[:N_CANDIDATES]
        candidates = [product2id[int(position)] for position in ranked]
    else:
        # No named purchases for this user: fall back to the popular products.
        bought_names = set()
        candidates = popular_names

    rec_list = []
    for name in candidates:
        if name in bought_names or name in rec_list:
            continue
        rec_list.append(name)
        if len(rec_list) == N_RECOMMEND:
            break

    total_rec_list[user] = rec_list

  0%|          | 0/80685 [00:00<?, ?it/s]



In [None]:
# https://github.com/kakao-arena/brunch-article-recommendation/blob/master/evaluate.py 코드 참조
# MAP, NDCG, EntDiv 구하는 함수
import six
import math

class evaluate():
    """Ranking metrics (MAP, NDCG, entropy diversity) for per-user recommendations.

    Parameters
    ----------
    recs : dict
        Maps a user to an ordered list of recommended items (best first).
    gt : dict
        Maps a user to an iterable of relevant (ground-truth) items. Items must
        be hashable and of the same kind as the entries of ``recs``.
    topn : int, default 100
        Nominal list length. It labels the printed metrics and normalises the
        entropy diversity; recommendation lists are not truncated to it.
    """

    def __init__(self, recs, gt, topn=100):
        self.recs = recs
        self.gt = gt
        self.topn = topn

    def _scorable_users(self):
        """Yield ``(rec, relevant_set)`` for users with both recommendations and ground truth."""
        for user, seen in self.gt.items():
            relevant = set(seen)  # a set gives O(1) membership tests below
            rec = self.recs.get(user, [])
            if not rec or not relevant:
                continue
            yield rec, relevant

    def _ndcg(self):
        """Return the mean NDCG over scorable users, or 0.0 if there are none."""
        total, n_users = 0.0, 0
        for rec, relevant in self._scorable_users():
            ideal_hits = min(len(relevant), len(rec))
            idcg = sum(1.0 / math.log(i + 2, 2) for i in range(ideal_hits))
            # A hit at 0-based index i has rank i + 1 and gain 1 / log2(rank + 1).
            dcg = sum(1.0 / math.log(i + 2, 2) for i, r in enumerate(rec) if r in relevant)
            total += dcg / idcg
            n_users += 1
        # Without this guard, an empty overlap between recs and gt raised ZeroDivisionError.
        return total / n_users if n_users else 0.0

    def _map(self):
        """Return the mean average precision over scorable users, or 0.0 if there are none."""
        total, n_users = 0.0, 0
        for rec, relevant in self._scorable_users():
            precision_sum, correct = 0.0, 0.0
            for i, r in enumerate(rec):
                if r in relevant:
                    correct += 1
                    precision_sum += correct / (i + 1.0)
            total += precision_sum / min(len(relevant), len(rec))
            n_users += 1
        return total / n_users if n_users else 0.0

    def _entropy_diversity(self):
        """Return the entropy of item frequencies, normalised by ``len(recs) * topn``.

        Returns 0.0 when that normaliser is zero (no users, or ``topn == 0``).
        """
        sz = float(len(self.recs)) * self.topn
        if sz == 0:
            return 0.0
        freq = {}
        for rec in self.recs.values():
            for r in rec:
                freq[r] = freq.get(r, 0) + 1
        return -sum(v / sz * math.log(v / sz) for v in freq.values())

    def _evaluate(self):
        """Print MAP, NDCG and entropy diversity, each labelled with ``topn``."""
        print('MAP@%s: %s' % (self.topn, self._map()))
        print('NDCG@%s: %s' % (self.topn, self._ndcg()))
        print('EntDiv@%s: %s' % (self.topn, self._entropy_diversity()))

In [None]:
# Score the top-9 recommendation lists against the ground truth built from `test`.
evaluate_func = evaluate(recs=total_rec_list, gt = gt, topn=9)
evaluate_func._evaluate()
# Performance cannot be measured: MAP and NDCG both come out as 0.0 in the output below.

MAP@9: 0.0
NDCG@9: 0.0
EntDiv@9: 12.339836953780422


In [None]:
# Spot-check that the recommendations look sensible. Only a few users are shown:
# the dict holds one entry per training user, and rendering all of it would
# bloat the saved notebook output.
N_PREVIEW_USERS = 5
dict(list(total_rec_list.items())[:N_PREVIEW_USERS])