In [2]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

def find_best_params(df, min_r, max_r):
    reader = Reader(rating_scale=(min_r, max_r))
    data = Dataset.load_from_df(df, reader=reader)
    # data_folds = DatasetAutoFolds(ratings_file='surprise_dataset_1027.csv', reader=reader)
    #trainset = data_folds_build_full_trainset()
    trainset, testset = train_test_split(data,test_size=.25,random_state=0)
    
    print('SVD 모델로 학습') 
    algo = SVD(random_state=0)
    cross_validate(algo, data, measures=['RMSE','MAE'], cv=5, verbose=True)


    # 최적화할 파라미터를 딕셔너리 형태로 지정 
    param_grid ={'n_epochs': [20,40,60], 'n_factors': [50,100,200]}

    gs = GridSearchCV(SVD, param_grid, measures = ['rmse','mae'],cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])
    
    return gs.best_params['rmse']
    


In [3]:
from surprise.dataset import DatasetAutoFolds

def r_predict_df(df, max_r,min_r, best_params):
    reader = Reader(rating_scale=(min_r,max_r))
    data = Dataset.load_from_df(df, reader=reader)
    
    n_epochs = best_params['n_epochs']
    n_factors = best_params['n_factors']   
    
    #train 과  test으로 나누지말고 전부로 모델 학습하기 
    trainset = data.build_full_trainset()
    
    #SVD 모델로 학습 
    algo = SVD(n_epochs=n_epochs ,n_factors=n_factors, random_state=0)
    algo.fit(trainset)

    print('-------------학습 완료-------------------')
    
    return algo


In [4]:
# 1. 해당 userid가 아직 평점을 매기지 않은 제품, 평점을 매긴 모든 제품 추출 (리스트 형태)
def get_unpurchased_surprise(origin_df, userid):
    # userid에 해당하는 사용자가 평점을 매긴 모든 제품
    purchased_prds = origin_df[(origin_df['CUSTNO']==userid)&(origin_df['SCORE'] > 0)]['PRD'].tolist()
    
    total_prds = origin_df['PRD'].unique()
    
    # userid에 해당하는 사용자가 한번도 구매하지 않은 모든 제품
    unpurchased_prds = [PRD for PRD in total_prds if PRD not in purchased_prds]

    # 한번도 구매하지 않은 모든 제품, 평점을 매긴 모든 제품 반출
    return unpurchased_prds,  purchased_prds

# 입력(군집별 고객 데이터, 최적의 예측값 모델, 고객번호, top 개수) -> 출력 (안사본 물품 중 top, 구매한 물품 중 top)
def recomm_prd_by_surprise(temp_df, algo,  userid, top_n):
    
    # 구매하지 않은 제품, 평점을 매긴 모든 제품 추출 각각 추출 (1. 함수 적용)
    unpurchased_prds, purchased_prds = get_unpurchased_surprise(temp_df, userid)
    
#  모델 적용
    # 1) 아직 안사본 제품
    predictions = [algo.predict(str(userid), str(PRD)) for PRD in unpurchased_prds]
    # 2) 이미 사본 제품
    predictions_bought = [algo.predict(str(userid), str(PRD)) for PRD in purchased_prds]
    
    # predictions list 객체는 predictions 객체를 원소로 갖고 있음
    # est 값으로 정렬 _ sortkey_est 함수 생성
    def sortkey_est(pred):
        return pred.est
    
# pred값을 내림 차순으로 정렬

## 1) 아직 안사본 제품
    predictions.sort(key = sortkey_est, reverse = True)
    # 상위 n개만 추출
    top_predictions = predictions[:top_n]
    
## 2) 이미 사본 제품
    predictions_bought.sort(key = sortkey_est, reverse = True)
    # 상위 n개만 추출
    top_predictions_bought = predictions_bought[:top_n]
    
# ! 최종 추출
# 상위 n 개의 제품 정보, 예측 지수
    
    # 1) 안사본 제품 
    top_prds = [pred.iid for pred in top_predictions] # 해당 제품
    top_prds_rating = [pred.est for pred in top_predictions] # 예측 지수
    
    # 2) 사본 제품
    top_prds_bought = [pred.iid for pred in top_predictions_bought] # 해당 제품
    top_prds_rating_bought = [pred.est for pred in top_predictions_bought] # 예측 지수
    
# *최종 값 형태 = 리스트 내 튜플 형태 (해당 제품, 예측 지수) ]
    top_prds_preds = [ ( id, rating) for id, rating in zip(top_prds, top_prds_rating)]
    top_prds_preds_bought = [ ( id, rating) for id, rating in zip(top_prds_bought, top_prds_rating_bought)]
    
    return top_prds_preds, top_prds_preds_bought

# 데이터프레임 불러오기
- 구매지수 R 데이터프레임 
- 고객번호와 군집 라벨 데이터프레임
- 카테고리 데이터프레임

In [5]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

surprise_df = pd.read_csv('C:/cakd7/2차 프로젝트/surprise_dataset_1027.csv')
surprise_df.columns = ['CUSTNO', 'PRD', 'SCORE']
# user_item_index_df
surprise_df.head()

Unnamed: 0,CUSTNO,PRD,SCORE
0,1,A_4대 B/D,0.0
1,1,A_5 ON THE GO,0.0
2,1,A_ACC Bloom (1F),0.0
3,1,A_ACC Bloom (3F),0.0
4,1,A_AK골프,0.0


In [7]:
cust_df = pd.read_csv('./생성데이터/5. 추천시스템/surprise_data/cust_clustering2.csv', index_col=0)

cluster_list = list(cust_df['cluster'].unique())

cust_df.head()

Unnamed: 0_level_0,cluster
CUSTNO,Unnamed: 1_level_1
6,3
11,0
12,3
13,3
16,3


In [8]:
final_cat = pd.read_csv('생성데이터/5. 추천시스템/최종cat순서나열.csv', index_col=0)
final_cat.drop(columns=['대분류코드','QTY','구매금액_min','구매금액_max','구매금액_mean'], inplace=True)
# category_df
final_cat.head()

Unnamed: 0,제휴사,금액 타입,대분류명_y,중분류명,소분류명,UPRICE,AVG,AVGVS
0,B,저가,식품,버섯,건버섯,3017,7926,0.4
1,B,저가,식품,버섯,느타리버섯,3017,7926,0.4
2,B,저가,식품,버섯,버섯모둠,3017,7926,0.4
3,B,저가,식품,버섯,버섯선물세트,3017,7926,0.4
4,B,저가,식품,버섯,새송이버섯,3017,7926,0.4


In [9]:
# 특정 고객의 구매이력이 있는 추천리스트, 구매이력이 없는 추천리스트 딕셔너리 형태로 반환하는 함수 생성
# 입력값 (고객별 군집결과 데이터, 구매지수 데이터, 소분류명 정보_제휴사, 단가) 
# -> 반출값 : 1. 구매이력이 있는 추천리스트, 2. 구매이력이 없는 추천리스트
def dict_recommend_unrecommend(cluster_df, user_item_index_df, cluster_name): 
    
    # 해당 군집에 있는 고객 번호 뽑아서 리스트로
    cust_index_list = list(cluster_df[cluster_df['cluster']==cluster_name].index)
    # 구매지수 데이터에서 해당하는 고객번호들의 데이터로 새로운 데이터프레임 생성
    temp_df = user_item_index_df[user_item_index_df['CUSTNO'].isin(cust_index_list)] #isin(cust_index_list)]

    # 구매지수 최댓값, 최솟값 추출
    max_r = temp_df['SCORE'].max()
    min_r = temp_df['SCORE'].min()

    ## 서프라이즈 최적 파라미터  돌리기
    best_params = find_best_params(temp_df,min_r,max_r)

     ## 서프라이즈 최적 파라미터로 해당 for문의 군집을 위한 모델 학습
    algo = r_predict_df(temp_df,max_r,min_r, best_params)
    
    # 해당군집으로 학습된 모델을 반환
    return algo, cust_index_list, temp_df


def return_of_recommend_dict(algo, cust_index_list, temp_df):
    
    super_list = []
    super_list_bought = []
    
    # 고객별 추천 결과 받기 
# !--출력 값 (딕셔너리 형태)
        # 1) 안사본 물품
    recomm_dict = {} 
    # 2) 사본 물품
    recomm_dict_bought = {}

    # 위에서 해당 군집의 각 고객별로 상품 예측 지수를 위에서 학습시킨 모델로 구하기 
    for cust in cust_index_list:

        # 입력 (해당하는 군집에 있는 고객 데이터, 최적 파라미터로 모델, 고객, top) -> 추출 (안사본 물품 중 top, 구매한 물품 중 top)
        top_prd_preds, top_prd_preds_bought  = recomm_prd_by_surprise(temp_df, algo, cust, top_n=50)

        # 해당하는 고객의  입력 _(해당 제품, 구매지수) ex. {9_고객번호 : (양말_해당 제품이름, 0.5), (), () ......}

    # 1) 안사본 물품
        recomm_dict[cust] = [top_prd_preds[0]] # 첫번째 value 입력

        # 딕셔너리 차례로 담기
        for prd in top_prd_preds[1:]:
            if cust in recomm_dict:
                recomm_dict[cust].append(prd)

    # 2) 이미 사본 제품
        recomm_dict_bought[cust] = [top_prd_preds_bought[0]]
        for prd in top_prd_preds_bought[1:]:
            if cust in recomm_dict_bought:
                recomm_dict_bought[cust].append(prd)

# ! -- 최종 출력 값(리스트 형태) _ 군집별 고객별 제품 추천 딕셔너리
        super_list.append(recomm_dict)
        super_list_bought.append(recomm_dict_bought)

    return super_list, super_list_bought

# 특정 고객의 추천 리스트 받기 

In [10]:
# algo, cust_index_list, temp_df = dict_recommend_unrecommend(df, user_item_index_df, cluster)
# super_list, super_list_bought = return_of_recommend_dict(algo, cust_index_list, temp_df)

In [11]:
# 고객 

def get_recommend_cust(cluster, cust, super_list, category_df):
    
    dict_custs = super_list[cluster]
    df = pd.DataFrame(dict_custs[cust])
    df.columns = ['ITEM','RATING']
    df['ASSO'] = df['ITEM'].apply(lambda x: x[0])
    df['ITEM'] = df['ITEM'].apply(lambda x: x[2:])

    dff = df.merge(category_df, how='left', left_on=['ASSO','ITEM'], right_on=['제휴사', '소분류명'])
    
    dff_replace = dff[['제휴사', '대분류명_y', '중분류명', '소분류명', '금액 타입', 'RATING']]
    dff_replace.columns = ['제휴사', '대분류명', '중분류명', '추천 상품', '금액 타입', '예측 구매 지수']
    
    cat_grouped = dff_replace.groupby('대분류명')
    
    cat_list = list(dff_replace['대분류명'].unique())
    
    print(f' ♣ {cust} 고객님만을 위한 추천 상품이에요!!!! only for you~ ', '\n')
    for cat in cat_list:
        print()
        print(f' {cat} 추천 제품 ▼')
        display(cat_grouped.get_group(cat))
        
def get_recommend_cust_bought(cluster, cust, super_list_bought, category_df):
    
    dict_custs = super_list_bought[cluster]
    df = pd.DataFrame(super_list_bought[cust])
    df.columns = ['ITEM','RATING']
    df['ASSO'] = df['ITEM'].apply(lambda x: x[0])
    df['ITEM'] = df['ITEM'].apply(lambda x: x[2:])

    dff = df.merge(category_df, how='left', left_on=['ASSO','ITEM'], right_on=['제휴사', '소분류명'])
    
    dff_replace = dff[['제휴사', '대분류명_y', '중분류명', '소분류명', '금액 타입', 'RATING']]
    dff_replace.columns = ['제휴사', '대분류명', '중분류명', '추천 상품', '금액 타입', '예측 구매 지수']
    
    cat_grouped = dff_replace.groupby('대분류명')
    
    cat_list = list(dff_replace['대분류명'].unique())
    
    print(f' ♣ {cust} 고객님이 선호하는 상품이에요!!!! only for you~ ', '\n')
    # 출력값
    for cat in cat_list:
        print()
        print(f' {cat}의 추천 제품 ▼')
        display(cat_grouped.get_group(cat))
        
    
# print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
# get_recommend_cust(cust_df, 0,9538, final_cat)
# print('')
# print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   PURCHSED RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
# get_recommend_cust_bought(cust_df, 0,9538, final_cat)


# 값 출력해보기

## 0번 군집 학습

In [22]:
algo, cust_index_list, temp_df = dict_recommend_unrecommend(cust_df, surprise_df, 0)

SVD 모델로 학습
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0050  0.0054  0.0052  0.0050  0.0049  0.0051  0.0002  
MAE (testset)     0.0007  0.0007  0.0007  0.0007  0.0007  0.0007  0.0000  
Fit time          82.74   88.48   86.07   90.51   90.47   87.65   2.95    
Test time         42.55   33.08   42.98   41.91   31.72   38.45   4.97    
0.005089507141736633
{'n_epochs': 40, 'n_factors': 200}
-------------학습 완료-------------------


In [23]:
super_list, super_list_bought = return_of_recommend_dict(algo, cust_index_list, temp_df)

In [None]:
super_list

In [None]:
get_recommend_cust(0, 11, super_list, final_cat)

## 1번 군집 학습

In [12]:
algo1, cust_index_list1, temp_df1 = dict_recommend_unrecommend(cust_df, surprise_df, 1)

SVD 모델로 학습
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0067  0.0067  0.0075  0.0062  0.0062  0.0067  0.0005  
MAE (testset)     0.0008  0.0008  0.0008  0.0008  0.0008  0.0008  0.0000  
Fit time          50.09   58.86   46.51   58.24   53.89   53.52   4.73    
Test time         12.69   15.10   13.46   14.11   14.71   14.01   0.86    
0.006672268786087228
{'n_epochs': 40, 'n_factors': 100}
-------------학습 완료-------------------


In [None]:
super_list1, super_list_bought1 = return_of_recommend_dict(algo1, cust_index_list1, temp_d1)

In [None]:
get_recommend_cust(cl, 1, 특정고객 아이디, super_list1, final_cat)

## 2번 군집 학습

In [13]:
algo2, cust_index_list2, temp_df2 = dict_recommend_unrecommend(cust_df, surprise_df, 2)

SVD 모델로 학습
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0054  0.0054  0.0086  0.0078  0.0050  0.0064  0.0015  
MAE (testset)     0.0006  0.0006  0.0006  0.0006  0.0006  0.0006  0.0000  
Fit time          88.52   116.38  97.43   93.57   93.35   97.85   9.69    
Test time         31.63   35.35   31.21   25.26   32.96   31.28   3.34    
0.0064641596854195064
{'n_epochs': 40, 'n_factors': 200}
-------------학습 완료-------------------


## 3번 군집 학습

In [None]:
algo4, cust_index_list4, temp_df4 = dict_recommend_unrecommend(cust_df, surprise_df, 3)

SVD 모델로 학습
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0038  0.0048  0.0050  0.0039  0.0046  0.0044  0.0005  
MAE (testset)     0.0005  0.0005  0.0005  0.0005  0.0005  0.0005  0.0000  
Fit time          225.79  242.36  239.70  237.92  236.32  236.42  5.68    
Test time         185.74  75.28   80.70   92.28   87.19   104.24  41.16   


## 4번 군집 학습

In [14]:
algo4, cust_index_list4, temp_df4 = dict_recommend_unrecommend(cust_df, surprise_df, 4)

SVD 모델로 학습
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0066  0.0065  0.0147  0.0064  0.0070  0.0082  0.0032  
MAE (testset)     0.0008  0.0008  0.0008  0.0008  0.0008  0.0008  0.0000  
Fit time          36.83   38.62   38.31   39.14   38.92   38.36   0.82    
Test time         7.30    12.42   9.81    12.70   12.56   10.96   2.12    
0.00845829644218087
{'n_epochs': 60, 'n_factors': 100}
-------------학습 완료-------------------


In [None]:
clust_df = pd.read_csv('생성데이터/5. 추천시스템/surprise_data/cust_clustering.csv', index_col = 0)

def get_recommend_cust_bought(df, cluster, cust, user_item_index_df, category_df):
print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
get_recommend_cust(clust_df, 0,9538, surprise_df, final_cat)
print('')
print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   PURCHSED RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
get_recommend_cust_bought(clust_df, 0,9538, surprise_df, final_cat)

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 

SVD 모델로 학습
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0066  0.0069  0.0059  0.0073  0.0064  0.0066  0.0005  
MAE (testset)     0.0007  0.0007  0.0007  0.0007  0.0007  0.0007  0.0000  
Fit time          98.45   107.46  101.24  92.31   94.39   98.77   5.34    
Test time         32.04   34.74   35.64   25.02   31.87   31.86   3.73    
0.006641465075608799
{'n_epochs': 40, 'n_factors': 200}
-------------학습 완료-------------------
SVD 모델로 학습
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.0071  0.0068  0.0069  0.0071  0.0069  0.0070  0.0001  
MAE (testset)     0.0009  0.0009  0.0008  0.0009  0.0009  0.0009  0.0000  
Fit time          72.93   80.69   87.