# 서프라이즈 모델 파라미터 개선 및 학습 코드 

In [28]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

def find_best_params(df, min_r, max_r):
    reader = Reader(rating_scale=(min_r, max_r))
    data = Dataset.load_from_df(df, reader=reader)
    # data_folds = DatasetAutoFolds(ratings_file='surprise_dataset_1027.csv', reader=reader)
    #trainset = data_folds_build_full_trainset()
    trainset, testset = train_test_split(data,test_size=.25,random_state=0)
    
    print('SVD 모델로 학습') 
    algo = SVD(random_state=0)
    cross_validate(algo, data, measures=['RMSE','MAE'], cv=3, verbose=True)


    # 최적화할 파라미터를 딕셔너리 형태로 지정 
    param_grid ={'n_epochs': [20,40], 'n_factors': [50]}

    gs = GridSearchCV(SVD, param_grid, measures = ['rmse','mae'],cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])
    
    return gs.best_params['rmse']
    


In [45]:
from surprise.dataset import DatasetAutoFolds

def r_predict_df(df, min_r,max_r):
    reader = Reader(rating_scale=(min_r,max_r))
    data = Dataset.load_from_df(df, reader=reader)
    
    n_epochs = 20#best_params['n_epochs']
    n_factors = 50# best_params['n_factors']   
    
    #train 과  test으로 나누지말고 전부로 모델 학습하기 
    trainset = data.build_full_trainset()
    
    #SVD 모델로 학습 
    algo = SVD(n_epochs=n_epochs ,n_factors=n_factors, random_state=0)
    algo.fit(trainset)

    print('-------------학습 완료-------------------')
    
    return algo


In [30]:
# 1. 해당 userid가 아직 평점을 매기지 않은 제품, 평점을 매긴 모든 제품 추출 (리스트 형태)
def get_unpurchased_surprise(origin_df, userid):
    # userid에 해당하는 사용자가 평점을 매긴 모든 제품
    purchased_prds = origin_df[(origin_df['CUSTNO']==userid)&(origin_df['SCORE'] > 0)]['PRD'].tolist()
    
    total_prds = origin_df['PRD'].unique()
    
    # userid에 해당하는 사용자가 한번도 구매하지 않은 모든 제품
    unpurchased_prds = [PRD for PRD in total_prds if PRD not in purchased_prds]

    # 한번도 구매하지 않은 모든 제품, 평점을 매긴 모든 제품 반출
    return unpurchased_prds,  purchased_prds

# 입력(군집별 고객 데이터, 최적의 예측값 모델, 고객번호, top 개수) -> 출력 (안사본 물품 중 top, 구매한 물품 중 top)
def recomm_prd_by_surprise(temp_df, algo,  userid, top_n):
    
    # 구매하지 않은 제품, 평점을 매긴 모든 제품 추출 각각 추출 (1. 함수 적용)
    unpurchased_prds, purchased_prds = get_unpurchased_surprise(temp_df, userid)
    
#  모델 적용
    # 1) 아직 안사본 제품
    predictions = [algo.predict(str(userid), str(PRD)) for PRD in unpurchased_prds]
    # 2) 이미 사본 제품
    predictions_bought = [algo.predict(str(userid), str(PRD)) for PRD in purchased_prds]
    
    # predictions list 객체는 predictions 객체를 원소로 갖고 있음
    # est 값으로 정렬 _ sortkey_est 함수 생성
    def sortkey_est(pred):
        return pred.est
    
# pred값을 내림 차순으로 정렬

## 1) 아직 안사본 제품
    predictions.sort(key = sortkey_est, reverse = True)
    # 상위 n개만 추출
    top_predictions = predictions[:top_n]
    
## 2) 이미 사본 제품
    predictions_bought.sort(key = sortkey_est, reverse = True)
    # 상위 n개만 추출
    top_predictions_bought = predictions_bought[:top_n]
    
# ! 최종 추출
# 상위 n 개의 제품 정보, 예측 지수
    
    # 1) 안사본 제품 
    top_prds = [pred.iid for pred in top_predictions] # 해당 제품
    top_prds_rating = [pred.est for pred in top_predictions] # 예측 지수
    
    # 2) 사본 제품
    top_prds_bought = [pred.iid for pred in top_predictions_bought] # 해당 제품
    top_prds_rating_bought = [pred.est for pred in top_predictions_bought] # 예측 지수
    
# *최종 값 형태 = 리스트 내 튜플 형태 (해당 제품, 예측 지수) ]
    top_prds_preds = [ ( id, rating) for id, rating in zip(top_prds, top_prds_rating)]
    top_prds_preds_bought = [ ( id, rating) for id, rating in zip(top_prds_bought, top_prds_rating_bought)]
    
    return top_prds_preds, top_prds_preds_bought

# 데이터프레임 불러오기
- 구매지수 R 데이터프레임 
- 고객번호와 군집 라벨 데이터프레임
- 카테고리 데이터프레임

In [31]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

surprise_df = pd.read_csv('C:/jupyter-files/cakd7/2차 프로젝트/surprise_dataset_1027.csv')
surprise_df.columns = ['CUSTNO', 'PRD', 'SCORE']
surprise_df.head()

Unnamed: 0,CUSTNO,PRD,SCORE
0,1,A_4대 B/D,0.0
1,1,A_5 ON THE GO,0.0
2,1,A_ACC Bloom (1F),0.0
3,1,A_ACC Bloom (3F),0.0
4,1,A_AK골프,0.0


In [33]:
# user_item_index_df
surprise_df.head()

Unnamed: 0,CUSTNO,PRD,SCORE
0,1,A_4대 B/D,0.0
1,1,A_5 ON THE GO,0.0
2,1,A_ACC Bloom (1F),0.0
3,1,A_ACC Bloom (3F),0.0
4,1,A_AK골프,0.0


In [34]:
final_cat = pd.read_csv('C:/jupyter-files/cakd7/second_projects/surprise_data/최종cat순서나열.csv', index_col=0)
final_cat.drop(columns=['대분류코드','QTY','구매금액_min','구매금액_max','구매금액_mean'], inplace=True)

In [35]:
# category_df
final_cat.head()

Unnamed: 0,제휴사,금액 타입,대분류명_y,중분류명,소분류명,UPRICE,AVG,AVGVS
0,B,저가,식품,버섯,건버섯,3017,7926,0.4
1,B,저가,식품,버섯,느타리버섯,3017,7926,0.4
2,B,저가,식품,버섯,버섯모둠,3017,7926,0.4
3,B,저가,식품,버섯,버섯선물세트,3017,7926,0.4
4,B,저가,식품,버섯,새송이버섯,3017,7926,0.4


In [36]:
# cluster_df
cust_df.head()

Unnamed: 0,연령대_5세,전반기_횟수별_중요 카테고리 top_1_가구인테리어/잡화_고가,전반기_횟수별_중요 카테고리 top_1_가구인테리어/잡화_저가,전반기_횟수별_중요 카테고리 top_1_가구인테리어/잡화_중가,전반기_횟수별_중요 카테고리 top_1_가전/레저취미/멀티샵_고가,전반기_횟수별_중요 카테고리 top_1_가전/레저취미/멀티샵_저가,전반기_횟수별_중요 카테고리 top_1_가전/레저취미/멀티샵_중가,전반기_횟수별_중요 카테고리 top_1_교육문화/아동_고가,전반기_횟수별_중요 카테고리 top_1_교육문화/아동_중가,전반기_횟수별_중요 카테고리 top_1_식품/생활_고가,...,Monetary_식품/생활_고가,Monetary_식품/생활_저가,Monetary_식품/생활_중가,Monetary_의류/이미용_고가,Monetary_의류/이미용_저가,Monetary_의류/이미용_중가,Monetary_침구/주방_고가,Monetary_침구/주방_저가,Monetary_침구/주방_중가,predict_label
1,10,0,0,0,0,0,0,0,0,0,...,-72535.01,-24828.571797,-269201.53426,-949535.028305,0.0,-1058998.0,-165908.7,0.0,312656.5,1
2,10,0,0,0,0,0,0,0,0,0,...,81227.97,-2491.269959,328501.243401,574421.219977,-86539.557675,323922.2,-93142.51,0.0,-660853.8,0
3,10,0,0,0,0,0,0,0,0,0,...,-4.690343e-12,-35067.847606,6687.523932,0.0,3506.95168,0.0,0.0,0.0,0.0,0
4,10,0,0,0,0,0,0,0,0,0,...,-4914.547,-18056.383394,-16920.037041,-147236.642339,27518.517273,-48303.29,-3.338658e-12,0.0,24146.81,1
6,10,0,0,0,0,0,0,0,0,0,...,115812.3,11335.769083,175955.37085,-122198.249767,30457.812295,-197531.8,0.0,2200.441977,-1.734332e-12,0


In [46]:
# 특정 고객의 구매이력이 있는 추천리스트, 구매이력이 없는 추천리스트 딕셔너리 형태로 반환하는 함수 생성
# 입력값 (고객별 군집결과 데이터, 구매지수 데이터, 소분류명 정보_제휴사, 단가) 
# -> 반출값 : 1. 구매이력이 있는 추천리스트, 2. 구매이력이 없는 추천리스트
def dict_recommend_unrecommend(cluster_df, user_item_index_df, category_df): 
    
    # 반출할 빈 리스트 생성
    super_list = []
    super_list_bought = []

    # 군집 리스트
    cluster_list = list(cluster_df['cluster'].unique()).sort() # 군집 종류
    print(cluster_list)
    
    
    for cluster in cluster_list[:1]:
        print('해당 클러스터는:',cluster)
        cust_index_list = list(cluster_df[cluster_df['cluster']==cluster].index) # 해당 군집에 있는 고객 번호 뽑기
        print(len(cust_index_list))
        # 구매지수 데이터에서 해당하는 고객번호만 출력(새로운 데이터 생성)
        temp_df = user_item_index_df[user_item_index_df['CUSTNO'].isin(cust_index_list)] #isin(cust_index_list)]

        # 구매지수 최댓값, 최솟값 추출
        max_r = temp_df['SCORE'].max()
        min_r = temp_df['SCORE'].min()

        ## 서프라이즈 최적 파라미터  돌리기
        #best_params = find_best_params(temp_df,min_r,max_r)

         ## 서프라이즈 최적 파라미터로 모델 학습
        #algo = r_predict_df(temp_df, best_params,min_r,max_r)
        algo = r_predict_df(temp_df,min_r,max_r)

        
    # 고객별 추천 결과 받기 
# !--출력 값 (딕셔너리 형태)
        # 1) 안사본 물품
        recomm_dict = {} 
        # 2) 사본 물품
        recomm_dict_bought = {}
        
        for cust in cust_index_list:
            print("고객: ",cust)
            
            # 입력 (해당하는 군집에 있는 고객 데이터, 최적 파라미터로 모델, 고객, top) -> 추출 (안사본 물품 중 top, 구매한 물품 중 top)
            top_prd_preds, top_prd_preds_bought  = recomm_prd_by_surprise(temp_df, algo, cust, top_n=50)

            # 해당하는 고객의  입력 _(해당 제품, 구매지수) ex. {9_고객번호 : (양말_해당 제품이름, 0.5), (), () ......}
            
        # 1) 안사본 물품
            recomm_dict[cust] = [top_prd_preds[0]] # 첫번째 value 입력
            
            # 딕셔너리 차례로 담기
            for prd in top_prd_preds[1:]:
                if cust in recomm_dict:
                    recomm_dict[cust].append(prd)

        # 2) 이미 사본 제품
            recomm_dict_bought[cust] = [top_prd_preds_bought[0]]
            for prd in top_prd_preds_bought[1:]:
                if cust in recomm_dict_bought:
                    recomm_dict_bought[cust].append(prd)

# ! -- 최종 출력 값(리스트 형태) _ 군집별 고객별 제품 추천 딕셔너리
            super_list.append(recomm_dict)
            super_list_bought.append(recomm_dict_bought)
    print("super_list 완료")
    return super_list, super_list_bought

# 특정 고객의 추천 리스트 받기 

In [58]:
# 고객 
def get_recommend_cust( cluster, cust, user_item_index_df, category_df,super_list):
    
#     super_list, super_list_bought = dict_recommend_unrecommend(df, user_item_index_df, category_df)
    
    dict_custs = super_list[cluster]
    df = pd.DataFrame(dict_custs[cust])
    df.columns = ['ITEM','RATING']
    df['ASSO'] = df['ITEM'].apply(lambda x: x[0])
    df['ITEM'] = df['ITEM'].apply(lambda x: x[2:])

    dff = df.merge(category_df, how='left', left_on=['ASSO','ITEM'], right_on=['제휴사', '소분류명'])
    
    dff_replace = dff[['제휴사', '대분류명_y', '중분류명', '소분류명', '금액 타입', 'RATING']]
    dff_replace.columns = ['제휴사', '대분류명', '중분류명', '추천 상품', '금액 타입', '예측 구매 지수']
    
    cat_grouped = dff_replace.groupby('대분류명')
    
    cat_list = list(dff_replace['대분류명'].unique())
    
    print(f' ♣ {cust} 고객님만을 위한 추천 상품이에요!!!! only for you~ ', '\n')
    for cat in cat_list:
        print()
        print(f' {cat} 추천 제품 ▼')
        display(cat_grouped.get_group(cat))
        
def get_recommend_cust_bought( cluster, cust, user_item_index_df, category_df, super_list_bought):
    
#     super_list, super_list_bought = dict_recommend_unrecommend(df, user_item_index_df, category_df)
    dict_custs = super_list_bought[cluster]
    df = pd.DataFrame(dict_custs[cust])
    df.columns = ['ITEM','RATING']
    df['ASSO'] = df['ITEM'].apply(lambda x: x[0])
    df['ITEM'] = df['ITEM'].apply(lambda x: x[2:])

    dff = df.merge(category_df, how='left', left_on=['ASSO','ITEM'], right_on=['제휴사', '소분류명'])
    
    dff_replace = dff[['제휴사', '대분류명_y', '중분류명', '소분류명', '금액 타입', 'RATING']]
    dff_replace.columns = ['제휴사', '대분류명', '중분류명', '추천 상품', '금액 타입', '예측 구매 지수']
    
    cat_grouped = dff_replace.groupby('대분류명')
    
    cat_list = list(dff_replace['대분류명'].unique())
    
    print(f' ♣ {cust} 고객님이 선호하는 상품이에요!!!! only for you~ ', '\n')
    # 출력값
    for cat in cat_list:
        print()
        print(f' {cat}의 추천 제품 ▼')
        display(cat_grouped.get_group(cat))
        
    
# print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
# get_recommend_cust(cust_df, 0,9538, final_cat)
# print('')
# print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   PURCHSED RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
# get_recommend_cust_bought(cust_df, 0,9538, final_cat)


In [39]:
cluster_df

Unnamed: 0_level_0,cluster
CUSTNO,Unnamed: 1_level_1
6,4
11,1
12,4
13,0
16,0
...,...
19319,1
19320,2
19328,4
19329,0


# 값 출력해보기

In [47]:
cluster_df = pd.read_csv('생성데이터/5. 추천시스템/surprise_data/cust_clustering.csv', index_col = 0)
super_list, super_list_bought = dict_recommend_unrecommend(cluster_df, surprise_df, final_cat)

1495
-------------학습 완료-------------------


In [67]:
cluster_list = list(cluster_df['cluster'].unique())
print(cluster_list)

[4, 1, 0, 3, 2]


In [59]:
print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
get_recommend_cust( 0,18367, surprise_df, final_cat, super_list)
print('')
print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   PURCHSED RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
get_recommend_cust_bought(0,18367, surprise_df, final_cat,super_list_bought)

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 

 ♣ 18367 고객님만을 위한 추천 상품이에요!!!! only for you~  


 식품 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
0,A,식품,건강식품,건강식품(비타민),고가,0.05327
2,A,식품,농산물,청과,중가,0.0268
3,A,식품,육류,우육,고가,0.025891
4,A,식품,농산물,농산가공,중가,0.024518
5,A,식품,농산물,유기농채소,중가,0.024106
6,A,식품,농산물,채소,중가,0.023931
7,A,식품,가공식품,음료,중가,0.02158
8,A,식품,주류,음료,고가,0.02158
9,A,식품,가공식품,전문베이커리,중가,0.021353
10,A,식품,수산품,생선,고가,0.020412



 이미용 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
1,A,이미용,화장품,기초 화장품,고가,0.027408
23,A,이미용,화장품,색조 화장품,고가,0.015143
46,A,이미용,화장품,기초A,고가,0.005459
53,B,이미용,기초화장품,일반화장품,저가,0.0046



 생활 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
14,A,생활,생활잡화,단기행사,고가,0.016865
27,A,생활,생활잡화,욕실용품,고가,0.011674
38,A,생활,생활잡화,타월,고가,0.006664



 의류 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
15,A,의류,남성 트랜디,단기행사,중가,0.016865
32,A,의류,캐주얼,global SPA,중가,0.010227
51,A,의류,란제리/내의,패션내의,고가,0.005148



 잡화 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
16,A,잡화,일용잡화,단기행사,저가,0.016865
17,A,잡화,피혁잡화,단기행사,중가,0.016865
49,A,잡화,명품,수입잡화,고가,0.005222



 레저취미 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
35,A,레저취미,스포츠,스포츠의류,중가,0.007457



 주방 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
47,A,주방,주방가전,수입소형가전,고가,0.005346



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   PURCHSED RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 

 ♣ 18367 고객님이 선호하는 상품이에요!!!! only for you~  


 식품의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
0,A,식품,가공식품,제과제빵,중가,0.009481
1,A,식품,차/커피,디저트류,중가,0.008019
5,D,식품,과자,쿠키,저가,0.003496
7,D,식품,과자,일반스낵,저가,0.002369
8,D,식품,과자,하드캔디,저가,0.002245
9,A,식품,가공식품,대형테넌트,중가,0.002136
10,C,식품,맥주,수입맥주,중가,0.001723
12,C,식품,우유,일반우유,저가,0.001533
13,C,식품,맥주,국산맥주,중가,0.001414
16,D,식품,건강기능식품,일반비타민,중가,0.001255



 의류의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
2,A,의류,캐주얼,이지캐주얼,중가,0.004792
3,A,의류,캐주얼,스트리트,중가,0.003787
6,A,의류,캐주얼,컬처캐주얼,중가,0.003443



 레저취미의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
4,A,레저취미,스포츠,스포츠화,중가,0.003549
14,A,레저취미,스포츠,나이키의류,중가,0.001281
40,A,레저취미,스포츠,아디다스화,중가,0.000378
45,A,레저취미,스포츠,뉴발란스아울렛,중가,0.000332



 주방의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
11,A,주방,주방용품,국산주방,중가,0.001574



 이미용의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
15,D,이미용,클렌징,페이셜클렌저,저가,0.001266
18,D,이미용,베이직케어,크림/밤/오일,중가,0.001215
28,D,이미용,헤어케어,샴푸,저가,0.000624
30,D,이미용,베이스메이크업,BB/파운데이션/컴팩트류,저가,0.000598
31,D,이미용,베이직케어,스킨/토너,중가,0.000574
34,D,이미용,바디케어,바디보습,저가,0.000519
35,D,이미용,선케어,선크림류,저가,0.00049
36,D,이미용,베이직케어,에센스/세럼,중가,0.000479
37,D,이미용,립메이크업,립글로즈/틴트,저가,0.000465
39,D,이미용,헤어케어,염모제,저가,0.000443



 생활의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
19,D,생활,방향/제습/탈취제,차량용방향/제취제,중가,0.001195
23,D,생활,구강케어,치약,중가,0.000835
24,D,생활,구강케어,구강청정제,중가,0.000834
25,D,생활,화장지/티슈,물티슈,저가,0.000803
38,D,생활,구강케어,칫솔,중가,0.000461
46,D,생활,일반의약외품,밴드류,중가,0.000317


In [None]:
# clust_df[clust_df ['cluster'] == 0].