# 서프라이즈 모델 파라미터 개선 및 학습 코드 

In [37]:
from surprise import KNNBasic, SVD, SVDpp, NMF
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV

def find_best_params(df, min_r, max_r):
    reader = Reader(rating_scale=(min_r, max_r))
    data = Dataset.load_from_df(df, reader=reader)
    # data_folds = DatasetAutoFolds(ratings_file='surprise_dataset_1027.csv', reader=reader)
    #trainset = data_folds_build_full_trainset()
    trainset, testset = train_test_split(data,test_size=.25,random_state=0)
    
    print('SVD 모델로 학습') 
    algo = SVD(random_state=0)
    cross_validate(algo, data, measures=['RMSE','MAE'], cv=5, verbose=True)


    # 최적화할 파라미터를 딕셔너리 형태로 지정 
    param_grid ={'n_epochs': [20,40,60], 'n_factors': [50,100,200]}

    gs = GridSearchCV(SVD, param_grid, measures = ['rmse','mae'],cv=3)
    gs.fit(data)

    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])
    
    return gs.best_params['rmse']
    


In [38]:
from surprise.dataset import DatasetAutoFolds

def r_predict_df(df, best_params):
    reader = Reader(rating_scale=(0,8.1))
    data = Dataset.load_from_df(df, reader=reader)
    
    n_epochs = best_params['n_epochs']
    n_factors = best_params['n_factors']   
    
    #train 과  test으로 나누지말고 전부로 모델 학습하기 
    trainset = data.build_full_trainset()
    
    #SVD 모델로 학습 
    algo = SVD(n_epochs=n_epochs ,n_factors=n_factors, random_state=0)
    algo.fit(trainset)

    print('-------------학습 완료-------------------')
    
    return algo


In [39]:
# 1. 해당 userid가 아직 평점을 매기지 않은 제품, 평점을 매긴 모든 제품 추출 (리스트 형태)
def get_unpurchased_surprise(origin_df, userid):
    # userid에 해당하는 사용자가 평점을 매긴 모든 제품
    purchased_prds = origin_df[(origin_df['CUSTNO']==userid)&(origin_df['SCORE'] > 0)]['PRD'].tolist()
    
    total_prds = origin_df['PRD'].unique()
    
    # userid에 해당하는 사용자가 한번도 구매하지 않은 모든 제품
    unpurchased_prds = [PRD for PRD in total_prds if PRD not in purchased_prds]

    # 한번도 구매하지 않은 모든 제품, 평점을 매긴 모든 제품 반출
    return unpurchased_prds,  purchased_prds

# 입력(군집별 고객 데이터, 최적의 예측값 모델, 고객번호, top 개수) -> 출력 (안사본 물품 중 top, 구매한 물품 중 top)
def recomm_prd_by_surprise(temp_df, algo,  userid, top_n):
    
    # 구매하지 않은 제품, 평점을 매긴 모든 제품 추출 각각 추출 (1. 함수 적용)
    unpurchased_prds, purchased_prds = get_unpurchased_surprise(temp_df, userid)
    
#  모델 적용
    # 1) 아직 안사본 제품
    predictions = [algo.predict(str(userid), str(PRD)) for PRD in unpurchased_prds]
    # 2) 이미 사본 제품
    predictions_bought = [algo.predict(str(userid), str(PRD)) for PRD in purchased_prds]
    
    # predictions list 객체는 predictions 객체를 원소로 갖고 있음
    # est 값으로 정렬 _ sortkey_est 함수 생성
    def sortkey_est(pred):
        return pred.est
    
# pred값을 내림 차순으로 정렬

## 1) 아직 안사본 제품
    predictions.sort(key = sortkey_est, reverse = True)
    # 상위 n개만 추출
    top_predictions = predictions[:top_n]
    
## 2) 이미 사본 제품
    predictions_bought.sort(key = sortkey_est, reverse = True)
    # 상위 n개만 추출
    top_predictions_bought = predictions_bought[:top_n]
    
# ! 최종 추출
# 상위 n 개의 제품 정보, 예측 지수
    
    # 1) 안사본 제품 
    top_prds = [pred.iid for pred in top_predictions] # 해당 제품
    top_prds_rating = [pred.est for pred in top_predictions] # 예측 지수
    
    # 2) 사본 제품
    top_prds_bought = [pred.iid for pred in top_predictions_bought] # 해당 제품
    top_prds_rating_bought = [pred.est for pred in top_predictions_bought] # 예측 지수
    
# *최종 값 형태 = 리스트 내 튜플 형태 (해당 제품, 예측 지수) ]
    top_prds_preds = [ ( id, rating) for id, rating in zip(top_prds, top_prds_rating)]
    top_prds_preds_bought = [ ( id, rating) for id, rating in zip(top_prds_bought, top_prds_rating_bought)]
    
    return top_prds_preds, top_prds_preds_bought

# 데이터프레임 불러오기
- 구매지수 R 데이터프레임 
- 고객번호와 군집 라벨 데이터프레임
- 카테고리 데이터프레임

In [40]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

surprise_df = pd.read_csv('surprise_data/surprise_dataset_1027.csv')
surprise_df.columns = ['CUSTNO', 'PRD', 'SCORE']
surprise_df.head()

Unnamed: 0,CUSTNO,PRD,SCORE
0,1,A_4대 B/D,0.0
1,1,A_5 ON THE GO,0.0
2,1,A_ACC Bloom (1F),0.0
3,1,A_ACC Bloom (3F),0.0
4,1,A_AK골프,0.0


In [41]:
cust_df = pd.read_csv('최종모델/Total_Top_Q6_Q8_plus_predictlabel.csv', index_col=0)
df = cust_df[['predict_label']]


cluster_list = list(cust_df['predict_label'].unique())

In [42]:
# user_item_index_df
surprise_df.head()

Unnamed: 0,CUSTNO,PRD,SCORE
0,1,A_4대 B/D,0.0
1,1,A_5 ON THE GO,0.0
2,1,A_ACC Bloom (1F),0.0
3,1,A_ACC Bloom (3F),0.0
4,1,A_AK골프,0.0


In [43]:
final_cat = pd.read_csv('최종cat순서나열.csv', index_col=0)
final_cat.drop(columns=['대분류코드','QTY','구매금액_min','구매금액_max','구매금액_mean'], inplace=True)

In [44]:
# category_df
final_cat.head()

Unnamed: 0,제휴사,금액 타입,대분류명_y,중분류명,소분류명,UPRICE,AVG,AVGVS
0,B,저가,식품,버섯,건버섯,3017,7926,0.4
1,B,저가,식품,버섯,느타리버섯,3017,7926,0.4
2,B,저가,식품,버섯,버섯모둠,3017,7926,0.4
3,B,저가,식품,버섯,버섯선물세트,3017,7926,0.4
4,B,저가,식품,버섯,새송이버섯,3017,7926,0.4


In [45]:
# cluster_df
cust_df.head()

Unnamed: 0,연령대_5세,전반기_횟수별_중요 카테고리 top_1_가구인테리어/잡화_고가,전반기_횟수별_중요 카테고리 top_1_가구인테리어/잡화_저가,전반기_횟수별_중요 카테고리 top_1_가구인테리어/잡화_중가,전반기_횟수별_중요 카테고리 top_1_가전/레저취미/멀티샵_고가,전반기_횟수별_중요 카테고리 top_1_가전/레저취미/멀티샵_저가,전반기_횟수별_중요 카테고리 top_1_가전/레저취미/멀티샵_중가,전반기_횟수별_중요 카테고리 top_1_교육문화/아동_고가,전반기_횟수별_중요 카테고리 top_1_교육문화/아동_중가,전반기_횟수별_중요 카테고리 top_1_식품/생활_고가,...,Monetary_식품/생활_고가,Monetary_식품/생활_저가,Monetary_식품/생활_중가,Monetary_의류/이미용_고가,Monetary_의류/이미용_저가,Monetary_의류/이미용_중가,Monetary_침구/주방_고가,Monetary_침구/주방_저가,Monetary_침구/주방_중가,predict_label
1,10,0,0,0,0,0,0,0,0,0,...,-72535.01,-24828.571797,-269201.53426,-949535.028305,0.0,-1058998.0,-165908.7,0.0,312656.5,1
2,10,0,0,0,0,0,0,0,0,0,...,81227.97,-2491.269959,328501.243401,574421.219977,-86539.557675,323922.2,-93142.51,0.0,-660853.8,0
3,10,0,0,0,0,0,0,0,0,0,...,-4.690343e-12,-35067.847606,6687.523932,0.0,3506.95168,0.0,0.0,0.0,0.0,0
4,10,0,0,0,0,0,0,0,0,0,...,-4914.547,-18056.383394,-16920.037041,-147236.642339,27518.517273,-48303.29,-3.338658e-12,0.0,24146.81,1
6,10,0,0,0,0,0,0,0,0,0,...,115812.3,11335.769083,175955.37085,-122198.249767,30457.812295,-197531.8,0.0,2200.441977,-1.734332e-12,0


In [46]:
# 특정 고객의 구매이력이 있는 추천리스트, 구매이력이 없는 추천리스트 딕셔너리 형태로 반환하는 함수 생성
# 입력값 (고객별 군집결과 데이터, 구매지수 데이터, 소분류명 정보_제휴사, 단가) 
# -> 반출값 : 1. 구매이력이 있는 추천리스트, 2. 구매이력이 없는 추천리스트
def dict_recommend_unrecommend(cluster_df, user_item_index_df, category_df): 
    
    # 반출할 빈 리스트 생성
    super_list = []
    super_list_bought = []

    # 군집 리스트
    cluster_list = list(cluster_df['predict_label'].unique()) # 군집 종류
    
    for cluster in cluster_list:
        cust_index_list = list(cluster_df[cluster_df['predict_label']==cluster].index) # 해당 군집에 있는 고객 번호 뽑기
        # 구매지수 데이터에서 해당하는 고객번호만 출력(새로운 데이터 생성)
        temp_df = user_item_index_df[user_item_index_df['CUSTNO'].isin(cust_index_list)] #isin(cust_index_list)]

        # 구매지수 최댓값, 최솟값 추출
        max_r = temp_df['SCORE'].max()
        min_r = temp_df['SCORE'].min()

        ## 서프라이즈 최적 파라미터  돌리기
        best_params = find_best_params(temp_df,min_r,max_r)

         ## 서프라이즈 최적 파라미터로 모델 학습
        algo = r_predict_df(temp_df, best_params)

        
    # 고객별 추천 결과 받기 
# !--출력 값 (딕셔너리 형태)
        # 1) 안사본 물품
        recomm_dict = {} 
        # 2) 사본 물품
        recomm_dict_bought = {}
        
        for cust in cust_index_list:
            
            # 입력 (해당하는 군집에 있는 고객 데이터, 최적 파라미터로 모델, 고객, top) -> 추출 (안사본 물품 중 top, 구매한 물품 중 top)
            top_prd_preds, top_prd_preds_bought  = recomm_prd_by_surprise(temp_df, algo, cust, top_n=50)

            # 해당하는 고객의  입력 _(해당 제품, 구매지수) ex. {9_고객번호 : (양말_해당 제품이름, 0.5), (), () ......}
            
        # 1) 안사본 물품
            recomm_dict[cust] = [top_prd_preds[0]] # 첫번째 value 입력
            
            # 딕셔너리 차례로 담기
            for prd in top_prd_preds[1:]:
                if cust in recomm_dict:
                    recomm_dict[cust].append(prd)

        # 2) 이미 사본 제품
            recomm_dict_bought[cust] = [top_prd_preds_bought[0]]
            for prd in top_prd_preds_bought[1:]:
                if cust in recomm_dict_bought:
                    recomm_dict_bought[cust].append(prd)

# ! -- 최종 출력 값(리스트 형태) _ 군집별 고객별 제품 추천 딕셔너리
            super_list.append(recomm_dict)
            super_list_bought.append(recomm_dict_bought)

    return super_list, super_list_bought

# 특정 고객의 추천 리스트 받기 

In [52]:
# 고객 
def get_recommend_cust(df, cluster, cust, category_df):
    
    dict_custs = super_list[cluster]
    df = pd.DataFrame(dict_custs[cust])
    df.columns = ['ITEM','RATING']
    df['ASSO'] = df['ITEM'].apply(lambda x: x[0])
    df['ITEM'] = df['ITEM'].apply(lambda x: x[2:])

    dff = df.merge(category_df, how='left', left_on=['ASSO','ITEM'], right_on=['제휴사', '소분류명'])
    
    dff_replace = dff[['제휴사', '대분류명_y', '중분류명', '소분류명', '금액 타입', 'RATING']]
    dff_replace.columns = ['제휴사', '대분류명', '중분류명', '추천 상품', '금액 타입', '예측 구매 지수']
    
    cat_grouped = dff_replace.groupby('대분류명')
    
    cat_list = list(dff_replace['대분류명'].unique())
    
    print(f' ♣ {cust} 고객님만을 위한 추천 상품이에요!!!! only for you~ ', '\n')
    for cat in cat_list:
        print()
        print(f' {cat} 추천 제품 ▼')
        display(cat_grouped.get_group(cat))
        
def get_recommend_cust_bought(df, cluster, cust, category_df):
    dict_custs = super_list_bought[cluster]
    df = pd.DataFrame(dict_custs[cust])
    df.columns = ['ITEM','RATING']
    df['ASSO'] = df['ITEM'].apply(lambda x: x[0])
    df['ITEM'] = df['ITEM'].apply(lambda x: x[2:])

    dff = df.merge(category_df, how='left', left_on=['ASSO','ITEM'], right_on=['제휴사', '소분류명'])
    
    dff_replace = dff[['제휴사', '대분류명_y', '중분류명', '소분류명', '금액 타입', 'RATING']]
    dff_replace.columns = ['제휴사', '대분류명', '중분류명', '추천 상품', '금액 타입', '예측 구매 지수']
    
    cat_grouped = dff_replace.groupby('대분류명')
    
    cat_list = list(dff_replace['대분류명'].unique())
    
    print(f' ♣ {cust} 고객님이 선호하는 상품이에요!!!! only for you~ ', '\n')
    # 출력값
    for cat in cat_list:
        print()
        print(f' {cat}의 추천 제품 ▼')
        display(cat_grouped.get_group(cat))
        
    
print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
get_recommend_cust(cust_df, 0,9538, final_cat)
print('')
print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   PURCHSED RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>', '\n')
get_recommend_cust_bought(cust_df, 0,9538, final_cat)


<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 

 ♣ 9538 고객님만을 위한 추천 상품이에요!!!! only for you~  


 식품 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
0,A,식품,건강식품,건강식품(비타민),고가,0.025146
2,A,식품,수산품,생선,고가,0.023207
3,A,식품,수산품,멸치류,고가,0.023109
6,A,식품,육류,돈육,고가,0.014686
7,A,식품,가공식품,브랑제리,중가,0.013438
8,A,식품,육류,양념육,고가,0.011135
11,A,식품,가공식품,밥류,중가,0.009534
12,B,식품,레스토랑,한식레스토랑,고가,0.008521
13,B,식품,레스토랑,한식레스토랑,고가,0.008521
14,A,식품,가공식품,일식델리,중가,0.008179



 의류 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
1,A,의류,캐주얼,영 캐릭터,중가,0.023651
10,A,의류,시티웨어,엘레강스,중가,0.010377
21,A,의류,커리어,커리어,중가,0.006113
28,A,의류,트래디셔널,트래디셔널,중가,0.005126
29,A,의류,트래디셔널,트래디셔널,중가,0.005126
31,A,의류,모피/피혁,숙녀고정행사,고가,0.004524
48,A,의류,남성 트랜디,TNGT,중가,0.00324
49,A,의류,남성정장,캠브리지,고가,0.003101



 이미용 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
4,A,이미용,화장품,기초 화장품,고가,0.018631
18,A,이미용,화장품,색조 화장품,고가,0.006921



 생활 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
5,A,생활,생활잡화,기능성침대,고가,0.017257
15,A,생활,생활잡화,토탈데코,고가,0.007822
19,A,생활,생활잡화,타월,고가,0.006534
47,C,생활,생리용품,생리대,중가,0.003246



 잡화 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
9,A,잡화,명품,힐앤토트,고가,0.010569
35,A,잡화,준보석/시계,시계,고가,0.004079
42,A,잡화,명품,수입잡화,고가,0.00344
51,A,잡화,명품,직수입,고가,0.002987



 주방 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
40,A,주방,주방가전,에어컨,고가,0.003673
41,A,주방,주방용품,수입주방,중가,0.00353



 침구 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
45,A,침구,침구/수예,N.B,고가,0.003332



<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<   PURCHSED RECOMMEND PRODUCTS    >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 

 ♣ 9538 고객님이 선호하는 상품이에요!!!! only for you~  


 식품의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
0,A,식품,농산물,청과,중가,0.059822
1,A,식품,육류,우육,고가,0.040212
2,A,식품,농산물,채소,중가,0.039253
3,A,식품,건강식품,건강식품(홍삼),고가,0.038219
4,A,식품,농산물,농산가공,중가,0.034674
5,A,식품,축산가공,유제품,중가,0.032244
6,A,식품,가공식품,한식델리,중가,0.022358
7,A,식품,가공식품,일반가공식품,중가,0.018854
8,A,식품,가공식품,제과제빵,중가,0.012236
10,A,식품,농산물,유기농채소,중가,0.011412



 잡화의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
9,A,잡화,명품,명품시계기타,고가,0.011752
16,A,잡화,일용잡화,단기행사,저가,0.010021
17,A,잡화,피혁잡화,단기행사,중가,0.010021
58,A,잡화,일용잡화,뷰티상품,저가,0.001906



 생활의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
14,A,생활,생활잡화,단기행사,고가,0.010021
30,C,생활,유아용품,유아/아동용칫솔,중가,0.003079
31,B,생활,생리대,팬티라이너,중가,0.00307
32,B,생활,여성위생용품,팬티라이너,중가,0.00307
42,B,생활,교환/보수용품,오일/첨가제,중가,0.002725
43,B,생활,자동차 정비용품,오일/첨가제,중가,0.002725
51,B,생활,유아위생용품,유아기저귀,고가,0.002123



 의류의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
15,A,의류,남성 트랜디,단기행사,중가,0.010021
39,A,의류,란제리/내의,패션내의,고가,0.002785



 교육문화의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
19,B,교육문화,남아,액션피겨,중가,0.008303
20,B,교육문화,BOYS 1,액션피겨,고가,0.008303
25,B,교육문화,EDUCATION 1,블럭,고가,0.006036
55,A,교육문화,문구/팬시,서적,중가,0.002017



 아동의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
27,A,아동,아동,L/C 아동복,중가,0.004397
40,A,아동,아동,유아용품,중가,0.002783
50,A,아동,아동,토들러,중가,0.002127



 레저취미의 추천 제품 ▼


Unnamed: 0,제휴사,대분류명,중분류명,추천 상품,금액 타입,예측 구매 지수
28,A,레저취미,골프용품,직수입 골프의류,고가,0.003951
59,A,레저취미,스포츠,스포츠의류,중가,0.001806
