# 알고리즘 예측 검증 노트북

### word2vec 기반 유저 프로파일 / 딜 피처

* logistic regression
* gradient boost classifier

In [1]:
import csv
import json
import pickle
import time
import urllib
import urllib.request
from operator import itemgetter

import elasticsearch
import matplotlib.font_manager as fm
import matplotlib.pyplot as plt
import pandas as pd
import requests

# Elastic Search 를 이용

In [2]:
# Module-level Elasticsearch client; the helper functions in this notebook
# read it as the global `es`.
es = elasticsearch.Elasticsearch('twiceSpark1:9200')

def es_search(es, deal_ids):
    """Fetch documents from the ``ojm5`` index by document id.

    Args:
        es: Client object exposing ``search(index=..., body=..., size=...)``.
        deal_ids: Iterable of document ids to look up.

    Returns:
        dict mapping each hit's ``_id`` to its ``_source``. Ids for which
        the search returns no hit are simply absent from the result.
    """
    ids = list(deal_ids)
    # Request room for every id that was asked for: a fixed size of 100
    # silently dropped results for larger batches.  100 stays the floor so
    # small requests are issued exactly as before.
    # NOTE(review): Elasticsearch rejects result windows larger than
    # index.max_result_window (10000 by default) - very large batches would
    # still need chunking by the caller.
    res = es.search(
        index='ojm5',
        body={'query': {'ids': {'values': ids}}},
        size=max(100, len(ids)),
    )
    return {hit['_id']: hit['_source'] for hit in res['hits']['hits']}
    

In [3]:
def show_preds(pred):
    """Plot a user's history deals, then every predicted candidate deal.

    Args:
        pred: Pair ``(history_ids, candidates)``. ``pred[0]`` is a sized
            sequence of deal ids drawn side by side in one figure;
            ``pred[1]`` is a sequence of ``(deal_id, slot, probability)``
            tuples, each drawn in its own figure.

    Deal metadata is looked up with ``es_search``; of each document, field
    ``_2`` is used as the title, ``_3`` as the image path (appended to the
    image base URL) and ``_4._5`` as the category label. Deals that the
    search does not return are reported with a printed message.
    """
    img_base_url = r'http://img.wemep.co.kr/'

    def load_image(path):
        # Close the HTTP response as soon as the image has been decoded.
        with urllib.request.urlopen(img_base_url + path) as resp:
            return plt.imread(resp, format='jpg')

    history_ids = pred[0]
    # plt.subplots() rejects ncols=0, so the history strip is skipped when
    # there is no history to draw.
    if len(history_ids) > 0:
        history = es_search(es, history_ids)
        # squeeze=False keeps `axes` two-dimensional even for one column;
        # with the default, a single history item yields a bare Axes object
        # and indexing it fails.
        _, axes = plt.subplots(ncols=len(history_ids), figsize=(16, 16), squeeze=False)
        for ax, deal in zip(axes[0], history_ids):
            if deal in history:
                info = history[deal]
                ax.imshow(load_image(info['_3']))
                ax.set_title("{}".format(info['_2']))
            else:
                print('no entry for product {}'.format(deal))
            ax.axis('off')
        plt.show()

    candidates = pred[1]
    res = es_search(es, [entry[0] for entry in candidates])

    for rank, (prod, slot, prob) in enumerate(candidates, start=1):
        if prod not in res:
            print('no entry for product {}.slot {}.prob {}'.format(prod, slot, prob))
            continue
        info = res[prod]
        cat = info['_4']['_5']
        title = info['_2']
        plt.imshow(load_image(info['_3']))
        plt.title("{}[{}] score:{} {} [{}]".format(rank, slot, prob, title, cat))
        plt.axis('off')
        plt.show()

In [4]:
def show_history_only(pred):
    """Plot the deals of a user's history side by side in one figure.

    Args:
        pred: Sequence whose first element is a sized sequence of deal ids;
            any further elements are not read.

    Deal metadata is looked up with ``es_search``; field ``_2`` of each
    document is used as the title and ``_3`` as the image path appended to
    the image base URL. Nothing is drawn when the history is empty, and
    deals the search does not return are reported with a printed message.
    """
    img_base_url = r'http://img.wemep.co.kr/'
    history_ids = pred[0]
    if len(history_ids) == 0:
        # plt.subplots() rejects ncols=0, so there is nothing to draw.
        return
    history = es_search(es, history_ids)

    # squeeze=False keeps `axes` two-dimensional even for one column; with
    # the default, a single history item yields a bare Axes object and
    # indexing it fails.
    _, axes = plt.subplots(ncols=len(history_ids), figsize=(16, 16), squeeze=False)

    for ax, deal in zip(axes[0], history_ids):
        if deal in history:
            info = history[deal]
            # Close the HTTP response as soon as the image has been decoded.
            with urllib.request.urlopen(img_base_url + info['_3']) as resp:
                image = plt.imread(resp, format='jpg')
            ax.imshow(image)
            ax.set_title("{}".format(info['_2']))
        else:
            print('no entry for product {}'.format(deal))
        ax.axis('off')
    plt.show()

In [5]:
def show_item_b(item_b, show_figure=False):
    """Display each scored deal in ``item_b`` as its own image figure.

    Args:
        item_b: Sequence of ``(deal_id, slot, score)`` tuples.
        show_figure: Currently has no effect - a figure is shown for every
            deal that is found. Kept so existing callers keep working.

    Deal metadata is looked up with ``es_search``; field ``_2`` is used as
    the title, ``_3`` as the image path and ``_4._5`` as the category
    label. Deals the search does not return are reported with a printed
    message instead of a figure.
    """
    img_base_url = r'http://img.wemep.co.kr/'

    res = es_search(es, [entry[0] for entry in item_b])

    # The 1-based rank shown in the title counts every entry, including
    # the ones that turn out to be missing from the search result.
    for rank, (deal, slot, score) in enumerate(item_b, start=1):
        if deal not in res:
            print('no entry for product {} score{}'.format(deal, score))
            continue
        info = res[deal]
        cat = info['_4']['_5']
        title = info['_2']
        # Close the HTTP response as soon as the image has been decoded.
        with urllib.request.urlopen(img_base_url + info['_3']) as resp:
            image = plt.imread(resp, format='jpg')
        plt.imshow(image)
        plt.title("{}[{}] score:{} {} [{}]".format(rank, slot, score, title, cat))
        plt.axis('off')
        plt.show()

In [6]:
def es_scan_extra_by_dids(dids):
    """Look up the ``mn`` and ``tn1`` fields of documents in ``dealinfos``.

    Args:
        dids: Sized collection of document ids to query; its length is
            used as the requested result size.

    Returns:
        dict mapping each hit's ``did`` field to a ``(mn, tn1)`` tuple.
    """
    request_body = {
        'from': 0,
        'size': len(dids),
        "_source": ["mn", "tn1", "did"],
        'query': {'ids': {'values': dids}},
    }
    response = es.search(index='dealinfos', body=request_body)
    sources = (hit['_source'] for hit in response['hits']['hits'])
    return {src['did']: (src['mn'], src['tn1']) for src in sources}

In [7]:
def es_read_wepick_setting(dt, start_slot=20):
    """Load the wepick settings stored for ``dt``.

    Args:
        dt: Value matched against the ``dt`` field with a term query on the
            ``wepick_setting_ext`` index.
        start_slot: Not used by this function; kept in the signature for
            existing callers.

    Returns:
        ``(dids, slot_to_did)`` built from the ``settings`` list of the
        first hit - the deal ids in stored order and a dict from slot to
        deal id - or ``(None, None)`` when the search reports no hits.
    """
    response = es.search(
        index='wepick_setting_ext',
        body={'query': {'term': {'dt': dt}}},
    )
    hits = response['hits']
    if not (hits['total'] > 0):
        return None, None

    # Only the first matching document is consulted.
    settings = hits['hits'][0]['_source']['settings']
    slot_to_did = {}
    dids = []
    for entry in settings:
        did = entry['did']
        slot_to_did[entry['slot']] = did
        dids.append(did)
    return dids, slot_to_did

In [8]:
def get_w2v_words_from_did(dids):
    """Collect the word lists stored for the given deal ids.

    Args:
        dids: Sized collection of deal ids matched against field ``v`` of
            the ``deal_word2vec`` index; its length is the requested
            result size.

    Returns:
        dict mapping each hit's ``v`` field to its ``words`` field; empty
        when the search reports no hits.
    """
    request_body = {
        "query": {"terms": {"v": dids}},
        "_source": ["words", "v"],
        "size": len(dids),
    }
    found = es.search(index='deal_word2vec', body=request_body)['hits']
    if not (found['total'] > 0):
        return {}
    return {
        hit['_source']['v']: hit['_source']['words']
        for hit in found['hits']
    }

In [9]:
def print_user_wepick_history(user_id, day_limit, gte_slot=20, lte_slot=100):
    """Return a user's wepick click history enriched with deal details.

    Despite the name, nothing is printed; the rows are returned.

    Args:
        user_id: Value matched against field ``u`` of ``wepick_seq``.
        day_limit: Exclusive upper bound for ``rgtime``.
        gte_slot: Inclusive lower bound for the clicked ``slot``.
        lte_slot: Inclusive upper bound for the clicked ``slot``.

    Returns:
        List of ``(did, rgtime, slot, mn, tn1, words)`` tuples, newest
        first, taken from at most 128 hits. Clicks whose deal is not
        returned by ``es_scan_extra_by_dids`` are dropped; ``words`` is an
        empty list when ``get_w2v_words_from_did`` has no entry for the
        deal. An empty list is returned when there are no clicks.
    """
    range_filters = [
        {"range": {"rgtime": {"lt": day_limit}}},
        {"range": {"slot": {"gte": gte_slot, "lte": lte_slot}}},
    ]
    request_body = {
        "query": {
            "bool": {
                "must": {"term": {"u": user_id}},
                "filter": range_filters,
            }
        },
        "size": 128,
        "sort": {"rgtime": "desc"},
    }
    hits = es.search(index='wepick_seq', body=request_body)['hits']

    clicks = []
    if hits['total'] > 0:
        for hit in hits['hits']:
            src = hit['_source']
            clicks.append((src['v'], src['rgtime'], src['slot']))
    if not clicks:
        return clicks

    # Both lookups receive the clicked deal ids in click order.
    deal_dic = es_scan_extra_by_dids([click[0] for click in clicks])
    deal_dcs = get_w2v_words_from_did([click[0] for click in clicks])

    enriched = []
    for did, ts, slot in clicks:
        if did not in deal_dic:
            continue
        words = deal_dcs[did] if did in deal_dcs else []
        enriched.append((did, ts, slot, deal_dic[did][0], deal_dic[did][1], words))
    return enriched

## Ranking 관련

In [48]:
def print_result(predictions, extra):
    """Print one tuple per prediction that has both details and a slot.

    Args:
        predictions: Iterable of ``(deal_id, score)`` pairs; each deal id
            is converted with ``int()``.
        extra: Mapping from integer deal id to an indexable whose first two
            items are printed around the slot.

    A pair is skipped when its deal id is missing from ``extra`` or from
    the module-level ``deal_to_slot`` mapping. Each printed tuple is
    ``(score, did, extra[did][0], deal_to_slot[did], extra[did][1])``.
    """
    for raw_did, score in predictions:
        did = int(raw_did)
        # `deal_to_slot` is only consulted once `extra` is known to hold
        # the deal, matching the nested checks this replaces.
        if did not in extra or did not in deal_to_slot:
            continue
        details = extra[did]
        line = (score, did, details[0], deal_to_slot[did], details[1])
        print(line)

In [17]:
def print_predicted_ranks(prediction):
    """Print ranked predictions together with their deal details.

    Args:
        prediction: Sequence of ``(deal_id, score)`` pairs. The deal ids
            are converted with ``int()`` and looked up through
            ``es_scan_extra_by_dids``; the pairs and the lookup result are
            then handed to ``print_result``.
    """
    dids = [int(pair[0]) for pair in prediction]
    extra = es_scan_extra_by_dids(dids)
    print_result(prediction, extra)

# 특정 시각의 위픽 대표값 로딩

In [40]:
# Load the wepick settings stored under dt '2018-04-11 21' (hour 21 of
# 2018-04-11).  es_read_wepick_setting returns (None, None) when nothing
# matches, in which case both names are None.
wepick_setting, wepick_dic = es_read_wepick_setting('2018-04-11 21')

In [42]:
# Invert the slot -> deal id mapping so a deal id can be mapped back to its
# slot; if a deal id occurs under several slots, the last one wins.
deal_to_slot = {did: slot for slot, did in wepick_dic.items()}

## logistic regression 예측

In [43]:
# Load the logistic-regression prediction table; the first CSV column
# becomes the row index.
lr_predict_df = pd.read_csv(r'd:\WMIND\temp\lr_review_0411_21.csv', index_col = 0)

In [44]:
# Map each row label of the prediction table to its (column, score) pairs
# sorted by descending score.  A comprehension is used instead of a
# module-level for-loop so that no loop variable - in particular one named
# `id`, which would shadow the builtin - leaks into the notebook namespace.
lr_pred_dic = {
    row_label: sorted(scores.to_dict().items(), key=itemgetter(1), reverse=True)
    for row_label, scores in lr_predict_df.iterrows()
}

In [45]:
# Fetch the wepick click history of user 8808210 with rgtime before
# '2018-04-11', restricted to slots >= 20 (the upper bound keeps its
# default).
result = print_user_wepick_history(8808210, '2018-04-11', gte_slot=20)

In [21]:
# Print every history row; the per-deal word list (dcs) is unpacked but
# left out of the output.
for did, ts, slot, title, tn, dcs in result:
    print(did, ts, slot, title, tn)

3512009 2018-04-09T12:52:13.956000 22 [원더쿠폰] 첼로걸, 20%할인 쿠폰 블라우스
3518099 2018-04-09T12:52:06.156000 20 [리빙위크] 리베 북유럽풍5단 서랍장 리빙박스/수납함
3505580 2018-04-07T23:30:17.141000 86 [무료배송] 파파라치 예쁜속옷! 여성 브라팬티 세트
3506322 2018-04-07T23:28:10.359000 61 [원더쿠폰] 구두/신발/+20%쿠폰 여성단화
3513258 2018-04-07T23:26:14.835000 51 [무료배송] 원피스/블라우스/쟈켓 원피스
3513720 2018-04-07T23:25:45.817000 50 [하객패션] 러브캣비쥬ACC + 20%쿠폰 귀걸이
3503845 2018-04-07T23:25:11.899000 36 [무료배송] 백화점 리에통+20%쿠폰 여성가방
3498161 2018-04-06T14:16:08.994000 82 [주말직구] 명품뷰티 기초 색조200종 색조메이크업
3501094 2018-04-06T14:15:36.561000 72 [무료배송] 탐스백 신상 여성가방 여성가방
3507104 2018-04-06T14:14:06.273000 33 [무료배송] 아디다스 400종 OPEN! 런닝화/운동화 기타
3499864 2018-04-05T08:49:49.479000 68 [무료배송] 매장판 리복 180종 런닝화
3483501 2018-04-05T08:48:40.225000 49 [봄신상패션] 스파오 티셔츠/팬츠 外 티셔츠
3472770 2018-04-05T08:47:11.305000 40 [봄신상패션] 엔비룩 추가 할인쿠폰! 블라우스
3505054 2018-04-05T08:47:01.390000 32 [타임특가] 펀업 블럭 핸드폰 케이스 스마트 액세서리
3505529 2018-04-05T08:46:53.371000 29 [투데이특가] 예쁘고편한 유카타잠옷 잠옷/홈웨어
3505723 2018-04-05T08:46:

In [49]:
# Print the logistic-regression ranking stored under key 8808210.
print_predicted_ranks(lr_pred_dic[8808210])

(0.7936674476928148, 3527477, '[투데이특가] 니트/가디건/원피스 외', 61, '원피스')
(0.7785943898652803, 3514459, '[심야특가] 파파야 여성 의류 모음전', 32, '티셔츠')
(0.6785739211042247, 3508946, '[리빙위크] 쉬즈홈 Best 이불 커튼', 64, '요')
(0.6766076320794777, 3515524, '[무료배송] 롱티/티셔츠/원피스', 45, '티셔츠')
(0.6714543282374738, 3524294, '[투데이특가] 여심저격 클루나드시계', 83, '패션시계')
(0.6591598507445531, 3513787, '[리빙위크] 3M 정전기청소포/물걸레', 67, '밀대/청소포')
(0.6507917476506134, 3525317, '[무료배송] 빅사이즈/원피스/롱티', 28, '원피스')
(0.6487084366814696, 3544419, '[게릴라특가] 봄맞이 카페트 150x200', 71, '카페트/러그')
(0.6395921595962396, 3525500, '[하객패션] 포커스 봄구성완벽해', 74, '티셔츠')
(0.6379578135348503, 3515690, '[투데이특가] 닥스셔츠 긴/반팔 BEST', 81, '셔츠/남방')
(0.602041843710961, 3504137, '[리빙위크] 비즈니스보루네오 소파', 78, '소파')
(0.6006457837532824, 3527053, '[투데이특가] 더사랑이 여름 아동복', 43, '아동공용의류')
(0.5951147540178241, 3522395, '[롯데] 르까프 아동/성인 빅세일', 46, '남성 티셔츠/상의 기타')
(0.589846201719842, 3512593, '[무료배송] 봄 아동복 브랜드 연합전', 36, '아동공용의류')
(0.5872673066234849, 3527575, '[무료배송] 프롬유 ~20%할인쿠폰', 55, '티셔츠')
(0.571967832199

## gboost 예측

In [50]:
# Load the gradient-boosting prediction table; the first CSV column becomes
# the row index.
gbc_predict_df = pd.read_csv(r'd:\WMIND\temp\gbc_review_0411_21.csv', index_col = 0)

In [51]:
# Map each row label of the prediction table to its (column, score) pairs
# sorted by descending score.  A comprehension is used instead of a
# module-level for-loop so that no loop variable - in particular one named
# `id`, which would shadow the builtin - leaks into the notebook namespace.
gbc_pred_dic = {
    row_label: sorted(scores.to_dict().items(), key=itemgetter(1), reverse=True)
    for row_label, scores in gbc_predict_df.iterrows()
}

In [52]:
# Fetch the wepick click history of user 8808210 with rgtime before
# '2018-04-11', restricted to slots >= 20 (the upper bound keeps its
# default).
result = print_user_wepick_history(8808210, '2018-04-11', gte_slot=20)

In [26]:
# Print every history row; the per-deal word list (dcs) is unpacked but
# left out of the output.
for did, ts, slot, title, tn, dcs in result:
    print(did, ts, slot, title, tn)

3512009 2018-04-09T12:52:13.956000 22 [원더쿠폰] 첼로걸, 20%할인 쿠폰 블라우스
3518099 2018-04-09T12:52:06.156000 20 [리빙위크] 리베 북유럽풍5단 서랍장 리빙박스/수납함
3505580 2018-04-07T23:30:17.141000 86 [무료배송] 파파라치 예쁜속옷! 여성 브라팬티 세트
3506322 2018-04-07T23:28:10.359000 61 [원더쿠폰] 구두/신발/+20%쿠폰 여성단화
3513258 2018-04-07T23:26:14.835000 51 [무료배송] 원피스/블라우스/쟈켓 원피스
3513720 2018-04-07T23:25:45.817000 50 [하객패션] 러브캣비쥬ACC + 20%쿠폰 귀걸이
3503845 2018-04-07T23:25:11.899000 36 [무료배송] 백화점 리에통+20%쿠폰 여성가방
3498161 2018-04-06T14:16:08.994000 82 [주말직구] 명품뷰티 기초 색조200종 색조메이크업
3501094 2018-04-06T14:15:36.561000 72 [무료배송] 탐스백 신상 여성가방 여성가방
3507104 2018-04-06T14:14:06.273000 33 [무료배송] 아디다스 400종 OPEN! 런닝화/운동화 기타
3499864 2018-04-05T08:49:49.479000 68 [무료배송] 매장판 리복 180종 런닝화
3483501 2018-04-05T08:48:40.225000 49 [봄신상패션] 스파오 티셔츠/팬츠 外 티셔츠
3472770 2018-04-05T08:47:11.305000 40 [봄신상패션] 엔비룩 추가 할인쿠폰! 블라우스
3505054 2018-04-05T08:47:01.390000 32 [타임특가] 펀업 블럭 핸드폰 케이스 스마트 액세서리
3505529 2018-04-05T08:46:53.371000 29 [투데이특가] 예쁘고편한 유카타잠옷 잠옷/홈웨어
3505723 2018-04-05T08:46:

In [54]:
# Print the gradient-boosting ranking stored under key 8808210.
print_predicted_ranks(gbc_pred_dic[8808210])

(0.7113136103323243, 3522395, '[롯데] 르까프 아동/성인 빅세일', 46, '남성 티셔츠/상의 기타')
(0.7041484917624345, 3514459, '[심야특가] 파파야 여성 의류 모음전', 32, '티셔츠')
(0.7024690784010017, 3525500, '[하객패션] 포커스 봄구성완벽해', 74, '티셔츠')
(0.7002006587651551, 3527477, '[투데이특가] 니트/가디건/원피스 외', 61, '원피스')
(0.6940005388097746, 3515690, '[투데이특가] 닥스셔츠 긴/반팔 BEST', 81, '셔츠/남방')
(0.6932612798325845, 3529165, '[하객패션] 락피쉬 18년S/S+20%쿠폰!', 70, '여성단화')
(0.6631775562271357, 3521050, '[하객패션] 엔비룩 봄신상최애템~', 4, '블라우스')
(0.6539876609560208, 3515524, '[무료배송] 롱티/티셔츠/원피스', 45, '티셔츠')
(0.6521534766796955, 3527575, '[무료배송] 프롬유 ~20%할인쿠폰', 55, '티셔츠')
(0.6486066521817627, 3522402, '[무료배송] 에비수 본사특가 20%쿠폰', 44, '티셔츠')
(0.6262001571662322, 3512593, '[무료배송] 봄 아동복 브랜드 연합전', 36, '아동공용의류')
(0.6158906365831677, 3527569, '[투데이특가] 아디다스 그래픽스케일', 91, '반팔 티셔츠')
(0.5845235992012867, 3527053, '[투데이특가] 더사랑이 여름 아동복', 43, '아동공용의류')
(0.5736536968124379, 3525317, '[무료배송] 빅사이즈/원피스/롱티', 28, '원피스')
(0.5506777337072294, 3526985, '[원더쿠폰] 봄 귀걸이,추가할인쿠폰', 42, '귀걸이')
(0.5409439058