# word2vec 기반 유저 프로파일 / 딜 피처
* logistic regression
* gradient boosting classifier

In [1]:
import requests
import time
import urllib
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import json
import elasticsearch
import csv
import pickle
from operator import itemgetter
import pandas as pd

# 예측 결과 로딩

# Elasticsearch를 이용

In [29]:
# Module-level Elasticsearch client, read as a global by the query helpers
# in this notebook; it targets the node at 'twiceSpark1:9200'.
es = elasticsearch.Elasticsearch('twiceSpark1:9200')

def es_search(es, deal_ids):
    """Fetch documents from the 'ojm5' index by document id.

    Args:
        es: Elasticsearch client (any object exposing a compatible
            ``search(index=..., body=..., size=...)`` method).
        deal_ids: Sized collection of document ids to look up.

    Returns:
        dict mapping each hit's ``_id`` to its ``_source``. Ids without a
        matching document are simply absent from the result.
    """
    # A fixed window of 100 silently drops hits once more than 100 ids are
    # requested, so grow the window with the request (never below 100).
    size = max(len(deal_ids), 100)
    res = es.search(index='ojm5',
                    body={'query': {'ids': {'values': deal_ids}}},
                    size=size)
    return {hit['_id']: hit['_source'] for hit in res['hits']['hits']}
    

In [30]:
def show_preds(pred):
    """Plot a user's history deals, then every predicted deal.

    Args:
        pred: Indexable pair. ``pred[0]`` is the list of history deal ids;
            ``pred[1]`` is an iterable of ``(prod, slot, prob)`` tuples.

    The history is drawn as a single row of thumbnails, each titled with the
    document's ``_2`` field. Each prediction that has a document in
    Elasticsearch is then shown in its own figure; predictions without a
    document are reported with ``print`` instead.

    Raises:
        KeyError: if a history id has no document in the search result.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    img_base_url = r'http://img.wemep.co.kr/'
    history_ids = pred[0]
    ranked = pred[1]

    # plt.subplots rejects ncols=0, so skip the strip for an empty history.
    if len(history_ids) > 0:
        history = es_search(es, history_ids)
        # squeeze=False keeps `axes` two-dimensional even for one history
        # item; the default would return a bare Axes that cannot be indexed.
        _fig, axes = plt.subplots(ncols=len(history_ids), figsize=(16, 16),
                                  squeeze=False)
        for ax, h in zip(axes[0], history_ids):
            title = history[h]['_2']
            img = history[h]['_3']
            # Close the HTTP response as soon as the image has been decoded.
            with urllib.request.urlopen(img_base_url + img) as f:
                a = plt.imread(f, format='jpg')
            ax.imshow(a)
            ax.set_title("{}".format(title))
            ax.axis('off')
        plt.show()

    candidate_ids = [entry[0] for entry in ranked]
    res = es_search(es, candidate_ids)

    # `i` is the 1-based rank and also counts predictions with no document.
    for i, (prod, slot, prob) in enumerate(ranked, start=1):
        if prod not in res:
            print('no entry for product {}.slot {}.prob {}'.format(prod, slot, prob))
            continue

        cat = res[prod]['_4']['_5']
        title = res[prod]['_2']
        img = res[prod]['_3']

        with urllib.request.urlopen(img_base_url + img) as f:
            # read the image file in a numpy array
            a = plt.imread(f, format='jpg')
        plt.imshow(a)
        plt.title("{}[{}] score:{} {} [{}]".format(i, slot, prob, title, cat))
        plt.axis('off')
        plt.show()

In [31]:
def show_history_only(pred):
    """Plot only the history deals of a prediction as one row of thumbnails.

    Args:
        pred: Indexable whose first element ``pred[0]`` is the list of
            history deal ids. Other elements are ignored.

    Each thumbnail is titled with the document's ``_2`` field and loaded
    from the path stored in its ``_3`` field.

    Raises:
        KeyError: if a history id has no document in the search result.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    img_base_url = r'http://img.wemep.co.kr/'
    history_ids = pred[0]

    # plt.subplots rejects ncols=0, so there is nothing to draw when empty.
    if len(history_ids) == 0:
        return

    history = es_search(es, history_ids)

    # squeeze=False keeps `axes` two-dimensional even for one history item;
    # the default would return a bare Axes that cannot be indexed.
    _fig, axes = plt.subplots(ncols=len(history_ids), figsize=(16, 16),
                              squeeze=False)

    for ax, h in zip(axes[0], history_ids):
        title = history[h]['_2']
        img = history[h]['_3']
        # Close the HTTP response as soon as the image has been decoded.
        with urllib.request.urlopen(img_base_url + img) as f:
            a = plt.imread(f, format='jpg')
        ax.imshow(a)
        ax.set_title("{}".format(title))
        ax.axis('off')
    plt.show()

In [32]:
def show_item_b(item_b, show_figure=False):
    """Show one figure per scored deal, in the order given.

    Args:
        item_b: Iterable of ``(deal, slot, score)`` tuples; it is traversed
            twice, so it must be re-iterable (e.g. a list).
        show_figure: Accepted but not used by this function; figures are
            always shown. Left untouched so existing callers keep working.

    Deals found in Elasticsearch are drawn with a title made of rank, slot,
    score, title (``_2``) and category (``_4._5``). Deals without a document
    are reported with ``print`` instead.
    """
    # `import urllib` alone does not guarantee the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    img_base_url = r'http://img.wemep.co.kr/'

    candidates = [entry[0] for entry in item_b]
    res = es_search(es, candidates)

    # `i` is the 1-based rank and also counts deals with no document.
    for i, (deal, slot, score) in enumerate(item_b, start=1):
        if deal not in res:
            print('no entry for product {} score{}'.format(deal, score))
            continue

        cat = res[deal]['_4']['_5']
        title = res[deal]['_2']
        img = res[deal]['_3']

        # Close the HTTP response as soon as the image has been decoded.
        with urllib.request.urlopen(img_base_url + img) as f:
            # read the image file in a numpy array
            a = plt.imread(f, format='jpg')
        plt.imshow(a)
        plt.title("{}[{}] score:{} {} [{}]".format(i, slot, score, title, cat))
        plt.axis('off')
        plt.show()

In [33]:
def es_scan_extra_by_dids(dids):
    """Look up the ``mn`` and ``tn1`` fields for the given deal ids.

    Runs an ids query against the 'dealinfos' index through the module-level
    ``es`` client, requesting at most ``len(dids)`` hits.

    Args:
        dids: Sized collection of document ids.

    Returns:
        dict mapping each hit's ``did`` field to a ``(mn, tn1)`` tuple.
    """
    request_body = {
        'from': 0,
        'size': len(dids),
        "_source": ["mn", "tn1", "did"],
        'query': {'ids': {'values': dids}},
    }
    response = es.search(index='dealinfos', body=request_body)
    sources = (hit['_source'] for hit in response['hits']['hits'])
    return {src['did']: (src['mn'], src['tn1']) for src in sources}

In [100]:
def es_read_wepick_setting(dt, start_slot=20):
    """Load the WePick slot settings stored for ``dt``.

    Runs a term query on the ``dt`` field of the 'wepick_setting_ext' index
    and reads the ``settings`` list of the first hit only.

    Args:
        dt: Value matched against the ``dt`` field.
        start_slot: Accepted but not used by this function.

    Returns:
        ``(vec, dic)`` where ``vec`` lists every setting's ``did`` in stored
        order and ``dic`` maps ``slot`` to ``did``; ``(None, None)`` when the
        query matches nothing.
    """
    response = es.search(index='wepick_setting_ext',
                         body={'query': {'term': {'dt': dt}}})
    if not (response['hits']['total'] > 0):
        return None, None

    settings = response['hits']['hits'][0]['_source']['settings']
    slot_to_did = {}
    did_sequence = []
    for entry in settings:
        slot_to_did[entry['slot']] = entry['did']
        did_sequence.append(entry['did'])
    return did_sequence, slot_to_did

In [35]:
def get_w2v_words_from_did(dids):
    """Collect the word lists stored for the given deal numbers.

    Runs a terms query on the ``v`` field of the 'deal_word2vec' index
    through the module-level ``es`` client, requesting at most ``len(dids)``
    hits.

    Args:
        dids: Sized collection of deal numbers.

    Returns:
        dict mapping each hit's ``v`` field to its ``words`` field; empty
        when nothing matches.
    """
    response = es.search(
        index='deal_word2vec',
        body={
            "query": {"terms": {"v": dids}},
            "_source": ["words", "v"],
            "size": len(dids),
        },
    )
    found = response['hits']
    if not (found['total'] > 0):
        return {}
    return {hit['_source']['v']: hit['_source']['words'] for hit in found['hits']}

In [36]:
def print_user_wepick_history(user_id, day_limit, gte_slot=20, lte_slot=100):
    """Return a user's WePick click history (nothing is printed).

    Queries the 'wepick_seq' index for documents whose ``u`` equals
    ``user_id``, whose ``rgtime`` is strictly before ``day_limit`` and whose
    ``slot`` lies in ``[gte_slot, lte_slot]``. At most 128 hits are fetched,
    sorted by ``rgtime`` descending.

    Args:
        user_id: Value matched against the ``u`` field.
        day_limit: Exclusive upper bound for ``rgtime``.
        gte_slot: Inclusive lower bound for ``slot``.
        lte_slot: Inclusive upper bound for ``slot``.

    Returns:
        list of ``(did, rgtime, slot, mn, tn1, words)`` tuples. Clicks whose
        deal is missing from ``es_scan_extra_by_dids`` are dropped; ``words``
        is ``[]`` when ``get_w2v_words_from_did`` has no entry for the deal.
        An empty list is returned when nothing matches.
    """
    time_filter = {"range": {"rgtime": {"lt": day_limit}}}
    slot_filter = {"range": {"slot": {"gte": gte_slot, "lte": lte_slot}}}
    request_body = {
        "query": {
            "bool": {
                "must": {"term": {"u": user_id}},
                "filter": [time_filter, slot_filter],
            }
        },
        "size": 128,
        "sort": {"rgtime": "desc"},
    }
    response = es.search(index='wepick_seq', body=request_body)

    clicks = []
    if response['hits']['total'] > 0:
        for hit in response['hits']['hits']:
            src = hit['_source']
            clicks.append((src['v'], src['rgtime'], src['slot']))
    if not clicks:
        return clicks

    # Enrich every click with deal metadata and its word list.
    deal_dic = es_scan_extra_by_dids([click[0] for click in clicks])
    deal_dcs = get_w2v_words_from_did([click[0] for click in clicks])
    return [
        (did, ts, slot, deal_dic[did][0], deal_dic[did][1],
         deal_dcs[did] if did in deal_dcs else [])
        for did, ts, slot in clicks
        if did in deal_dic
    ]

In [101]:
# Load the WePick setting for one date/hour key.
# NOTE(review): this comment originally named 2018-04-11 21:00, but the call
# passes '2018-04-10 21' — confirm which date is intended.
wepick_setting, wepick_dic = es_read_wepick_setting('2018-04-10 21')

In [102]:
# Invert the slot -> deal id mapping so a deal id can be resolved to its slot.
deal_to_slot = {did: slot for slot, did in wepick_dic.items()}

In [104]:
# Bare expression: the notebook echoes the deal id -> slot mapping.
deal_to_slot

{1432589: 13,
 1432649: 23,
 2271029: 34,
 2828078: 77,
 2833857: 44,
 3119252: 50,
 3182979: 30,
 3198909: 21,
 3312415: 61,
 3345369: 64,
 3360824: 19,
 3419534: 10,
 3433098: 39,
 3471308: 35,
 3477356: 27,
 3478607: 56,
 3479353: 16,
 3480437: 31,
 3495780: 68,
 3495953: 20,
 3499106: 65,
 3501098: 73,
 3501642: 87,
 3503208: 55,
 3503234: 7,
 3504379: 90,
 3504778: 43,
 3504800: 84,
 3505210: 6,
 3505675: 58,
 3506024: 80,
 3507148: 38,
 3508976: 42,
 3509649: 17,
 3511692: 14,
 3512551: 28,
 3512963: 60,
 3513118: 53,
 3513125: 1,
 3513276: 36,
 3513863: 46,
 3513973: 79,
 3514159: 47,
 3514241: 24,
 3514324: 9,
 3514358: 51,
 3514385: 91,
 3514623: 26,
 3514977: 63,
 3515652: 4,
 3516021: 49,
 3516557: 29,
 3516905: 18,
 3517960: 40,
 3518367: 85,
 3518662: 66,
 3518904: 12,
 3519150: 5,
 3519493: 11,
 3520774: 88,
 3520942: 103,
 3520946: 37,
 3520992: 78,
 3521079: 75,
 3521090: 57,
 3521092: 48,
 3521100: 33,
 3521251: 89,
 3521260: 76,
 3521284: 67,
 3521412: 54,
 3521485: 8

## Ranking 관련

In [80]:
def print_result(predictions, extra):
    """Print one tuple per prediction that has deal metadata.

    Args:
        predictions: Iterable of ``(did, score)`` pairs; ``did`` is converted
            with ``int`` before any lookup.
        extra: dict mapping an integer deal id to a pair whose two elements
            are printed as the third and fifth tuple fields.

    Each printed tuple is ``(score, did, extra[did][0], slot, extra[did][1])``
    where ``slot`` comes from the module-level ``deal_to_slot`` mapping, or
    ``-1`` when the deal is not in it. Predictions whose deal id is missing
    from ``extra`` are skipped silently.
    """
    for raw_did, score in predictions:
        did = int(raw_did)
        if did not in extra:
            continue
        info = extra[did]
        slot = deal_to_slot[did] if did in deal_to_slot else -1
        print((score, did, info[0], slot, info[1]))

In [72]:
def print_predicted_ranks(prediction):
    """Fetch deal metadata for a ranked prediction list and print it.

    Args:
        prediction: Re-iterable sequence of ``(did, score)`` pairs; each
            ``did`` is converted with ``int`` for the metadata lookup.
    """
    dids = [int(entry[0]) for entry in prediction]
    extra = es_scan_extra_by_dids(dids)
    print_result(prediction, extra)

## logistic regression 예측

In [73]:
# Load the logistic-regression scores; the first CSV column becomes the index.
# Presumably one row per user and one column per deal id — inferred from how
# the frame is consumed below; confirm against the CSV producer.
lr_predict_df = pd.read_csv(r'd:\WMIND\temp\lr_review_0411_21.csv', index_col = 0)

In [74]:
# Map each row's index label to its (column, score) pairs, best score first.
lr_pred_dic = {}
# The loop variable is `user_id`, not `id`, so the builtin id() is not
# shadowed at module level for the rest of the notebook.
for user_id, row in lr_predict_df.iterrows():
    lr_pred_dic[user_id] = sorted(row.to_dict().items(), key=itemgetter(1), reverse=True)

In [75]:
# History of user 8808210: rgtime before '2018-04-11', slot between 20 and
# the default upper bound of 100.
result = print_user_wepick_history(8808210, '2018-04-11', gte_slot=20)

In [58]:
# Print one line per history entry; the word list (dcs) is unpacked but not shown.
for did, ts, slot, title, tn, dcs in result:
    print(did, ts, slot, title, tn)

3512009 2018-04-09T12:52:13.956000 22 [원더쿠폰] 첼로걸, 20%할인 쿠폰 블라우스
3518099 2018-04-09T12:52:06.156000 20 [리빙위크] 리베 북유럽풍5단 서랍장 리빙박스/수납함
3505580 2018-04-07T23:30:17.141000 86 [무료배송] 파파라치 예쁜속옷! 여성 브라팬티 세트
3506322 2018-04-07T23:28:10.359000 61 [원더쿠폰] 구두/신발/+20%쿠폰 여성단화
3513258 2018-04-07T23:26:14.835000 51 [무료배송] 원피스/블라우스/쟈켓 원피스
3513720 2018-04-07T23:25:45.817000 50 [하객패션] 러브캣비쥬ACC + 20%쿠폰 귀걸이
3503845 2018-04-07T23:25:11.899000 36 [무료배송] 백화점 리에통+20%쿠폰 여성가방
3498161 2018-04-06T14:16:08.994000 82 [주말직구] 명품뷰티 기초 색조200종 색조메이크업
3501094 2018-04-06T14:15:36.561000 72 [무료배송] 탐스백 신상 여성가방 여성가방
3507104 2018-04-06T14:14:06.273000 33 [무료배송] 아디다스 400종 OPEN! 런닝화/운동화 기타
3499864 2018-04-05T08:49:49.479000 68 [무료배송] 매장판 리복 180종 런닝화
3483501 2018-04-05T08:48:40.225000 49 [봄신상패션] 스파오 티셔츠/팬츠 外 티셔츠
3472770 2018-04-05T08:47:11.305000 40 [봄신상패션] 엔비룩 추가 할인쿠폰! 블라우스
3505054 2018-04-05T08:47:01.390000 32 [타임특가] 펀업 블럭 핸드폰 케이스 스마트 액세서리
3505529 2018-04-05T08:46:53.371000 29 [투데이특가] 예쁘고편한 유카타잠옷 잠옷/홈웨어
3505723 2018-04-05T08:46:

In [105]:
# Print the logistic-regression ranking stored under key 8808210.
print_predicted_ranks(lr_pred_dic[8808210])

(0.7936674476928148, 3527477, '[투데이특가] 니트/가디건/원피스 외', -1, '원피스')
(0.7785943898652803, 3514459, '[심야특가] 파파야 여성 의류 모음전', -1, '티셔츠')
(0.7219723510724965, 3521090, '[어린이날] 인기 브랜드슈즈 파격가!', 57, '유아동신발')
(0.6890653295486322, 3501098, '[무료배송] 신디키즈 봄 아동복', 73, '아동공용의류')
(0.6852484731095589, 3505800, '[하객패션] 스파오 봄여름 의류/잡화', -1, '티셔츠')
(0.6785739211042247, 3508946, '[리빙위크] 쉬즈홈 Best 이불 커튼', -1, '요')
(0.6766076320794777, 3515524, '[무료배송] 롱티/티셔츠/원피스', -1, '티셔츠')
(0.6737932497312635, 3513863, '[하객패션] 청바지/데님/스키니진', 46, '청바지/진')
(0.6729759298437094, 3525812, '[리빙위크] 마켓비 조명 서랍장 선반', 22, '서랍장')
(0.6714543282374738, 3524294, '[투데이특가] 여심저격 클루나드시계', -1, '패션시계')
(0.6684625493713289, 3521412, '[투데이특가] 원피스/반팔티/맨투맨', 54, '티셔츠')
(0.6653919932217346, 3508976, '[하객패션] 마이수야, 20% 할인쿠폰', 42, '블라우스')
(0.6621619841484573, 3513973, '[투데이특가] 원피스/블라우스/스커', 79, '원피스')
(0.6591598507445531, 3513787, '[리빙위크] 3M 정전기청소포/물걸레', -1, '밀대/청소포')
(0.6507917476506134, 3525317, '[무료배송] 빅사이즈/원피스/롱티', -1, '원피스')
(0.6487084366814696, 35444

## gradient boosting 예측

In [61]:
# Load the gradient-boosting scores; the first CSV column becomes the index.
# Presumably one row per user and one column per deal id — inferred from how
# the frame is consumed below; confirm against the CSV producer.
gbc_predict_df = pd.read_csv(r'd:\WMIND\temp\gbc_review_0411_21.csv', index_col = 0)

In [62]:
# Map each row's index label to its (column, score) pairs, best score first.
gbc_pred_dic = {}
# The loop variable is `user_id`, not `id`, so the builtin id() is not
# shadowed at module level for the rest of the notebook.
for user_id, row in gbc_predict_df.iterrows():
    gbc_pred_dic[user_id] = sorted(row.to_dict().items(), key=itemgetter(1), reverse=True)

In [63]:
# History of user 8808210: rgtime before '2018-04-11', slot between 20 and
# the default upper bound of 100.
result = print_user_wepick_history(8808210, '2018-04-11', gte_slot=20)

In [64]:
# Print one line per history entry; the word list (dcs) is unpacked but not shown.
for did, ts, slot, title, tn, dcs in result:
    print(did, ts, slot, title, tn)

3512009 2018-04-09T12:52:13.956000 22 [원더쿠폰] 첼로걸, 20%할인 쿠폰 블라우스
3518099 2018-04-09T12:52:06.156000 20 [리빙위크] 리베 북유럽풍5단 서랍장 리빙박스/수납함
3505580 2018-04-07T23:30:17.141000 86 [무료배송] 파파라치 예쁜속옷! 여성 브라팬티 세트
3506322 2018-04-07T23:28:10.359000 61 [원더쿠폰] 구두/신발/+20%쿠폰 여성단화
3513258 2018-04-07T23:26:14.835000 51 [무료배송] 원피스/블라우스/쟈켓 원피스
3513720 2018-04-07T23:25:45.817000 50 [하객패션] 러브캣비쥬ACC + 20%쿠폰 귀걸이
3503845 2018-04-07T23:25:11.899000 36 [무료배송] 백화점 리에통+20%쿠폰 여성가방
3498161 2018-04-06T14:16:08.994000 82 [주말직구] 명품뷰티 기초 색조200종 색조메이크업
3501094 2018-04-06T14:15:36.561000 72 [무료배송] 탐스백 신상 여성가방 여성가방
3507104 2018-04-06T14:14:06.273000 33 [무료배송] 아디다스 400종 OPEN! 런닝화/운동화 기타
3499864 2018-04-05T08:49:49.479000 68 [무료배송] 매장판 리복 180종 런닝화
3483501 2018-04-05T08:48:40.225000 49 [봄신상패션] 스파오 티셔츠/팬츠 外 티셔츠
3472770 2018-04-05T08:47:11.305000 40 [봄신상패션] 엔비룩 추가 할인쿠폰! 블라우스
3505054 2018-04-05T08:47:01.390000 32 [타임특가] 펀업 블럭 핸드폰 케이스 스마트 액세서리
3505529 2018-04-05T08:46:53.371000 29 [투데이특가] 예쁘고편한 유카타잠옷 잠옷/홈웨어
3505723 2018-04-05T08:46:

In [65]:
# Print the gradient-boosting ranking stored under key 8808210.
print_predicted_ranks(gbc_pred_dic[8808210])

(0.806966813568699, 3525812, '[리빙위크] 마켓비 조명 서랍장 선반', '서랍장')
(0.7381653562475311, 3505800, '[하객패션] 스파오 봄여름 의류/잡화', '티셔츠')
(0.7250965637694399, 3508976, '[하객패션] 마이수야, 20% 할인쿠폰', '블라우스')
(0.7238992581340316, 3513863, '[하객패션] 청바지/데님/스키니진', '청바지/진')
(0.7113136103323243, 3522395, '[롯데] 르까프 아동/성인 빅세일', '남성 티셔츠/상의 기타')
(0.7041484917624345, 3514459, '[심야특가] 파파야 여성 의류 모음전', '티셔츠')
(0.7024690784010017, 3525500, '[하객패션] 포커스 봄구성완벽해', '티셔츠')
(0.7002006587651551, 3527477, '[투데이특가] 니트/가디건/원피스 외', '원피스')
(0.6940005388097746, 3515690, '[투데이특가] 닥스셔츠 긴/반팔 BEST', '셔츠/남방')
(0.6933413845440604, 3346365, '[결혼해봄] 위닉스 제습기 5종', '제습기')
(0.6932612798325845, 3529165, '[하객패션] 락피쉬 18년S/S+20%쿠폰!', '여성단화')
(0.6757429646077227, 3524297, '[심야특가] 신발/운동화/구두/슬립온', '가보시/웨지힐/통굽')
(0.6756433847120844, 3521284, '[어린이날] 월튼 나들이 룩 ', '아동공용의류')
(0.6631775562271357, 3521050, '[하객패션] 엔비룩 봄신상최애템~', '블라우스')
(0.6621463133800314, 3521412, '[투데이특가] 원피스/반팔티/맨투맨', '티셔츠')
(0.6539876609560208, 3515524, '[무료배송] 롱티/티셔츠/원피스', '티셔츠')
(0.652153476