In [1]:
import pandas as pd
import json
from scipy import stats
import numpy as np

## 제품 클릭 데이터

In [2]:
read_path = '../csv/click_1911.csv'
click = pd.read_csv(read_path)

click = click.rename(columns={'target_id': 'pid'})

click['pid'] = click['pid'].astype(str)

click.tail()

Unnamed: 0,updated,pid,keyword
71493417,2019-11-07 23:59:55,95136963,
71493418,2019-11-07 23:59:55,96203304,
71493419,2019-11-07 23:59:55,106010080,오소이
71493420,2019-11-07 23:59:55,111370984,멜론
71493421,2019-11-07 23:59:55,111111572,


## 제품 태그 데이터

In [3]:
read_path = '../csv/product_keyword_1911.csv'
product = pd.read_csv(read_path)

product['pid'] = product['pid'].astype(str)

product.tail()

Unnamed: 0,pid,create_date,keyword
427470,111374143,2019-11-07 23:55:44.494597,"나이키패딩XL,스우시패딩,양면패딩"
427471,111374207,2019-11-07 23:56:31.566875,
427472,111374271,2019-11-07 23:57:21.458948,"아이폰,갤럭시,갤럭시s10,노트10플러스,갤럭시노트"
427473,111374335,2019-11-07 23:58:14.403435,나이키 후드점퍼
427474,111374399,2019-11-07 23:59:22.621879,"소아온,나와호랑이님,라이트노벨,라노벨"


In [4]:

def keyword_count(row):
            
    if pd.isnull(row['keyword']):
        return 0
    else:
        replaced = str(row['keyword']).replace(',', '')
        return len(str(row['keyword'])) - len(replaced) + 1

product['keyword_count'] = product.apply(keyword_count, axis=1)

product

Unnamed: 0,pid,create_date,keyword,keyword_count
0,110954880,2019-11-01 00:00:00.000000,,0
1,110957120,2019-11-01 00:00:00.000000,"여성장지갑,여성장지갑선물,여성장지갑여자지갑,여성장지갑추천",4
2,110957760,2019-11-01 00:00:00.000000,"에어팟,2세대,애플,꿀",4
3,110962432,2019-11-01 00:00:00.000000,"샤넬,클러치,샤넬클러치,이미테이션",4
4,110964608,2019-11-01 00:00:00.000000,"판도라반지54호,판도라정품반지,판도라반지",3
...,...,...,...,...
427470,111374143,2019-11-07 23:55:44.494597,"나이키패딩XL,스우시패딩,양면패딩",3
427471,111374207,2019-11-07 23:56:31.566875,,0
427472,111374271,2019-11-07 23:57:21.458948,"아이폰,갤럭시,갤럭시s10,노트10플러스,갤럭시노트",5
427473,111374335,2019-11-07 23:58:14.403435,나이키 후드점퍼,1


In [27]:
product.groupby(['keyword_count'], as_index=False).agg({'pid': 'count'})

Unnamed: 0,keyword_count,pid
0,0,60633
1,1,48868
2,2,35362
3,3,51117
4,4,58170
5,5,173319
6,6,3
7,11,1
8,17,1
9,19,1


## merge click with product data

In [5]:
click_product = pd.merge(click, product, on='pid', suffixes=('_click', '_product'))

click_product

Unnamed: 0,updated,pid,keyword_click,create_date,keyword_product,keyword_count
0,2019-11-01 00:01:06,110946993,,2019-11-01 00:00:24.533452,"셀린자켓,by1031",2
1,2019-11-01 00:05:49,110946993,,2019-11-01 00:00:24.533452,"셀린자켓,by1031",2
2,2019-11-01 00:01:10,110946993,,2019-11-01 00:00:24.533452,"셀린자켓,by1031",2
3,2019-11-01 00:02:35,110946993,,2019-11-01 00:00:24.533452,"셀린자켓,by1031",2
4,2019-11-01 00:16:46,110946993,,2019-11-01 00:00:24.533452,"셀린자켓,by1031",2
...,...,...,...,...,...,...
17632037,2019-11-07 23:59:37,111373252,공기계 울산,2019-11-07 23:43:37.648117,"Q7,Q720,중고폰,부산중고폰,울산중고폰",5
17632038,2019-11-07 23:59:37,111374187,,2019-11-07 23:56:14.573294,"넷플릭스,#영화,#TV",3
17632039,2019-11-07 23:59:54,111374426,,2019-11-07 23:59:46.963877,"디즈니,디즈니공주인형,공주인형,엘레나공주,디즈니인형",5
17632040,2019-11-07 23:59:54,111374346,,2019-11-07 23:58:30.799362,,0


### 상품 키워드 수에 따른 상품 수, 클릭 수

In [6]:
click_product_keyword = click_product.groupby(['keyword_count'], as_index=False).agg({'updated': 'count', 'pid': pd.Series.nunique})

click_product_keyword

Unnamed: 0,keyword_count,updated,pid
0,0,2143453,55943
1,1,1428653,45093
2,2,1339276,33459
3,3,2235481,48686
4,4,2608633,55607
5,5,7871652,163424
6,6,399,3
7,11,1706,1
8,17,1120,1
9,19,1669,1


In [23]:
click_product_keyword = click_product.dropna(subset=['keyword_click'], axis=0)

click_product_keyword

Unnamed: 0,updated,pid,keyword_click,create_date,keyword_product,keyword_count
87,2019-11-07 23:49:07,110946993,숏코트,2019-11-01 00:00:24.533452,"셀린자켓,by1031",2
91,2019-11-01 00:23:35,110947003,세븐틴,2019-11-01 00:00:32.055294,,0
92,2019-11-01 00:33:13,110947003,세븐틴,2019-11-01 00:00:32.055294,,0
97,2019-11-01 07:59:01,110947003,세븐틴,2019-11-01 00:00:32.055294,,0
108,2019-11-01 16:26:59,110947003,세븐틴,2019-11-01 00:00:32.055294,,0
...,...,...,...,...,...,...
17632011,2019-11-07 23:57:49,111331826,등산자켓,2019-11-07 13:02:11.602752,"에이글,등산복,방수자켓,등산자켓",4
17632018,2019-11-07 23:58:25,111330502,발망청바지,2019-11-07 12:38:43.847310,,0
17632033,2019-11-07 23:59:03,111330369,발망청바지,2019-11-07 12:36:07.342882,,0
17632034,2019-11-07 23:59:23,111363377,구스,2019-11-07 21:29:29.043651,"패딩,구스패딩,구스,잠바,점퍼",5


In [28]:
click_by_pid = click_product_keyword.groupby(['keyword_count'], as_index=False).agg({'updated' : 'count'})

click_by_pid

Unnamed: 0,keyword_count,updated
0,0,501755
1,1,277432
2,2,266093
3,3,448583
4,4,545411
5,5,1531356
6,6,29
7,11,51
8,17,12
9,19,22


In [25]:
click_by_pid_keyword = click_by_pid.groupby(['keyword_count'], as_index=False).agg({'pid': 'count', 'updated': 'sum'})

click_by_pid_keyword

Unnamed: 0,keyword_count,pid,updated
0,0,40694,501755
1,1,29761,277432
2,2,24621,266093
3,3,37511,448583
4,4,43816,545411
5,5,126912,1531356
6,6,3,29
7,11,1,51
8,17,1,12
9,19,1,22


In [26]:

def matching_keyword(row):
    
    result = 0
    
    for each_text in row['keyword_click']:
    
        if each_text in row['keyword_product']:
            result += 1
            
    if result > 0:
        return 1
    else:
        return None

click_product_keyword['click_by_keyword'] = click_product_keyword.apply(matching_keyword, axis=1)

click_product_keyword

TypeError: ("argument of type 'float' is not iterable", 'occurred at index 91')

In [18]:
click_by_pid = click_product_keyword.groupby(['pid', 'keyword_count'], as_index=False).agg({'click_by_keyword' : 'count'})

click_by_pid

Unnamed: 0,pid,keyword_count,click_by_keyword
0,110946976,1,11
1,110946977,4,3
2,110946980,5,13
3,110946981,4,4
4,110946983,5,133
...,...,...,...
262622,111374234,5,1
262623,111374247,3,0
262624,111374267,4,0
262625,111374268,3,1


In [19]:
click_by_pid_keyword = click_by_pid.groupby(['keyword_count'], as_index=False).agg({'pid': 'count', 'click_by_keyword': 'sum'})

click_by_pid_keyword

Unnamed: 0,keyword_count,pid,click_by_keyword
0,1,29761,169061
1,2,24621,227509
2,3,37511,415581
3,4,43816,517742
4,5,126912,1462926
5,6,3,29
6,11,1,51
7,17,1,12
8,19,1,22


In [26]:
click_by_pid_keyword = click_by_pid[click_by_pid['click_by_keyword'] > 0 ]

click_by_pid_keyword

Unnamed: 0,pid,click_by_keyword
0,110946976,11
1,110946977,3
2,110946980,13
3,110946981,4
4,110946983,133
...,...,...
262619,111374145,1
262621,111374225,1
262622,111374234,1
262625,111374268,1


In [8]:

def matching_full_keyword(row):
            
    if row['keyword_click'] in row['keyword_product']:
        return 1
    else:
        return None

click_product_keyword['click_by_full_keyword'] = click_product_keyword.apply(matching_full_keyword, axis=1)

click_product_keyword

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


Unnamed: 0,updated,pid,keyword_click,create_date,keyword_product,keyword_count,click_by_full_keyword
87,2019-11-07 23:49:07,110946993,숏코트,2019-11-01 00:00:24.533452,"셀린자켓,by1031",2,
304,2019-11-01 00:24:14,110946997,베이지 숏패딩,2019-11-01 00:00:26.454526,100105,2,
310,2019-11-01 01:58:38,110946997,베이지 패딩,2019-11-01 00:00:26.454526,100105,2,
312,2019-11-01 03:20:30,110946997,숏패딩 베이지,2019-11-01 00:00:26.454526,100105,2,
313,2019-11-01 07:13:19,110946997,숏패딩,2019-11-01 00:00:26.454526,100105,2,
...,...,...,...,...,...,...,...
17631995,2019-11-07 23:56:47,111371167,아이폰11,2019-11-07 23:14:50.332713,"아이폰8,중고폰,가개통,A급,아이폰11",5,1.0
17631997,2019-11-07 23:56:52,111371447,장식품,2019-11-07 23:18:36.557912,"골동품,수집품,장식품,희귀품,은",5,1.0
17632011,2019-11-07 23:57:49,111331826,등산자켓,2019-11-07 13:02:11.602752,"에이글,등산복,방수자켓,등산자켓",4,1.0
17632034,2019-11-07 23:59:23,111363377,구스,2019-11-07 21:29:29.043651,"패딩,구스패딩,구스,잠바,점퍼",5,1.0


In [15]:
click_product_keyword[click_product_keyword['keyword_count'] > 5]

Unnamed: 0,updated,pid,keyword_click,create_date,keyword_product,keyword_count,click_by_full_keyword
10492979,2019-11-04 14:48:29,111142339,라인프렌즈 스마트폰,2019-11-04 14:23:51.057981,"셀잇,가성비,중고폰,스마트폰,갤럭시,노트,휴대폰,최신형,중고,키즈폰,어린이폰,학생폰...",19,
10492988,2019-11-04 15:15:31,111142339,스마트폰,2019-11-04 14:23:51.057981,"셀잇,가성비,중고폰,스마트폰,갤럭시,노트,휴대폰,최신형,중고,키즈폰,어린이폰,학생폰...",19,1.0
10493119,2019-11-04 18:39:29,111142339,스마트폰,2019-11-04 14:23:51.057981,"셀잇,가성비,중고폰,스마트폰,갤럭시,노트,휴대폰,최신형,중고,키즈폰,어린이폰,학생폰...",19,1.0
10493198,2019-11-04 20:34:57,111142339,휴대폰,2019-11-04 14:23:51.057981,"셀잇,가성비,중고폰,스마트폰,갤럭시,노트,휴대폰,최신형,중고,키즈폰,어린이폰,학생폰...",19,1.0
10493359,2019-11-05 00:57:46,111142339,휴대폰,2019-11-04 14:23:51.057981,"셀잇,가성비,중고폰,스마트폰,갤럭시,노트,휴대폰,최신형,중고,키즈폰,어린이폰,학생폰...",19,1.0
...,...,...,...,...,...,...,...
15933563,2019-11-07 18:36:36,111285882,갤럭시북,2019-11-06 18:00:05.390318,"태블릿,안드로이드,윈도우,갤럭시북12.0,갤럭시 북12.0,갤럭시 북 12.0",6,1.0
15933572,2019-11-07 19:53:00,111285882,갤럭시북,2019-11-06 18:00:05.390318,"태블릿,안드로이드,윈도우,갤럭시북12.0,갤럭시 북12.0,갤럭시 북 12.0",6,1.0
15933581,2019-11-07 19:53:21,111285882,갤럭시북,2019-11-06 18:00:05.390318,"태블릿,안드로이드,윈도우,갤럭시북12.0,갤럭시 북12.0,갤럭시 북 12.0",6,1.0
15933590,2019-11-07 22:10:23,111285882,갤럭시북 12.0,2019-11-06 18:00:05.390318,"태블릿,안드로이드,윈도우,갤럭시북12.0,갤럭시 북12.0,갤럭시 북 12.0",6,


In [11]:
click_by_pid = click_product_keyword.groupby(['pid', 'keyword_count'], as_index=False).agg({'click_by_full_keyword' : 'count'})

click_by_pid

Unnamed: 0,pid,keyword_count,click_by_full_keyword
0,110946976,1,0
1,110946977,4,3
2,110946980,5,8
3,110946981,4,4
4,110946983,5,106
...,...,...,...
262622,111374234,5,1
262623,111374247,3,0
262624,111374267,4,0
262625,111374268,3,1


In [12]:
click_by_pid_keyword = click_by_pid.groupby(['keyword_count'], as_index=False).agg({'pid' : 'count', 'click_by_full_keyword': 'sum'})

click_by_pid_keyword

Unnamed: 0,keyword_count,pid,click_by_full_keyword
0,1,29761,83335
1,2,24621,122254
2,3,37511,221021
3,4,43816,267096
4,5,126912,728214
5,6,3,15
6,11,1,9
7,17,1,3
8,19,1,14


In [13]:
click_by_pid_keyword = click_by_pid[click_by_pid['click_by_full_keyword'] > 0 ]

click_by_pid_keyword

Unnamed: 0,pid,keyword_count,click_by_full_keyword
1,110946977,4,3
2,110946980,5,8
3,110946981,4,4
4,110946983,5,106
5,110946984,5,1
...,...,...,...
262619,111374145,5,1
262621,111374225,5,1
262622,111374234,5,1
262625,111374268,3,1


In [14]:
click_by_pid_keyword_count = click_by_pid_keyword.groupby(['keyword_count'], as_index=False).agg({'pid' : 'count', 'click_by_full_keyword': 'sum'})

click_by_pid_keyword_count

Unnamed: 0,keyword_count,pid,click_by_full_keyword
0,1,13299,83335
1,2,17775,122254
2,3,30261,221021
3,4,35808,267096
4,5,99768,728214
5,6,3,15
6,11,1,9
7,17,1,3
8,19,1,14


In [23]:
click_product_keyword.drop(['click_by_keyword'], axis=1, inplace=True)

click_product_keyword

Unnamed: 0,updated,pid,keyword_click,create_date,keyword_product
87,2019-11-07 23:49:07,110946993,숏코트,2019-11-01 00:00:24.533452,"셀린자켓,by1031"
304,2019-11-01 00:24:14,110946997,베이지 숏패딩,2019-11-01 00:00:26.454526,100105
310,2019-11-01 01:58:38,110946997,베이지 패딩,2019-11-01 00:00:26.454526,100105
312,2019-11-01 03:20:30,110946997,숏패딩 베이지,2019-11-01 00:00:26.454526,100105
313,2019-11-01 07:13:19,110946997,숏패딩,2019-11-01 00:00:26.454526,100105
...,...,...,...,...,...
17631995,2019-11-07 23:56:47,111371167,아이폰11,2019-11-07 23:14:50.332713,"아이폰8,중고폰,가개통,A급,아이폰11"
17631997,2019-11-07 23:56:52,111371447,장식품,2019-11-07 23:18:36.557912,"골동품,수집품,장식품,희귀품,은"
17632011,2019-11-07 23:57:49,111331826,등산자켓,2019-11-07 13:02:11.602752,"에이글,등산복,방수자켓,등산자켓"
17632034,2019-11-07 23:59:23,111363377,구스,2019-11-07 21:29:29.043651,"패딩,구스패딩,구스,잠바,점퍼"
