In [1]:
import pandas as pd
import json
from scipy import stats
import numpy as np
import re

### 브랜드 리스트

In [2]:
## Brand dictionary -> one case-insensitive regex with an alternative per brand.
read_path = '../csv/dict_brand.csv'
df_brand = pd.read_csv(read_path)

brand_list = df_brand['brand'].values.tolist()

# Every name is escaped so it matches literally; the lookarounds reject hits
# that are glued to another word character on either side.
brand_compile = re.compile(
    r"(?<!\w)(?:{})(?!\w)".format('|'.join(map(re.escape, brand_list))),
    re.IGNORECASE,
)

### 사이즈 리스트

In [3]:
# Size tokens: numeric sizes, letter sizes (xxs ... 4xl) and 'free'.
# Each entry must be followed by a comma: adjacent string literals are
# concatenated by Python, which previously merged 'm' and 'l' into 'ml'.
size_list = ['220', '225', '230', '235', '240', '245', '250', '255', '260', '265', '270', '275',
             '280', '285', '290', '295', '300',
             'xxs', 'xs', 's', 'm', 'l', 'xl', 'xxl', '2xl', '3xl', '4xl',
             '80', '85', '90', '95', '100', '105', '110', '115', '120',
             '22', '24', '26', '28', '30', '32', '34', '36', '38', '40', '42',
             '33', '44', '55', '66', '77', '88', '99',
             'free']

# A size only counts when it stands alone, i.e. is not attached to another
# word character on either side; matching ignores letter case.
size_compile = re.compile(r"(?<!\w)(?:{})(?!\w)".format('|'.join([re.escape(x) for x in size_list])), re.IGNORECASE)

In [4]:
# Footwear category words (not referenced by the cells visible in this notebook).
category_list = [
    '슬리퍼', '운동화', '안전화', '농구화', '롱부츠', '구두',
    '축구화', '로퍼', '신발', '남성구두', '여성구두', '하이힐',
    '블로퍼', '부츠', '등산화', '워커', '런닝화', '러닝화',
]

### 패션 잡화(400) 검색

In [9]:
read_path = '../csv/item_click_400_1902_2001.csv'

# Load the click log, force both text columns to str, and discard rows whose
# keyword is the literal placeholder 'undefined'.
# NOTE(review): astype(str) turns a missing keyword into the string 'nan' -
# confirm that keeping such rows is intended.
click_400 = (
    pd.read_csv(read_path)
    .astype({'category_id': str, 'keyword': str})
    .loc[lambda frame: frame['keyword'] != 'undefined']
)

click_400

Unnamed: 0,category_id,name,keyword,click_count
0,400020300,비즈니스가방,남성서류가방,8105
1,400040300,캐주얼화,꼼데 컨버스,33489
2,400010500,백팩,mcm백팩,54210
3,400040999,기타(운동화),280,60050
4,400010100,숄더백,클루니,2051
...,...,...,...,...
6269242,400081999,기타(주얼리/액세서리),폴라로이드카메라,1
6269243,400040999,기타(운동화),에어포스1 07 검흰,1
6269244,400051400,로퍼/슬립온,메이저,1
6269245,400010999,기타(여성가방),동,1


In [10]:
# Number of non-null keyword values left after the 'undefined' rows were removed.
click_400['keyword'].count()

6269151

In [64]:
# Collapse the log to one row per keyword carrying its total click count.
click_400_keyword = (
    click_400
    .groupby('keyword', as_index=False)
    .agg({'click_count': 'sum'})
)

# Grand total of clicks across all keywords.
click_400_keyword['click_count'].sum()

254405505

In [65]:
# Keep keywords with at least 1200 clicks, most-clicked first.
click_400_keyword_more_1200 = (
    click_400_keyword
    .loc[click_400_keyword['click_count'] >= 1200]
    .sort_values('click_count', ascending=False)
)

# Clicks covered by the retained keywords.
click_400_keyword_more_1200['click_count'].sum()

202219475

In [66]:
# search brand & size

def search_brand(row):
    """Return the first brand name found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``brand_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): this function is redefined identically in later cells.
    match = brand_compile.search(row['keyword'])
    return match.group() if match else ''

def search_size(row):
    """Return the first size token found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``size_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): this function is redefined identically in later cells.
    match = size_compile.search(row['keyword'])
    return match.group() if match else ''

# Annotate each keyword row with the brand and size token found in it ('' when none).
click_400_keyword_more_1200['brand'] = (
    click_400_keyword_more_1200.apply(search_brand, axis='columns').astype(str)
)
click_400_keyword_more_1200['size'] = (
    click_400_keyword_more_1200.apply(search_size, axis='columns').astype(str)
)

click_400_keyword_more_1200

Unnamed: 0,keyword,click_count,brand,size
1376397,시계,2144802,,
1889394,카드지갑,1260332,,
335775,구찌,1218827,구찌,
1829267,지갑,1105901,,
308116,골든구스,1078017,골든구스,
...,...,...,...,...
1365188,스파이더 양말,1200,스파이더,
507924,나이키 터프,1200,나이키,
2065727,티파니다이아반지,1200,,
1659414,원스타 270,1200,,270


In [75]:
# count string length

def count_len(string):
    """Return how many characters of ``string`` are not the space character ' '."""
    return sum(1 for ch in string if ch != ' ')

# Space-free character counts of the keyword and of its extracted brand / size.
click_400_keyword_more_1200['keyword_len'] = (
    click_400_keyword_more_1200['keyword'].map(count_len)
)
click_400_keyword_more_1200['brand_len'] = (
    click_400_keyword_more_1200['brand'].map(count_len)
)
click_400_keyword_more_1200['size_len'] = (
    click_400_keyword_more_1200['size'].map(count_len)
)

click_400_keyword_more_1200

Unnamed: 0,keyword,click_count,brand,size,keyword_len,brand_len,size_len
1376397,시계,2144802,,,2,0,0
1889394,카드지갑,1260332,,,4,0,0
335775,구찌,1218827,구찌,,2,2,0
1829267,지갑,1105901,,,2,0,0
308116,골든구스,1078017,골든구스,,4,4,0
...,...,...,...,...,...,...,...
1365188,스파이더 양말,1200,스파이더,,6,4,0
507924,나이키 터프,1200,나이키,,5,3,0
2065727,티파니다이아반지,1200,,,8,0,0
1659414,원스타 270,1200,,270,6,0,3


In [76]:
# Write the annotated keyword table to CSV (header row included, index omitted).
save_path = '../csv/result/click_400_keyword_more_1200.csv'
click_400_keyword_more_1200.to_csv(save_path, index=False)

### 신발(400040, 400050, 400051) 검색

In [69]:
# Shoe categories: category_id prefixes 400040, 400050, 400051
click_shoes = click_400.loc[
    lambda frame: frame['category_id'].str[:6].isin(['400040', '400050', '400051'])
]
click_shoes

Unnamed: 0,category_id,name,keyword,click_count
1,400040300,캐주얼화,꼼데 컨버스,33489
3,400040999,기타(운동화),280,60050
5,400051999,기타(남성화),업템포,20512
7,400040300,캐주얼화,골든구스40,10547
8,400051400,로퍼/슬립온,필립플레인,3145
...,...,...,...,...
6269233,400040300,캐주얼화,발렌시아가 트리플s 43 고퀄,1
6269239,400050999,기타(여성화),나이키 데이브레이크 250,1
6269241,400050200,하이힐(8cm이상),가죽구두 225,1
6269243,400040999,기타(운동화),에어포스1 07 검흰,1


In [70]:
# Collapse the shoe rows to one row per keyword carrying its total click count.
click_shoes_keyword = (
    click_shoes
    .groupby('keyword', as_index=False)
    .agg({'click_count': 'sum'})
)

# Grand total of clicks across all shoe keywords.
click_shoes_keyword['click_count'].sum()

93534064

In [71]:
# Keep keywords with at least 120 clicks, most-clicked first.
click_shoes_keyword_more_120 = (
    click_shoes_keyword
    .loc[click_shoes_keyword['click_count'] >= 120]
    .sort_values('click_count', ascending=False)
)

# Clicks covered by the retained keywords.
click_shoes_keyword_more_120['click_count'].sum()

84479904

In [72]:
# search brand & size

def search_brand(row):
    """Return the first brand name found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``brand_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): duplicates the search_brand defined in an earlier cell.
    match = brand_compile.search(row['keyword'])
    return match.group() if match else ''

def search_size(row):
    """Return the first size token found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``size_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): duplicates the search_size defined in an earlier cell.
    match = size_compile.search(row['keyword'])
    return match.group() if match else ''

# Annotate each keyword row with the brand and size token found in it ('' when none).
click_shoes_keyword_more_120['brand'] = (
    click_shoes_keyword_more_120.apply(search_brand, axis='columns').astype(str)
)
click_shoes_keyword_more_120['size'] = (
    click_shoes_keyword_more_120.apply(search_size, axis='columns').astype(str)
)

click_shoes_keyword_more_120

Unnamed: 0,keyword,click_count,brand,size
137144,골든구스,1069710,골든구스,
774467,이지부스트,984333,,
190832,나이키,744674,나이키,
477561,발렌시아가 트리플s,562867,발렌시아가,
660915,알렉산더맥퀸,422225,알렉산더맥퀸,
...,...,...,...,...
486566,발렌티노 화이트,120,발렌티노,
388401,루이비통 트레이너 스니커즈,120,루이비통,
310719,뉴워리어스,120,,
83753,ck 첼시,120,첼시,


In [73]:
# count string length

def count_len(string):
    """Return the length of ``string`` with every ' ' character left out."""
    # NOTE(review): duplicates the count_len defined in another cell.
    return len([ch for ch in string if ch != ' '])

# Space-free character counts of the keyword and of its extracted brand / size.
click_shoes_keyword_more_120['keyword_len'] = (
    click_shoes_keyword_more_120['keyword'].map(count_len)
)
click_shoes_keyword_more_120['brand_len'] = (
    click_shoes_keyword_more_120['brand'].map(count_len)
)
click_shoes_keyword_more_120['size_len'] = (
    click_shoes_keyword_more_120['size'].map(count_len)
)

click_shoes_keyword_more_120

Unnamed: 0,keyword,click_count,brand,size,keyword_len,brand_len,size_len
137144,골든구스,1069710,골든구스,,4,4,0
774467,이지부스트,984333,,,5,0,0
190832,나이키,744674,나이키,,3,3,0
477561,발렌시아가 트리플s,562867,발렌시아가,,9,5,0
660915,알렉산더맥퀸,422225,알렉산더맥퀸,,6,6,0
...,...,...,...,...,...,...,...
486566,발렌티노 화이트,120,발렌티노,,7,4,0
388401,루이비통 트레이너 스니커즈,120,루이비통,,12,4,0
310719,뉴워리어스,120,,,5,0,0
83753,ck 첼시,120,첼시,,4,2,0


In [74]:
# Write the annotated shoe keyword table to CSV (header row included, index omitted).
save_path = '../csv/result/click_shoes_keyword_more_120.csv'
click_shoes_keyword_more_120.to_csv(save_path, index=False)

### 여성의류(310) 검색

In [5]:
read_path = '../csv/item_click_310_1902_2001.csv'

# Load the click log, force both text columns to str, and discard rows whose
# keyword is the literal placeholder 'undefined'.
# NOTE(review): astype(str) turns a missing keyword into the string 'nan' -
# confirm that keeping such rows is intended.
click_310 = (
    pd.read_csv(read_path)
    .astype({'category_id': str, 'keyword': str})
    .loc[lambda frame: frame['keyword'] != 'undefined']
)

click_310

Unnamed: 0,category_id,name,keyword,click_count
0,310110050,무스탕,여성의류,314
1,310140050,하이웨스트 진,니트팬츠,8
2,310090030,패딩,여성라쿤롱패딩,75
3,310090030,패딩,숏패딩,416315
4,310120010,캐주얼 원피스,옷 처분,409
...,...,...,...,...
5576037,310170999,기타(레깅스),트레이닝 일괄,1
5576038,310070040,롱 가디건,여자빅,1
5576039,310060010,라운드넥 니트,타미 롱패딩,1
5576040,310200999,기타(언더웨어/속옷),실버브라,1


In [6]:
# Number of non-null keyword values left after the 'undefined' rows were removed.
click_310['keyword'].count()

5575869

In [92]:
# Collapse the log to one row per keyword carrying its total click count.
click_310_keyword = (
    click_310
    .groupby('keyword', as_index=False)
    .agg({'click_count': 'sum'})
)

# Grand total of clicks across all keywords.
click_310_keyword['click_count'].sum()

122591461

In [95]:
# Keep keywords with at least 1200 clicks, most-clicked first.
click_310_keyword_more_1200 = (
    click_310_keyword
    .loc[click_310_keyword['click_count'] >= 1200]
    .sort_values('click_count', ascending=False)
)

# Clicks covered by the retained keywords.
click_310_keyword_more_1200['click_count'].sum()

91149161

In [96]:
# search brand & size

def search_brand(row):
    """Return the first brand name found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``brand_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): duplicates the search_brand defined in an earlier cell.
    match = brand_compile.search(row['keyword'])
    return match.group() if match else ''

def search_size(row):
    """Return the first size token found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``size_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): duplicates the search_size defined in an earlier cell.
    match = size_compile.search(row['keyword'])
    return match.group() if match else ''

# Annotate each keyword row with the brand and size token found in it ('' when none).
click_310_keyword_more_1200['brand'] = (
    click_310_keyword_more_1200.apply(search_brand, axis='columns').astype(str)
)
click_310_keyword_more_1200['size'] = (
    click_310_keyword_more_1200.apply(search_size, axis='columns').astype(str)
)

click_310_keyword_more_1200

Unnamed: 0,keyword,click_count,brand,size
1090780,원피스,1553446,,
464733,롱원피스,613048,,
708622,블라우스,604841,,
1472709,핸드메이드코트,575284,,
428572,럭키슈에뜨,569453,럭키슈에뜨,
...,...,...,...,...
954966,양털 가디건,1201,,
568538,무스너클정품,1200,,
558185,몽클레어 트레이닝,1200,몽클레어,
796114,소라블라우스,1200,,


In [97]:
# count string length

def count_len(string):
    """Return the number of characters in ``string`` other than ' '."""
    # NOTE(review): duplicates the count_len defined in another cell.
    spaces = string.count(' ')
    total = len(string)
    return total - spaces

# Space-free character counts of the keyword and of its extracted brand / size.
click_310_keyword_more_1200['keyword_len'] = (
    click_310_keyword_more_1200['keyword'].map(count_len)
)
click_310_keyword_more_1200['brand_len'] = (
    click_310_keyword_more_1200['brand'].map(count_len)
)
click_310_keyword_more_1200['size_len'] = (
    click_310_keyword_more_1200['size'].map(count_len)
)

click_310_keyword_more_1200

Unnamed: 0,keyword,click_count,brand,size,keyword_len,brand_len,size_len
1090780,원피스,1553446,,,3,0,0
464733,롱원피스,613048,,,4,0,0
708622,블라우스,604841,,,4,0,0
1472709,핸드메이드코트,575284,,,7,0,0
428572,럭키슈에뜨,569453,럭키슈에뜨,,5,5,0
...,...,...,...,...,...,...,...
954966,양털 가디건,1201,,,5,0,0
568538,무스너클정품,1200,,,6,0,0
558185,몽클레어 트레이닝,1200,몽클레어,,8,4,0
796114,소라블라우스,1200,,,6,0,0


In [98]:
# Write the annotated keyword table to CSV (header row included, index omitted).
save_path = '../csv/result/click_310_keyword_more_1200.csv'
click_310_keyword_more_1200.to_csv(save_path, index=False)

### 남성의류(320) 검색

In [7]:
read_path = '../csv/item_click_320_1902_2001.csv'

# Load the click log, force both text columns to str, and discard rows whose
# keyword is the literal placeholder 'undefined'.
# NOTE(review): astype(str) turns a missing keyword into the string 'nan' -
# confirm that keeping such rows is intended.
click_320 = (
    pd.read_csv(read_path)
    .astype({'category_id': str, 'keyword': str})
    .loc[lambda frame: frame['keyword'] != 'undefined']
)

click_320

Unnamed: 0,category_id,name,keyword,click_count
0,320070400,패딩 조끼,몽클레어 조끼,6722
1,320080200,패딩 점퍼,대장급,14962
2,320030200,후드 티셔츠,앤더슨벨,16943
3,320030100,맨투맨 티셔츠,스파이더,118078
4,320090999,기타(자켓),타미 후리스,2310
...,...,...,...,...
5597390,320100100,모직 코트,누빔 셔츠,1
5597391,320090300,가죽 자켓,가죽 코모,1
5597392,320120999,기타(면/캐주얼 팬츠),레드 펑크,1
5597393,320050200,브이넥 니트,몽클레어 니트패딩 레플,1


In [8]:
# Number of non-null keyword values left after the 'undefined' rows were removed.
click_320['keyword'].count()

5597270

In [85]:
# Collapse the log to one row per keyword carrying its total click count.
click_320_keyword = (
    click_320
    .groupby('keyword', as_index=False)
    .agg({'click_count': 'sum'})
)

# Grand total of clicks across all keywords.
click_320_keyword['click_count'].sum()

176525135

In [99]:
# Keep keywords with at least 1200 clicks, most-clicked first.
click_320_keyword_more_1200 = (
    click_320_keyword
    .loc[click_320_keyword['click_count'] >= 1200]
    .sort_values('click_count', ascending=False)
)

# Clicks covered by the retained keywords.
click_320_keyword_more_1200['click_count'].sum()

140128712

In [100]:
# search brand & size

def search_brand(row):
    """Return the first brand name found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``brand_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): duplicates the search_brand defined in an earlier cell.
    match = brand_compile.search(row['keyword'])
    return match.group() if match else ''

def search_size(row):
    """Return the first size token found in ``row['keyword']``, or '' if none.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) with a string 'keyword' entry.

    Returns:
        The matched text exactly as it appears in the keyword, or an empty
        string when ``size_compile`` finds no match.
    """
    # Search once and reuse the match object rather than scanning the keyword twice.
    # NOTE(review): duplicates the search_size defined in an earlier cell.
    match = size_compile.search(row['keyword'])
    return match.group() if match else ''

# Annotate each keyword row with the brand and size token found in it ('' when none).
click_320_keyword_more_1200['brand'] = (
    click_320_keyword_more_1200.apply(search_brand, axis='columns').astype(str)
)
click_320_keyword_more_1200['size'] = (
    click_320_keyword_more_1200.apply(search_size, axis='columns').astype(str)
)

click_320_keyword_more_1200

Unnamed: 0,keyword,click_count,brand,size
1027761,스톤아일랜드,2751893,스톤아일랜드,
1035219,스톤아일랜드 맨투맨,1518771,스톤아일랜드,
760655,무스너클 버니스웨터,1242915,무스너클,
756879,무스너클,892398,무스너클,
1065422,스파이더,874207,스파이더,
...,...,...,...,...
1604620,패딩 블레이저,1200,,
687936,마르셀로불론 패딩,1200,마르셀로불론,
418795,남자바지.카고바지,1200,,
1312895,유니클로바지,1200,,


In [101]:
# count string length

def count_len(string):
    """Return how many characters of ``string`` remain once ' ' is excluded."""
    # NOTE(review): duplicates the count_len defined in another cell.
    return sum(ch != ' ' for ch in string)

# Space-free character counts of the keyword and of its extracted brand / size.
click_320_keyword_more_1200['keyword_len'] = (
    click_320_keyword_more_1200['keyword'].map(count_len)
)
click_320_keyword_more_1200['brand_len'] = (
    click_320_keyword_more_1200['brand'].map(count_len)
)
click_320_keyword_more_1200['size_len'] = (
    click_320_keyword_more_1200['size'].map(count_len)
)

click_320_keyword_more_1200

Unnamed: 0,keyword,click_count,brand,size,keyword_len,brand_len,size_len
1027761,스톤아일랜드,2751893,스톤아일랜드,,6,6,0
1035219,스톤아일랜드 맨투맨,1518771,스톤아일랜드,,9,6,0
760655,무스너클 버니스웨터,1242915,무스너클,,9,4,0
756879,무스너클,892398,무스너클,,4,4,0
1065422,스파이더,874207,스파이더,,4,4,0
...,...,...,...,...,...,...,...
1604620,패딩 블레이저,1200,,,6,0,0
687936,마르셀로불론 패딩,1200,마르셀로불론,,8,6,0
418795,남자바지.카고바지,1200,,,9,0,0
1312895,유니클로바지,1200,,,6,0,0


In [102]:
# Write the annotated keyword table to CSV (header row included, index omitted).
save_path = '../csv/result/click_320_keyword_more_1200.csv'
click_320_keyword_more_1200.to_csv(save_path, index=False)