## 모듈 실행

In [1]:
import zipfile, os, json, time
import pandas as pd

## 압축푸는 함수

In [2]:
def unzip(source_file, dest_path):
    with zipfile.ZipFile(source_file, 'r') as zf:
        zipInfo = zf.infolist()
        for member in zipInfo:
            try:
                member.filename = member.filename.encode('cp437').decode('euc-kr', 'ignore')
                zf.extract(member, dest_path)
            except:
                print(source_file)
                raise Exception('what?!')

## session table

In [3]:
def sessionTable(json_data,file_list):
    data = pd.DataFrame(columns = ['userId','sessionId','sessionDate','device','platform'])

    for j in range(len(json_data.get('sessions'))):
        userId = file_list.split('.')[0]+'_'+file_list.split('.')[1]
        sessionId = json_data.get('sessions')[j].get('sessionId')
        sessionDate = json_data.get('sessions')[j].get('sessionDate')
        device = json_data.get('sessions')[j].get('deviceCategory')
        platform = json_data.get('sessions')[j].get('platform')

        part_data = pd.DataFrame([userId,sessionId,sessionDate,device,platform],
                                  index = ['userId','sessionId','sessionDate','device','platform']).T
        data = pd.concat([data,part_data])
        data = data.reset_index().drop(['index'], axis=1)

    return data

## activity Table

In [4]:
def activityTable(json_data,file_list):
    data = pd.DataFrame(columns = ['userId','sessionId','transectionId','activityTime','source','medium','channelGrouping',\
                                   'campaign','keyword','hostname','landingPagePath','transectionRevenue'])

    for j in range(len(json_data.get('sessions'))):
        for z in range(len(json_data.get('sessions')[j].get('activities'))):

            if (json_data.get('sessions')[j].get('activities')[z].get('activityType') == 'ECOMMERCE'):
                if (json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('actionType') == 'PAYMENT'):
                    userId = file_list.split('.')[0]+'_'+file_list.split('.')[1]
                    sessionId = json_data.get('sessions')[j].get('sessionId')
                    activityTime = json_data.get('sessions')[j].get('activities')[z].get('activityTime')
                    source = json_data.get('sessions')[j].get('activities')[z].get('source')
                    medium = json_data.get('sessions')[j].get('activities')[z].get('medium')
                    channelGrouping = json_data.get('sessions')[j].get('activities')[z].get('channelGrouping')
                    campaign = json_data.get('sessions')[j].get('activities')[z].get('campaign')
                    keyword = json_data.get('sessions')[j].get('activities')[z].get('keyword')
                    hostname = json_data.get('sessions')[j].get('activities')[z].get('hostname')
                    landingPagePath = json_data.get('sessions')[j].get('activities')[z].get('landingPagePath')
                    transectionId = json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('transaction').\
                    get('transactionId')
                    transectionRevenue= json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('transaction').\
                    get('transactionRevenue')

                    part_data = pd.DataFrame([userId,sessionId,transectionId,activityTime,source,medium,channelGrouping,\
                                              campaign,keyword,hostname,landingPagePath,transectionRevenue],
                                              index = ['userId','sessionId','transectionId','activityTime','source','medium',\
                                                       'channelGrouping','campaign','keyword','hostname','landingPagePath',\
                                                       'transectionRevenue']).T

                    data = pd.concat([data,part_data])
                    data = data.reset_index().drop(['index'], axis=1)

    return data

## product table

In [5]:
def productTable(json_data,file_list):
    data = pd.DataFrame(columns = ['userId','sessionId','transectionId','productSku','productName','itemRevenue','productQuantity'])

    for j in range(len(json_data.get('sessions'))):
        for z in range(len(json_data.get('sessions')[j].get('activities'))):

            if (json_data.get('sessions')[j].get('activities')[z].get('activityType') == 'ECOMMERCE'):
                if (json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('actionType') == 'PAYMENT'):
                    userId = file_list.split('.')[0]+'_'+file_list.split('.')[1]
                    sessionId = json_data.get('sessions')[j].get('sessionId')
                    transectionId = json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('transaction').\
                    get('transactionId')

                    for p in range(len(json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('products'))):
                        productSku = json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('products')[p].\
                        get('productSku')
                        productName = json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('products')[p].\
                        get('productName')
                        itemRevenue = json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('products')[p].\
                        get('itemRevenue')
                        productQuantity = json_data.get('sessions')[j].get('activities')[z].get('ecommerce').get('products')[p].\
                        get('productQuantity')


                        part_data = pd.DataFrame([userId,sessionId,transectionId,productSku,productName,itemRevenue,productQuantity],
                                                  index = ['userId','sessionId','transectionId','productSku','productName',\
                                                           'itemRevenue','productQuantity']).T

                        data = pd.concat([data,part_data])
                        data = data.reset_index().drop(['index'], axis=1)

    return data

---

## 데이터 로드

### 1. JSON 압축풀기

In [6]:
unzip('data/json_revenue.zip','data/json_revenue')

### 2. 파일 리스트 생성 및 파일 수 확인

In [6]:
path_dir = 'data/json_revenue/json'

file_list = os.listdir(path_dir)

len(file_list)

2226

### 3. 테이블 생성

In [8]:
start = time.time()

session_data = pd.DataFrame()
activity_data = pd.DataFrame()
product_data = pd.DataFrame()


for i in range(0,len(file_list)):
    input_file_name = path_dir+"/"+file_list[i]
    with open(input_file_name, encoding='UTF8') as json_file:
        json_data = json.load(json_file)
    
    data1 = sessionTable(json_data,file_list[i])
    data2 = activityTable(json_data,file_list[i])
    data3 = productTable(json_data,file_list[i])
    
    session_data = pd.concat([session_data,data1]).reset_index().drop(['index'], axis=1)
    activity_data = pd.concat([activity_data,data2]).reset_index().drop(['index'], axis=1)
    product_data = pd.concat([product_data,data3]).reset_index().drop(['index'], axis=1)
    
print("time :", time.time() - start) 

time : 67.15347146987915


### 4. 데이터 검토

#### 1)세션 테이블

In [14]:
print(len(session_data))
session_data.head()

11888


Unnamed: 0,userId,sessionId,sessionDate,device,platform
0,689679556_1578719054,1580440083,2020-01-31,desktop,Windows
1,689679556_1578719054,1580193439,2020-01-28,desktop,Windows
2,689679556_1578719054,1578719060,2020-01-11,desktop,Windows
3,689679556_1578719054,1578719056,2020-01-11,desktop,Windows
4,475325885_1578657870,1578833601,2020-01-12,mobile,iOS


In [15]:
session_data['userId'].value_counts()

1259415449_1574252717    112
1256814883_1573715316    100
408015221_1573683192      92
1119202247_1573648860     75
754256883_1573644374      75
                        ... 
1451761988_1580441410      1
1975203679_1578493245      1
614331018_1578566750       1
549938494_1579324565       1
44630100_1579055070        1
Name: userId, Length: 2226, dtype: int64

In [18]:
session_data['sessionId'].value_counts()

1580453359    2
1579564216    2
1579532302    2
1578412603    2
1580112019    2
             ..
1579686816    1
1579124553    1
1579795693    1
1578718501    1
1579854953    1
Name: sessionId, Length: 11843, dtype: int64

In [27]:
session_data['sessionDate'].value_counts().sort_index()

2020-01-01    188
2020-01-02    219
2020-01-03    210
2020-01-04    231
2020-01-05    177
2020-01-06    215
2020-01-07    332
2020-01-08    551
2020-01-09    381
2020-01-10    345
2020-01-11    383
2020-01-12    312
2020-01-13    258
2020-01-14    382
2020-01-15    449
2020-01-16    408
2020-01-17    384
2020-01-18    496
2020-01-19    512
2020-01-20    584
2020-01-21    491
2020-01-22    401
2020-01-23    474
2020-01-24    362
2020-01-25    350
2020-01-26    307
2020-01-27    308
2020-01-28    543
2020-01-29    531
2020-01-30    592
2020-01-31    512
Name: sessionDate, dtype: int64

In [28]:
session_data['device'].value_counts()

mobile     11314
desktop      546
tablet        28
Name: device, dtype: int64

In [29]:
session_data['platform'].value_counts()

iOS           5994
Android       5342
Windows        522
Macintosh       26
BlackBerry       4
Name: platform, dtype: int64

#### 2) 활동 테이블

In [2]:
print(len(activity_data))
activity_data.head()
# 정보 유출 문제로 생략

In [32]:
activity_data['transectionId'].value_counts()

20200121-0000369     2
Pay-1578720495668    2
20200131-0000687     2
Pay-1578194505484    1
20200123-0000475     1
                    ..
Pay-1578455780110    1
20200118-0000867     1
20200129-0000096     1
Pay-1578381404298    1
20200105-0000421     1
Name: transectionId, Length: 2603, dtype: int64

In [33]:
activity_data['source'].value_counts()

naver                         1135
facebook                       469
(direct)                       447
instagram.com                  291
google                          74
review6.cre.ma                  36
facebook.com                    21
m.shopping.naver.com            20
IGShopping                      15
castbox.shopping.naver.com      15
m.pay.naver.com                 13
m.naver.com                     10
daum                             7
m.blog.naver.com                 7
naver.com                        7
tmpl.co.kr                       6
m.keep.naver.com                 6
order.pay.naver.com              4
cr2.shopping.naver.com           4
m.cafe.naver.com                 3
shopping.naver.com               2
cafe.naver.com                   2
cre.ma                           2
m.tmpl.co.kr                     2
pmon.navercorp.com               2
mup.mobilians.co.kr              1
bing                             1
m.bookmark.naver.com             1
ksmobile.inicis.com 

In [34]:
activity_data['medium'].value_counts()

cpc         644
organic     573
referral    458
(none)      447
social      378
display      91
Social       15
Name: medium, dtype: int64

In [35]:
activity_data['channelGrouping'].value_counts()

Social            800
Paid Search       633
Organic Search    573
Direct            447
Display           102
Referral           51
Name: channelGrouping, dtype: int64

In [38]:
activity_data['hostname'].value_counts()

m.tmpl.co.kr      2391
tmpl.co.kr         204
www.tmpl.co.kr      11
Name: hostname, dtype: int64

In [39]:
activity_data['transectionRevenue'].describe()

count      2606
unique      397
top       50000
freq        482
Name: transectionRevenue, dtype: int64

#### 3)제품 구매 테이블

In [1]:
print(len(product_data))
product_data.head()
# 정보 유출 문제로 생략

---

## 저장

In [57]:
session_data.to_excel('data/raw_data/raw_session_data.xlsx',index = False)
activity_data.to_excel('data/raw_data/raw_activity_data.xlsx',index = False)
product_data.to_excel('data/raw_data/raw_product_data.xlsx',index = False)