# Ta Feng Data

In [41]:
%load_ext autoreload
%autoreload 2

import os
import random
import pickle
import numpy as np
import pandas as pd

# Raise pandas' display limits to 100 columns / 100 rows so that wide frames
# are shown in full in cell output instead of being elided.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [42]:
# for reproducibility
def seed_everything(seed=1234):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator,
    and exports ``PYTHONHASHSEED`` into ``os.environ``.

    NOTE: ``PYTHONHASHSEED`` is read when the interpreter starts, so setting
    it here does not change string hashing in the already-running kernel
    (it is only inherited by child processes).

    Args:
        seed: Integer seed applied to every generator (default 1234).
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything()

In [43]:
# Locations of the input CSV and of the pickle written/read later on.
root_dir = "./data/ta_feng/"
csv_file = "ta_feng_all_months_merged.csv"
filename = "transaction.dat"  # pickle file name used when saving/loading `data`

# Read the merged transaction log into a single DataFrame.
csv_path = os.path.join(root_dir, csv_file)
df = pd.read_csv(csv_path)

In [44]:
df

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18
...,...,...,...,...,...,...,...,...,...
817736,2/28/2001,312790,35-39,114,530501,4713317035042,2,80,118
817737,2/28/2001,57486,40-44,115,530209,4710731060124,1,40,55
817738,2/28/2001,733526,>65,Unknown,510539,4716340052307,1,78,115
817739,2/28/2001,173704,45-49,115,520457,4714276145315,1,90,96


In [45]:
# Distinct values of the CUSTOMER_ID column; iterated when building the dataset.
customer_id = set(df["CUSTOMER_ID"])

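## Build dataset
The data structure is a list of records, one per customer and purchase date: \
[(customer id, transaction), ...], where each transaction is the set of product IDs bought on that date.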

In [46]:
%time
data = []
trans = []  # for removing stop word items
n_trans = 0 # before removing fewer trans and item 
cnt = 0

for i in customer_id:
    customer = df[df["CUSTOMER_ID"] == i]
    trans_date = list(set(customer["TRANSACTION_DT"]))
    n_trans += len(trans_date)
    # print("custormer ID    :", i, "\n" \
    #       "transaction date:", trans_date, len(trans_date))
    
    if len(trans_date) < 4:
        cnt += 1
        continue

    for d in trans_date:
        product = customer[customer["TRANSACTION_DT"] == d]
        
        # Remove users whose transactions are fewer than 3.
        if len(set(product["PRODUCT_ID"])) > 2:
            trans.append(list(product["PRODUCT_ID"]))
            # data.append((i, set(product["PRODUCT_ID"])))
            data.append([i, set(product["PRODUCT_ID"])])
        else:
            cnt += 1

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.53 µs


## First three transactions

In [47]:
data[0:3]

[[1835021,
  {4710094004711,
   4710105002019,
   4710105031613,
   4710114361114,
   4710421090059,
   4710543282318,
   4711390436916,
   4713045519333,
   4715140032311,
   4716447022203}],
 [1835021,
  {3228021990293,
   4710088434593,
   4710105031613,
   4710114362029,
   4710121023302,
   4714220680091,
   4714220680107}],
 [1835021,
  {20554705,
   29000070295,
   4710105031613,
   4710205005750,
   4710731060124,
   4710740600090,
   4710740600106,
   8851954102126}]]

## Number of transactions

In [48]:
len(data)

59191

## Number of items

In [100]:
# Collect every distinct product id that occurs in any transaction of `data`.
itemset = set()
for _, basket in data:
    itemset.update(basket)

n_item = len(itemset)
print(n_item)

21878


## Number of removals (dropped customers + dropped transactions)

In [50]:
cnt

46373

## Average number of items per transaction

In [51]:
# Mean basket size: total number of items over all transactions,
# divided by the number of transactions.
total_items = sum(len(basket) for _, basket in data)
mean = total_items / len(data)
mean

8.334865097734452

## Remove stop words

### Counting items

In [52]:
# Start every known item at zero, then tally each occurrence found in the
# raw per-transaction product lists stored in `trans`.
dct = dict.fromkeys(itemset, 0)
for basket in trans:
    for product_id in basket:
        dct[product_id] += 1

### Threshold

In [53]:
# Frequency cut-off: 5% of the number of transactions in `data`, truncated to int.
th = int(0.05*len(data))
th

2959

In [54]:
# Items whose occurrence count exceeds the threshold are treated as stop words.
removed_items = {
    product_id
    for product_id, freq in dct.items()
    if freq > th
}
print(len(removed_items), removed_items)

2 {4714981010038, 4711271000014}


### Remove

#### Remove from itemset

In [101]:
# Drop the stop-word items from the item vocabulary (in place) and report
# the vocabulary size before and after.
size_before = len(itemset)
itemset.difference_update(removed_items)
print("Before remove:", size_before)
print("After remove :", len(itemset))

Before remove: 21878
After remove : 21878


#### Remove from transaction

In [75]:
# Strip the stop-word items from every basket, mutating each set in place.
# difference_update() does not assign back into the record, so the cell stays
# re-runnable even after the records have been converted from lists to tuples
# (`record[1] -= ...` would raise TypeError on a tuple record).
for _, basket in data:
    basket.difference_update(removed_items)

#### Convert list to tuple

In [96]:
# Freeze each [customer id, basket] record into a (customer id, basket) tuple.
data = [(customer, basket) for customer, basket in data]

In [97]:
len(data)

59191

In [98]:
# Keep only the transactions that still contain at least 3 items.
data = [
    (customer, basket)
    for customer, basket in data
    if len(basket) >= 3
]

In [99]:
len(data)

58237

#### Verify that the items were removed

In [79]:
# Recompute the item vocabulary from the final `data`.
remaining_items = set()
for _, basket in data:
    remaining_items |= basket

# Compare contents rather than sizes: no stop-word item may survive in any basket.
assert remaining_items.isdisjoint(removed_items), \
    "stop-word items are still present in data"

# Dropping short transactions can remove the last occurrence of an item, so
# keep `itemset` / `n_item` in sync with what is actually left in `data`
# instead of requiring the earlier item-count cell to be re-run by hand.
itemset = remaining_items
n_item = len(itemset)

## Save dataset 

In [80]:
# Write `data` to disk. The context manager closes (and therefore flushes)
# the file, so the pickle is complete before it is read back later.
with open(root_dir+filename, "wb") as filehandler:
    pickle.dump(data, filehandler)

## Load dataset from pickle

In [94]:
# Read the processed dataset back from disk; the context manager closes the
# file handle once loading is done.
# NOTE: only unpickle files you produced yourself - pickle.load can execute
# arbitrary code from an untrusted file.
with open(root_dir+filename, "rb") as f:
    data = pickle.load(f)

In [95]:
# Rebuild `trans` as plain product-id lists, one per record in `data`.
trans = [list(basket) for _, basket in data]

## Build the triple dataset for <br> Basket-Sensitive Factorization Machine (BFM)

### Split dataset for train, test and validation
Ratio is \
train : test : valid = 6 : 2 : 2

In [117]:
from sklearn.model_selection import train_test_split
seed=1234

# First hold out 40% of the records, then cut that hold-out in half:
# train : test : valid = 6 : 2 : 2.
train, holdout = train_test_split(data, test_size=0.4, random_state=seed)
test, valid = train_test_split(holdout, test_size=0.5, random_state=seed)

In [118]:
len(train), len(test), len(valid)

(34942, 11647, 11648)

### Calculate PMI
We only need PMI for the training set, so we calculate the training-set PMI from the original dataset (the variable `data`).

---
---

In [116]:
train[0]

(293044,
 {20570125,
  4710043100082,
  4710731070192,
  4712172200091,
  4714008271008,
  4719864067116,
  5998710132430,
  5998710132461})

In [105]:
# List comprehensions appear to be faster than map/filter here.
# ref: https://utgwkk.hateblo.jp/entry/2017/03/09/154314

# Example customer: CUSTOMER_ID == 1835021
# NOTE(review): a bare `%time` line has no statement attached, so the timing
# printed for this cell does not measure the code below.
%time
custormer_id = 1835021
# Earlier map/filter attempts, kept for reference:
# target_custormer = lambda x: x[0]==custormer_id
# u_data = map(lambda x: x[1] if x[0]==custormer_id else None, trans_from_pickle)
# u_data = list(filter(target_custormer, trans_from_pickle))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [106]:
%time
target_tran = [i[1] for i in data if i[0]==custormer_id]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [107]:
target_tran

[{4710094004711,
  4710105002019,
  4710105031613,
  4710114361114,
  4710421090059,
  4710543282318,
  4711390436916,
  4713045519333,
  4715140032311,
  4716447022203},
 {3228021990293,
  4710088434593,
  4710105031613,
  4710114362029,
  4710121023302,
  4714220680091,
  4714220680107},
 {20554705,
  29000070295,
  4710105031613,
  4710205005750,
  4710731060124,
  4710740600090,
  4710740600106,
  8851954102126},
 {20412074, 4710063312168, 4710088433268},
 {4710015102946,
  4710431324236,
  4710431330282,
  4710431397056,
  4713691062900,
  4714005052013,
  4715190002999,
  4715833010893,
  4715833014310}]