# Ta feng Data

In [18]:
%load_ext autoreload
%autoreload 2

import os
import random
import pickle
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
# for reproducibility
def seed_everything(seed=1234):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)
seed_everything()

In [2]:
filename = "transaction.dat"
root_dir = "./data/ta_feng/"
csv_file = "ta_feng_all_months_merged.csv"

df = pd.read_csv(root_dir+csv_file)

In [38]:
df

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18
...,...,...,...,...,...,...,...,...,...
817736,2/28/2001,312790,35-39,114,530501,4713317035042,2,80,118
817737,2/28/2001,57486,40-44,115,530209,4710731060124,1,40,55
817738,2/28/2001,733526,>65,Unknown,510539,4716340052307,1,78,115
817739,2/28/2001,173704,45-49,115,520457,4714276145315,1,90,96


In [39]:
customer_id = set(df["CUSTOMER_ID"])

## Save dataset
Data structure is \
[(customer id, transaction),...]

In [60]:
data = []
trans = []  # for removing stop word items
n_trans = 0 # before removing fewer trans and item 
cnt = 0

for i in customer_id:
    customer = df[df["CUSTOMER_ID"] == i]
    trans_date = list(set(customer["TRANSACTION_DT"]))
    n_trans += len(trans_date)
    # print("custormer ID    :", i, "\n" \
    #       "transaction date:", trans_date, len(trans_date))
    
    if len(trans_date) < 4:
        cnt += 1
        continue

    for d in trans_date:
        product = customer[customer["TRANSACTION_DT"] == d]
        if len(set(product["PRODUCT_ID"])) > 2:
            trans.append(list(product["PRODUCT_ID"]))
            data.append((i, set(product["PRODUCT_ID"])))
        else:
            cnt += 1

## Transaction from 0 to 3

In [74]:
trans_from_pickle[0:3]

[(1835021,
  {4710094004711,
   4710105002019,
   4710105031613,
   4710114361114,
   4710421090059,
   4710543282318,
   4711390436916,
   4713045519333,
   4715140032311,
   4716447022203}),
 (1835021,
  {20554705,
   29000070295,
   4710105031613,
   4710205005750,
   4710731060124,
   4710740600090,
   4710740600106,
   8851954102126}),
 (1835021, {20412074, 4710063312168, 4710088433268})]

## # transactions

In [75]:
len(trans_from_pickle)

59191

## # items

In [76]:
itemset = set()
for i in trans_from_pickle:
    itemset |= i[1]
    
print(len(itemset))
n_item = len(itemset)

21886


## # removed transactions

In [77]:
cnt

46373

## Average # item in a transaction

In [78]:
mean = 0
for i in trans_from_pickle:
    mean += len(i[1])
    
mean /= len(trans_from_pickle)
mean

8.334865097734452

## Remove stop words

### Make adjacency matrix

In [83]:
def make_adj(num_items, itemset, transactions):
    adjMatrix = np.zeros((num_items, num_items), dtype=np.float128)

    for t in tqdm.tqdm(transactions):
        for i, item1 in enumerate(t):
            # sub_t = t[i:] # こっちは対角成分が自分の出現回数になる
            sub_t = t[i+1:] # こっちはただの隣接行列
            idx1 = itemset.index(item1)
            for item2 in sub_t:
                idx2 = itemset.index(item2)
                adjMatrix[idx1][idx2] += 1

    return adjMatrix.astype(np.int32)

In [84]:
adj = make_adj(n_item, itemset, trans)

MemoryError: Unable to allocate array with shape (21886, 21886) and data type float128

## Save dataset 

In [8]:
filehandler = open(root_dir+filename, "wb") 
pickle.dump(trans, filehandler)

## Load dataset from pickle

In [3]:
f = open(root_dir+filename,"rb")
trans_from_pickle = pickle.load(f)

## Make triple dataset for <br> BASKET-SENSITIVE FACTORIZATION MACHINE(BFM)

In [33]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(trans_from_pickle, test_size=0.4, random_state=1234)

In [34]:
len(train), len(test)

(35514, 23677)

In [35]:
train[0]

(1561241,
 {31200441058,
  31200441157,
  748675116250,
  4710032501692,
  4710032502064,
  4710032502828,
  4710054135202,
  4710186081088,
  4710603011018,
  4971883201098,
  8851111202652})

In [15]:
# 内包表記のほうが速いっぽい
# ref: https://utgwkk.hateblo.jp/entry/2017/03/09/154314

# custormer id == 1835021
%time
custormer_id = 1835021
# target_custormer = lambda x: x[0]==custormer_id
# u_data = map(lambda x: x[1] if x[0]==custormer_id else None, trans_from_pickle)
# u_data = list(filter(target_custormer, trans_from_pickle))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.01 µs


In [51]:
%time
target_tran = [i[1] for i in trans_from_pickle if i[0]==custormer_id]

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.96 µs


In [53]:
target_tran

[{4710094004711,
  4710105002019,
  4710105031613,
  4710114361114,
  4710421090059,
  4710543282318,
  4711390436916,
  4713045519333,
  4715140032311,
  4716447022203},
 {20554705,
  29000070295,
  4710105031613,
  4710205005750,
  4710731060124,
  4710740600090,
  4710740600106,
  8851954102126},
 {20412074, 4710063312168, 4710088433268},
 {4710015102946,
  4710431324236,
  4710431330282,
  4710431397056,
  4713691062900,
  4714005052013,
  4715190002999,
  4715833010893,
  4715833014310},
 {3228021990293,
  4710088434593,
  4710105031613,
  4710114362029,
  4710121023302,
  4714220680091,
  4714220680107}]