# Ta feng Data

In [186]:
%load_ext autoreload
%autoreload 2

import os
import random
import pickle
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [187]:
# for reproducibility
def seed_everything(seed=1234):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    exports ``PYTHONHASHSEED`` to the process environment.

    Args:
        seed: Seed value passed to ``random.seed`` and ``np.random.seed``.
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # setting it here presumably only affects child processes, not the hash
    # ordering of the already-running kernel -- confirm if str-set ordering matters.
    os.environ['PYTHONHASHSEED'] = str(seed)


seed_everything()

In [188]:
# Locations of the raw CSV and of the pickled transaction list written later.
root_dir = "./data/ta_feng/"
csv_file = "ta_feng_all_months_merged.csv"
filename = "transaction.dat"

# NOTE(review): the displayed frame has an "Unnamed: 0" column, which looks like
# a saved index; consider index_col=0 if nothing relies on that column.
df = pd.read_csv(f"{root_dir}{csv_file}")

In [189]:
# Preview the raw purchase records (one row per purchased product line).
df

Unnamed: 0,TRANSACTION_DT,CUSTOMER_ID,AGE_GROUP,PIN_CODE,PRODUCT_SUBCLASS,PRODUCT_ID,AMOUNT,ASSET,SALES_PRICE
0,11/1/2000,1104905,45-49,115,110411,4710199010372,2,24,30
1,11/1/2000,418683,45-49,115,120107,4710857472535,1,48,46
2,11/1/2000,1057331,35-39,115,100407,4710043654103,2,142,166
3,11/1/2000,1849332,45-49,Others,120108,4710126092129,1,32,38
4,11/1/2000,1981995,50-54,115,100205,4710176021445,1,14,18
...,...,...,...,...,...,...,...,...,...
817736,2/28/2001,312790,35-39,114,530501,4713317035042,2,80,118
817737,2/28/2001,57486,40-44,115,530209,4710731060124,1,40,55
817738,2/28/2001,733526,>65,Unknown,510539,4716340052307,1,78,115
817739,2/28/2001,173704,45-49,115,520457,4714276145315,1,90,96


In [190]:
# Distinct customer ids appearing in the raw records.
customer_id = {cid for cid in df["CUSTOMER_ID"]}

## Change Data structure
Each record is a two-element list, one record per customer per transaction date: \
`[[customer_id, {product_id, ...}], ...]`

In [191]:
%time
import tqdm

data = []
trans = []  # for removing stop word items
n_trans = 0 # before removing fewer trans and item 
cnt = 0

for i in tqdm.tqdm(customer_id):
    customer = df[df["CUSTOMER_ID"] == i]
    trans_date = list(set(customer["TRANSACTION_DT"]))
    n_trans += len(trans_date)
    # print("custormer ID    :", i, "\n" \
    #       "transaction date:", trans_date, len(trans_date))

    for d in trans_date:
        product = customer[customer["TRANSACTION_DT"] == d]
        trans.append(list(product["PRODUCT_ID"]))
        # data.append((i, set(product["PRODUCT_ID"])))
        data.append([i, set(product["PRODUCT_ID"])])

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 3.34 µs


## Transaction from 0 to 3

In [192]:
# Show the first three [customer_id, item-set] records.
data[:3]

[[1441802, {4710549000039, 4712048022017}],
 [1441802, {4714981010038}],
 [1703948,
  {20246037,
   20341992,
   2250271000683,
   4710049003608,
   4710063021091,
   4710088434500,
   4710094014765,
   4710094020179,
   4710114606048,
   4710126092730,
   4710162000126,
   4710207010226,
   4710221682782,
   4710357302905,
   4710363604000,
   4710363732000,
   4710466103080,
   4710570002606,
   4710626622857,
   4710908111116,
   4711045228101,
   4711713030531,
   4714499363053,
   4714957234147,
   4901301036056,
   4901301043979,
   4902430492416,
   4902430493024,
   4902511003418,
   4973271967807,
   8999118200100}]]

## Statistics class for the dataset

In [193]:
class stat():
    """Summary statistics over a list of transactions.

    Each element ``d`` of ``data`` is indexed as ``d[0]`` (the user id) and
    ``d[1]`` (the collection of item ids in that transaction). ``data`` is
    stored by reference, not copied.
    """
    def __init__(self, data):
        self.data = data

    def n_data(self):
        """
        The number of transactions
        """
        return len(self.data)

    def get_itemset(self):
        """
        Return a new set containing every item that occurs in any transaction
        """
        itemset = set()
        for d in self.data:
            # update() mutates only the fresh accumulator, never the stored baskets
            itemset.update(d[1])
        return itemset

    def n_item(self, itemset=None):
        """
        The number of items

        If ``itemset`` is given its length is returned as-is; otherwise the
        item set is rebuilt from the data first.
        """
        if itemset is None:
            itemset = self.get_itemset()
        return len(itemset)

    def a_item(self):
        """
        Average number of items in a transaction

        Returns 0.0 when there are no transactions (instead of dividing by zero).
        """
        n_transactions = self.n_data()
        if n_transactions == 0:
            return 0.0
        return sum(len(d[1]) for d in self.data) / n_transactions

    def get_usrset(self):
        """
        Return the set of distinct users
        """
        return {d[0] for d in self.data}

    def n_usr(self, usrset=None):
        """
        The number of users

        If ``usrset`` is given its length is returned as-is; otherwise the
        user set is rebuilt from the data first.
        """
        if usrset is None:
            usrset = self.get_usrset()
        return len(usrset)

    def show(self):
        """
        Print all statistics
        """
        print(f"""
        # transactions: {self.n_data()}
        # items       : {self.n_item()}
        Ave item      : {self.a_item()}
        # users       : {self.n_usr()}
        """)

In [194]:
# Wrap the raw transaction list so its summary statistics can be queried below.
s = stat(data)

### # transactions

In [195]:
# Number of (customer, date) transactions before any filtering.
s.n_data()

119578

### # items

In [196]:
# Build the item vocabulary once and reuse it for the count, so the set is
# not rebuilt a second time inside n_item().
itemset = s.get_itemset()
s.n_item(itemset)

23812

### # users

In [197]:
# Number of distinct customers before any filtering.
s.n_usr()

32266

### Average # item in a transaction

In [198]:
# Mean number of distinct items per transaction before any filtering.
s.a_item()

6.8385572596966

## Preprocess

### 1. Remove stop word items

#### Count items

In [199]:
# Occurrence count per item over `trans`. `trans` holds raw purchase rows, so
# an item listed twice in the same basket is counted twice.
item_dct = dict.fromkeys(itemset, 0)
for basket in trans:
    for product in basket:
        item_dct[product] += 1

#### Threshold

In [200]:
# Stop-word threshold: 5% of the number of transactions, truncated to an int.
# Items counted more often than this are removed in the next cell.
th = int(0.05*len(data))
th

5978

In [201]:
# "Stop word" items: those whose occurrence count exceeds the threshold `th`.
removed_items = {
    product
    for product, occurrences in item_dct.items()
    if occurrences > th
}
print("The number of removed items:", len(removed_items),
      "\nRemoved items \n", removed_items)

The number of removed items: 2 
Removed items 
 {4714981010038, 4711271000014}


#### Remove stop word item from itemset

In [202]:
# Drop the stop-word items from the item vocabulary (in place) and report
# the vocabulary size before and after.
n_before = len(itemset)
itemset.difference_update(removed_items)
print("Before remove:", n_before)
print("After remove :", len(itemset))

Before remove: 23812
After remove : 23810


#### Remove stop word item from transaction

In [203]:
# Strip the stop-word items from every basket in place.
# NOTE(review): the parallel list `trans` is not updated here.
for record in data:
    record[1].difference_update(removed_items)

### 2. Remove transactions that contain fewer than 3 items

In [204]:
print(f"""
Before
# Data : {len(data)}
# Items: {len(itemset)}
""")

# Keep only baskets with at least 3 distinct items; records become tuples here.
data = [(record[0], record[1]) for record in data if len(record[1]) >= 3]

# Rebuild the item vocabulary from the surviving baskets.
itemset = set()
for record in data:
    itemset.update(record[1])

print(f"""
After
# Data : {len(data)}
# Items: {len(itemset)}
""")


Before
# Data : 119578
# Items: 23810


After
# Data : 85277
# Items: 23598



#### Verify that the item set matches the filtered transactions

In [205]:
# Sanity check: the vocabulary rebuilt from `data` has the same size as `itemset`.
tmp = set()
for record in data:
    tmp.update(record[1])

n_rebuilt, n_itemset = len(tmp), len(itemset)
print(n_rebuilt, n_itemset)
assert n_rebuilt == n_itemset
del tmp

23598 23598


### 3. Remove users with fewer than 3 transactions

#### Count # transactions per user

In [206]:
# Number of remaining transactions per user.
ucnt = {}
for record in data:
    user = record[0]
    ucnt[user] = ucnt.get(user, 0) + 1

#### Remove users

In [207]:
# Keep only the transactions of users who have at least 3 of them.
data = [(record[0], record[1]) for record in data if ucnt[record[0]] >= 3]

### Check Stat

In [208]:
# Recompute and print the summary statistics on the filtered transactions.
s = stat(data)
s.show()


        # transactions: 61991
        # items       : 22326
        Ave item      : 8.6352857672888
        # users       : 10599
        


## Convert list to tuple

In [209]:
# Normalise every record to a (user, item-set) tuple before pickling.
# (After the filter cells above the records are already tuples.)
data = [(record[0], record[1]) for record in data]

## Save dataset 

In [210]:
# Pickle the filtered transactions. The context manager closes (and therefore
# flushes) the file even if dumping fails; the original never closed the handle.
with open(root_dir+filename, "wb") as filehandler:
    pickle.dump(data, filehandler)

## Load dataset from pickle

In [211]:
import pickle
filename = "transaction.dat"
root_dir = "./data/ta_feng/"

# SECURITY: pickle.load can execute arbitrary code; only load files that this
# notebook (or another trusted source) produced.
# The context manager closes the file; the original never closed the handle.
with open(root_dir+filename,"rb") as f:
    data = pickle.load(f)

In [212]:
# Item lists (one per transaction) rebuilt from the loaded data.
trans = [list(record[1]) for record in data]
len(trans)

61991

## Make triple dataset for <br> BASKET-SENSITIVE FACTORIZATION MACHINE(BFM)

### Split dataset for train, test and validation based on user
Ratio is \
train : test : valid = 6 : 2 : 2

# <span style="color: red; ">TODO:</span> split based on user

In [213]:
# dataset ratio
r_train = 0.6
r_test = 0.2
r_valid = 0.2

最初にuserがkeyでtransactionのlistがvalueの辞書型を作ってsplitしたほうがいい?\
とりあえずやってみる

#### Itemset per user(customer)
`usr_dct`はkeyがuserでvalueがそのuserが購入したitemの集合が入っている． \
だから，`itemset - usr_dct[usr]`でusrが購入していないアイテム集合を取得できる．これをNegative sampleを作るときに利用する．

In [214]:
import copy

# usr_dct[user] = union of all items that user bought in any transaction.
# Each user starts from a *fresh* set: `|=` / update() modify the left-hand set
# in place, so storing the basket object from `data` itself would make later
# unions silently grow that basket inside `data`.
usr_dct = {}
for record in data:
    usr_dct.setdefault(record[0], set()).update(record[1])

n_usr = len(usr_dct)
print(n_usr)

10599


In [215]:
# udata[user] = list of that user's baskets, in the order they appear in `data`.
udata = {usr: [] for usr in usr_dct}

for record in data:
    udata[record[0]].append(record[1])

Check that no user has fewer than 3 transactions (`udata[user]` is that user's list of transactions).

In [216]:
# Count users that have fewer than 3 transactions; there must be none.
cnt = sum(1 for baskets in udata.values() if len(baskets) < 3)
assert cnt==0, "Invalid transaction"

### Split based on user

In [217]:
import copy
import math

# Per-user split of each user's basket list into train / test / valid.
# Baskets are taken in the order they are stored in udata[user].
train, test, valid = {}, {}, {}

for user, baskets in udata.items():
    total = len(baskets)
    if total == 3:
        # Exactly three baskets: one for each split.
        n_train, n_test = 1, 1
    else:
        n_train = math.ceil(total * r_train)
        # The remainder is halved between test and valid, with test rounded up.
        # NOTE(review): for total == 4 this gives 3 / 1 / 0, i.e. an empty
        # validation list; 0.5 is hard-coded rather than derived from
        # r_test / r_valid.
        n_test = math.ceil((total - n_train) * 0.5)

    test_end = n_train + n_test
    # Deep copies keep the split sets independent of the sets held in udata.
    train[user] = copy.deepcopy(baskets[:n_train])
    test[user] = copy.deepcopy(baskets[n_train:test_end])
    valid[user] = copy.deepcopy(baskets[test_end:])

Check that the sizes of the three splits add up to the total number of transactions.

In [218]:
# Total number of baskets across the three splits must equal len(data).
cnt = (
    sum(len(baskets) for baskets in train.values())
    + sum(len(baskets) for baskets in test.values())
    + sum(len(baskets) for baskets in valid.values())
)

assert cnt==len(data), f"Invalid split, before split: {len(data)}, after split: {cnt}"

### Calculate pmi
We only need PMI for the train dataset, so we calculate the train dataset's PMI from the original dataset (the variable `data`).

### Make negative sample
Negative sampleはPositive sampleをベースに作られている．\
1つのPositive sampleにつき2つ作る．つまりNegative sampleはPositive sampleの数の2倍になる．\
Negative sampleはベースとなっているアイテム数(|B|+v)と同じになっている．\
Negative sampleのアイテムはそのUserが購入していないアイテムで構成される．

In [219]:
import tqdm

# For every training basket of a user, draw two negative baskets of the same
# size from the items that user never bought (itemset - usr_dct[usr]).
# NOTE(review): `itemset` is whatever an earlier cell left in the kernel; it is
# not rebuilt after `data` is reloaded from the pickle -- confirm it is current.
neg_train = {usr:[] for usr in udata.keys()}
for usr, baskets in tqdm.tqdm(train.items()):
    # The candidate pool depends only on the user, so build it once per user.
    # It is materialised as a list because random.sample() requires a sequence
    # (passing a set is deprecated since Python 3.9 and an error from 3.11).
    neg_candidates = list(itemset - usr_dct[usr])

    for basket in baskets:
        # item length
        l_item = len(basket)

        # Sample 2*l_item distinct items and cut them into two negative baskets.
        neg_data = random.sample(neg_candidates, k=l_item*2)
        neg_train[usr].append(set(neg_data[:l_item]))
        neg_train[usr].append(set(neg_data[l_item:]))

100%|██████████| 10599/10599 [00:30<00:00, 343.69it/s]


In [220]:
# Count positive (train) and negative baskets. Generator expressions are used
# so that no loop variable named `trans` is left behind to overwrite the
# global transaction list.
l_train = sum(len(baskets) for baskets in train.values())
l_neg = sum(len(baskets) for baskets in neg_train.values())

assert 2*l_train==l_neg, \
       f"""The number of negative sample should be 2*train.
                But got train sample:{l_train}, negative sample:{l_neg}"""

##### Save negative sample dataset

In [221]:
# Pickle the negative samples. The context manager closes (and therefore
# flushes) the file; the original never closed the handle.
with open(root_dir+"negative_sample.dat", "wb") as filehandler:
    pickle.dump(neg_train, filehandler)

---
---
---
---

Alternative: apply the filters while building the transaction list

In [6]:
%time
data = []
trans = []  # for removing stop word items
n_trans = 0 # before removing fewer trans and item 
cnt = 0

for i in customer_id:
    customer = df[df["CUSTOMER_ID"] == i]
    trans_date = list(set(customer["TRANSACTION_DT"]))
    n_trans += len(trans_date)
    # print("custormer ID    :", i, "\n" \
    #       "transaction date:", trans_date, len(trans_date))

    # Remove users whose transactions are fewer than 3.
    if len(trans_date) < 4:
        cnt += 1
        continue

    for d in trans_date:
        product = customer[customer["TRANSACTION_DT"] == d]
        
        # Store only transactions which include more than 2 items.
        if len(set(product["PRODUCT_ID"])) > 2:
            trans.append(list(product["PRODUCT_ID"]))
            # data.append((i, set(product["PRODUCT_ID"])))
            data.append([i, set(product["PRODUCT_ID"])])
        else:
            cnt += 1

CPU times: user 0 ns, sys: 1 µs, total: 1 µs
Wall time: 3.1 µs


## split data using sklearn func (not based on user)

In [53]:
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
seed=1234

# Shuffle the (user, item-set) records, then carve off 40% and halve it,
# giving train : test : valid = 6 : 2 : 2 regardless of user.
# NOTE: this rebinds `train`, `test` and `valid` to lists of records.
data = shuffle(data, random_state=seed)

train, holdout = train_test_split(data, test_size=0.4, random_state=seed)
test, valid = train_test_split(holdout, test_size=0.5, random_state=seed)

In [54]:
# Sizes of the three record-level splits.
len(train), len(test), len(valid)

(37194, 12398, 12399)

---
---
---
---

#### Adjacency matrix

In [27]:
import tqdm

def make_adj(num_items, itemset, transactions):
    """Count ordered item co-occurrences over a collection of transactions.

    For every transaction ``t`` and every pair of positions ``i < j`` in it,
    the entry ``[index(t[i]), index(t[j])]`` is incremented by one, where
    ``index`` is the item's (first) position in ``itemset``. Only that one
    direction is counted; the matrix is not symmetrised. Items that are not in
    ``itemset`` are skipped, and the remaining items of the transaction are
    still processed.

    Args:
        num_items: Side length of the square result matrix; every index
            produced from ``itemset`` must be smaller than this.
        itemset: Sequence of item ids; an item's position is its row/column.
        transactions: Iterable of item-id sequences.

    Returns:
        ``np.ndarray`` of shape ``(num_items, num_items)`` and dtype int32.
    """
    # tqdm only draws a progress bar; fall back to plain iteration without it.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable):
            return iterable

    # item -> first index in `itemset`. A dict lookup replaces the repeated
    # `in` / `.index()` scans of the list, each of which was O(len(itemset)).
    index_of = {}
    for idx, item in enumerate(itemset):
        index_of.setdefault(item, idx)

    # Accumulate directly in int32 (the returned dtype) so counts stay exact.
    adjMatrix = np.zeros((num_items, num_items), dtype=np.int32)
    for t in progress(transactions):
        # Skip items that are not in `itemset`. The previous `break` dropped
        # every later item of the transaction as well.
        known = [index_of[item] for item in t if item in index_of]
        for pos, idx1 in enumerate(known):
            for idx2 in known[pos+1:]:
                adjMatrix[idx1, idx2] += 1

    return adjMatrix

##### Train itemset

In [28]:
# Vocabulary of the training split; the list position of an item is used as
# its row/column index in the adjacency matrix.
train_vocab = set()
for record in train:
    train_vocab.update(record[1])

train_itemset = list(train_vocab)
n_train_item = len(train_itemset)
print(n_train_item)

19865


##### Whole transaction is `trans`

In [29]:
# Build the co-occurrence matrix over all transactions in `trans`, indexed by
# the training vocabulary (the recorded run below took roughly 19 minutes).
adj = make_adj(n_train_item, train_itemset, trans)

100%|██████████| 59191/59191 [19:38<00:00, 50.24it/s]  


##### Save adjacency matrix as numpy object

In [31]:
# Show the matrix and store it under root_dir (np.save appends ".npy"
# when the file name has no such extension).
print(adj)
np.save(f"{root_dir}adj", adj)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [116]:
# First record of the training split: (customer_id, item-set).
train[0]

(293044,
 {20570125,
  4710043100082,
  4710731070192,
  4712172200091,
  4714008271008,
  4719864067116,
  5998710132430,
  5998710132461})

In [105]:
# A list comprehension appears to be faster than map/filter here.
# ref: https://utgwkk.hateblo.jp/entry/2017/03/09/154314

# Customer whose transactions are extracted in the next cell (id 1835021).
# (The bare `%time` that used to sit here timed nothing and was removed.)
custormer_id = 1835021
# Alternatives that were tried, kept for reference:
# target_custormer = lambda x: x[0]==custormer_id
# u_data = map(lambda x: x[1] if x[0]==custormer_id else None, trans_from_pickle)
# u_data = list(filter(target_custormer, trans_from_pickle))

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.44 µs


In [106]:
%time
target_tran = [i[1] for i in data if i[0]==custormer_id]

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs


In [107]:
# Item-sets of the selected customer, one per transaction.
target_tran

[{4710094004711,
  4710105002019,
  4710105031613,
  4710114361114,
  4710421090059,
  4710543282318,
  4711390436916,
  4713045519333,
  4715140032311,
  4716447022203},
 {3228021990293,
  4710088434593,
  4710105031613,
  4710114362029,
  4710121023302,
  4714220680091,
  4714220680107},
 {20554705,
  29000070295,
  4710105031613,
  4710205005750,
  4710731060124,
  4710740600090,
  4710740600106,
  8851954102126},
 {20412074, 4710063312168, 4710088433268},
 {4710015102946,
  4710431324236,
  4710431330282,
  4710431397056,
  4713691062900,
  4714005052013,
  4715190002999,
  4715833010893,
  4715833014310}]