# Turning the Taobao dataset into the datasets used for CKGAT

The process will be as follows : 
1. Creating the encodings for each of the users, items, relations, and items/categories
2. Creating the knowledge graph by relating each of the items to their categories and vice versa
3. Creating the train and test set
    - For this one specifically, we will create two different versions -- one with sparser connections, obtained by only including certain product interactions. The less sparse graph will include more edges because it includes more items.


## Encoding 

In [1]:
## importing the necessary libraries for the Amazon data set
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch_geometric.datasets import AmazonBook

In [3]:
## Load the raw Taobao behaviour log. The CSV is read with header=None, so
## pandas labels the columns 0-4 until the names below are attached.
taobao_columns = ['user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp']

taobao_df = pd.read_csv("taobao/raw/UserBehavior.csv", header=None)
print(taobao_df.head())  # preview with the default integer column labels

taobao_df.columns = taobao_columns
taobao_df.head()

   0        1        2   3           4
0  1  2268318  2520377  pv  1511544070
1  1  2333346  2520771  pv  1511561733
2  1  2576651   149192  pv  1511572885
3  1  3830808  4181361  pv  1511593493
4  1  4365585  2520377  pv  1511596146


Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
0,1,2268318,2520377,pv,1511544070
1,1,2333346,2520771,pv,1511561733
2,1,2576651,149192,pv,1511572885
3,1,3830808,4181361,pv,1511593493
4,1,4365585,2520377,pv,1511596146


In [11]:
## Report the max, min and number of distinct values for each id column,
## followed by the total number of rows in the raw log.
for id_column in ('user_id', 'item_id', 'category_id'):
    id_label = id_column.replace('_', ' ')
    id_values = taobao_df[id_column]
    print(f'{id_label} max:', id_values.max())
    print(f'{id_label} min:', id_values.min())
    print(f'{id_label} unique num:', len(id_values.unique()))

print(len(taobao_df))


user id max: 1018011
user id min: 1
user id unique num: 987994
item id max: 5163070
item id min: 1
item id unique num: 4162024
category id max: 5162429
category id min: 80
category id unique num: 9439
100150807


In [14]:
## Collect the distinct user, item and category ids and wrap each set in its
## own single-column DataFrame.
user_ids, item_ids, category_ids = (
    taobao_df[id_column].unique()
    for id_column in ('user_id', 'item_id', 'category_id')
)

user_df = pd.DataFrame({'user_id': user_ids})
item_df = pd.DataFrame({'item_id': item_ids})
category_df = pd.DataFrame({'category_id': category_ids})

# Show the first rows and the row count of each frame (users, items, categories).
for unique_df in (user_df, item_df, category_df):
    print(unique_df.head())
    print(len(unique_df))

   user_id
0        1
1      100
2     1000
3  1000001
4  1000004
987994
   item_id
0  2268318
1  2333346
2  2576651
3  3830808
4  4365585
4162024
   category_id
0      2520377
1      2520771
2       149192
3      4181361
4      2735466
9439


In [None]:
## will now shuffle the entries to make the numberings more random
## A fixed seed is used so that re-running the notebook reproduces the same
## row order (and therefore the same id mappings derived from it later).
SHUFFLE_SEED = 42

# reset_index() (without drop=True) keeps the pre-shuffle row position in an
# 'index' column; the next cell removes that column again.
user_df = user_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index()
item_df = item_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index()
category_df = category_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index()

print(user_df.head())
print(item_df.head())
print(category_df.head())

    index  user_id
0  818361   211596
1  158926   821582
2  189325   962532
3  608909    16129
4  585633   969003
     index  item_id
0  1920924  1793912
1  1657601  2395737
2  3833127  4867338
3   987263  1072235
4   397206   646728
   index  category_id
0   7335      4085091
1    655       235534
2   6456      4135923
3    707      1421739
4   7466      4649060


In [None]:
## Remove the 'index' column left behind by reset_index() in the shuffle step.
## errors='ignore' makes this cell safe to re-run: once the column is gone a
## second run is a no-op instead of raising KeyError.
item_df = item_df.drop(columns='index', errors='ignore')
user_df = user_df.drop(columns='index', errors='ignore')
category_df = category_df.drop(columns='index', errors='ignore')

In [25]:
## Preview the first rows of the item, user and category frames (in that order).
for shuffled_df in (item_df, user_df, category_df):
    print(shuffled_df.head())

   item_id
0  1793912
1  2395737
2  4867338
3  1072235
4   646728
   user_id
0   211596
1   821582
2   962532
3    16129
4   969003
   category_id
0      4085091
1       235534
2      4135923
3      1421739
4      4649060


In [None]:
## Sanity check: the max, min and distinct count of every id column should be
## unchanged by the shuffling.
frames_to_check = (
    (user_df, 'user_id'),
    (item_df, 'item_id'),
    (category_df, 'category_id'),
)
for checked_df, id_column in frames_to_check:
    id_label = id_column.replace('_', ' ')
    id_values = checked_df[id_column]
    print(f'{id_label} max:', id_values.max())
    print(f'{id_label} min:', id_values.min())
    print(f'{id_label} unique num:', len(id_values.unique()))


user id max: 1018011
user id min: 1
user id unique num: 987994
item id max: 5163070
item id min: 1
item id unique num: 4162024
category id max: 5162429
category id min: 80
category id unique num: 9439


In [31]:
## Give every user / item / category a contiguous integer id (0..n-1) taken
## from its current row position.
##
## Each frame is first reduced to its original id column and re-indexed from 0,
## so the cell produces the same result on a fresh kernel and when re-run
## (it no longer relies on an 'index' column left over from an earlier
## execution).
user_df = (
    user_df[['user_id']]
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'usermap_id'})
)

item_df = (
    item_df[['item_id']]
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'itemmap_id'})
)

category_df = (
    category_df[['category_id']]
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'categorymap_id'})
)

print(user_df.head())
print(user_df['usermap_id'].max())
print(item_df.head())
print(item_df['itemmap_id'].max())
print(category_df.head())
print(category_df['categorymap_id'].max())

   usermap_id  user_id
0           0   211596
1           1   821582
2           2   962532
3           3    16129
4           4   969003
987993
   itemmap_id  item_id
0           0  1793912
1           1  2395737
2           2  4867338
3           3  1072235
4           4   646728
4162023
   categorymap_id  category_id
0               0      4085091
1               1       235534
2               2      4135923
3               3      1421739
4               4      4649060
9438


In [32]:
## then creating the entity mappings
## Category ids are shifted so that they start right after the largest item id.
## The shift is computed relative to the current minimum category id, so
## re-running this cell leaves the ids where they are instead of adding the
## offset a second time.
first_category_map_id = item_df['itemmap_id'].max() + 1
category_df['categorymap_id'] = (
    category_df['categorymap_id']
    - category_df['categorymap_id'].min()
    + first_category_map_id
)

print(category_df.head())
print(category_df['categorymap_id'].min())
print(category_df['categorymap_id'].max())
print(len(category_df['categorymap_id'].unique()))

   categorymap_id  category_id
0         4162024      4085091
1         4162025       235534
2         4162026      4135923
3         4162027      1421739
4         4162028      4649060
4162024
4171462
9439


In [34]:
## Bring the item and category mappings onto a common schema
## (map_id, og_id, type) and stack them into one entity table.
item_df2 = (
    item_df
    .rename(columns={'itemmap_id': 'map_id', 'item_id': 'og_id'})
    .assign(type='item')
)
category_df2 = (
    category_df
    .rename(columns={'categorymap_id': 'map_id', 'category_id': 'og_id'})
    .assign(type='category')
)

# Items first, then categories; each part keeps its own row index.
entity_df = pd.concat([item_df2, category_df2])



In [35]:
## preview the first rows of the combined item + category entity table
entity_df.head()

Unnamed: 0,map_id,og_id,type
0,0,1793912,item
1,1,2395737,item
2,2,4867338,item
3,3,1072235,item
4,4,646728,item


In [37]:
## Summarise the entity table: range and distinct count of map_id, plus the
## number of rows per entity type.
entity_map_ids = entity_df['map_id']
entity_type_counts = entity_df.groupby('type')['og_id'].count().reset_index()

print('entity map id min', entity_map_ids.min())
print('entity map id max', entity_map_ids.max())
print('number of unique map id', len(entity_map_ids.unique()))
print('number of items', entity_type_counts)

entity map id min 0
entity map id max 4171462
number of unique map id 4171463
number of items        type    og_id
0  category     9439
1      item  4162024


In [42]:
## saving the user mapping, item mapping, entity mapping, and category mapping
## A file is only written when it does not exist yet: the reload cells below
## then work on a fresh checkout, and mappings saved by an earlier run are never
## overwritten. Delete a file to regenerate it from the current frames.
mapping_files = {
    'entity_list.csv': entity_df,
    'item_list.csv': item_df,
    'user_list.csv': user_df,
    'category_list.csv': category_df,
}
for mapping_file, mapping_df in mapping_files.items():
    if not Path(mapping_file).exists():
        mapping_df.to_csv(mapping_file, index=False)


In [47]:
## Read the entity mapping back from disk and print the same summary as for
## the in-memory table so the two can be compared.
entity_reload = pd.read_csv('entity_list.csv')

reloaded_map_ids = entity_reload['map_id']
reloaded_type_counts = entity_reload.groupby('type')['og_id'].count().reset_index()

print('entity map id min', reloaded_map_ids.min())
print('entity map id max', reloaded_map_ids.max())
print('number of unique map id', len(reloaded_map_ids.unique()))
print('number of items', reloaded_type_counts)
entity_reload.head()


entity map id min 0
entity map id max 4171462
number of unique map id 4171463
number of items        type    og_id
0  category     9439
1      item  4162024


Unnamed: 0,map_id,og_id,type
0,0,1793912,item
1,1,2395737,item
2,2,4867338,item
3,3,1072235,item
4,4,646728,item


In [44]:
## read item_list.csv back from disk and preview its first rows
item_reload = pd.read_csv('item_list.csv')
item_reload.head()

Unnamed: 0,itemmap_id,item_id
0,0,1793912
1,1,2395737
2,2,4867338
3,3,1072235
4,4,646728


In [45]:
## read user_list.csv back from disk and preview its first rows
user_reload = pd.read_csv('user_list.csv')
user_reload.head()

Unnamed: 0,usermap_id,user_id
0,0,211596
1,1,821582
2,2,962532
3,3,16129
4,4,969003


In [46]:
## read category_list.csv back from disk and preview its first rows
category_reload = pd.read_csv('category_list.csv')
category_reload.head()

Unnamed: 0,categorymap_id,category_id
0,4162024,4085091
1,4162025,235534
2,4162026,4135923
3,4162027,1421739
4,4162028,4649060


In [48]:
## The knowledge graph uses two relation types, one per direction of the
## item <-> category link; each gets a consecutive integer id starting at 0.
relation_list = ['item_to_category', 'category_to_item']
relation_id = list(range(len(relation_list)))

relation_columns = {
    'relation_type': relation_list,
    'relationmap_id': relation_id,
}
relation_df = pd.DataFrame(relation_columns)
print(relation_df)



      relation_type  relationmap_id
0  item_to_category               0
1  category_to_item               1


In [None]:
## Save the relation mapping. It is only written when the file does not exist
## yet, so the reload cell below works on a fresh checkout and a previously
## saved file is never overwritten.
if not Path('relation_list.csv').exists():
    relation_df.to_csv('relation_list.csv', index = False)

In [51]:
## read relation_list.csv back from disk and display it
relation_reload = pd.read_csv('relation_list.csv')
relation_reload

Unnamed: 0,relation_type,relationmap_id
0,item_to_category,0
1,category_to_item,1


# Creating the knowledge graph 

1. Checking if every product only has one category
2. If so, extract every unique pairing of an item and its category
3. Then add in the first relation type
4. Then flip the head/tail and then add in the second relation type. 
5. Concatenate the dataset

In [None]:
## Step 1: check whether every item belongs to exactly one category.
## (Replaces a placeholder lookup of an empty column name, which raised KeyError.)
item_category_df = taobao_df[['item_id', 'category_id']].drop_duplicates()

# After de-duplication each row is one distinct (item, category) pair, so an
# item that appears more than once here has more than one category.
categories_per_item = item_category_df['item_id'].value_counts()
print('unique (item, category) pairs:', len(item_category_df))
print('items with more than one category:', (categories_per_item > 1).sum())