# Part two of data processing 

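This part covers only the knowledge graph creation: the Taobao item and category ids are mapped to their remapped entity ids, and the item–category links are written out as (head, relation, tail) triples.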

In [1]:
## importing the necessary libraries (this notebook processes the Taobao data; the cells shown below only use pandas)
import torch
import numpy as np
from torch_geometric.datasets import AmazonBook
import pandas as pd

In [2]:
## loading in the Taobao data
taobao_df = pd.read_csv("taobao/raw/UserBehavior.csv", header = None)
print(taobao_df.head())
taobao_df.columns = ['user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp']
taobao_df.head()

   0        1        2   3           4
0  1  2268318  2520377  pv  1511544070
1  1  2333346  2520771  pv  1511561733
2  1  2576651   149192  pv  1511572885
3  1  3830808  4181361  pv  1511593493
4  1  4365585  2520377  pv  1511596146


Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
0,1,2268318,2520377,pv,1511544070
1,1,2333346,2520771,pv,1511561733
2,1,2576651,149192,pv,1511572885
3,1,3830808,4181361,pv,1511593493
4,1,4365585,2520377,pv,1511596146


In [3]:
## loading in the various lists
category_df = pd.read_csv('category_list.csv')
entity_df = pd.read_csv('entity_list.csv')
item_df = pd.read_csv('item_list.csv')
relation_df = pd.read_csv('relation_list.csv')
user_df = pd.read_csv('user_list.csv')

In [4]:
print(category_df.head())
print(entity_df.head())
print(item_df.head())
print(relation_df.head())
print(user_df.head())

   categorymap_id  category_id
0         4162024      1003418
1         4162025      1047470
2         4162026      2819583
3         4162027      1363318
4         4162028      2560150
   map_id    og_id  type
0       0  4910621  item
1       1  4847970  item
2       2  2736814  item
3       3  2247338  item
4       4  3840111  item
   itemmap_id  item_id
0           0  4910621
1           1  4847970
2           2  2736814
3           3  2247338
4           4  3840111
      relation_type  relationmap_id
0  item_to_category               0
1  category_to_item               1
   usermap_id  user_id
0           0    10502
1           1   710733
2           2   412015
3           3   309344
4           4    49306


In [5]:
## going to create the knowledge graph
print(taobao_df.head())

   user_id  item_id  category_id behavior_type   timestamp
0        1  2268318      2520377            pv  1511544070
1        1  2333346      2520771            pv  1511561733
2        1  2576651       149192            pv  1511572885
3        1  3830808      4181361            pv  1511593493
4        1  4365585      2520377            pv  1511596146


In [6]:
## first pulling out the item to category relations

item_to_category_df = taobao_df[['item_id', 'category_id']]
category_to_item_df = taobao_df[['category_id', 'item_id']]

print(item_to_category_df.shape)
print(category_to_item_df.shape)


(100150807, 2)
(100150807, 2)


In [7]:
taobao_df_filtered = taobao_df[['item_id', 'category_id']]


In [8]:
## here is basically just getting the unique combinations between the item id and the category id
taobao_df_filtered = taobao_df_filtered[['item_id', 'category_id']].drop_duplicates()


In [9]:
print(taobao_df_filtered.shape)
print(taobao_df_filtered.head())

print(item_df.head())
print(category_df.head())

(4163442, 2)
   item_id  category_id
0  2268318      2520377
1  2333346      2520771
2  2576651       149192
3  3830808      4181361
4  4365585      2520377
   itemmap_id  item_id
0           0  4910621
1           1  4847970
2           2  2736814
3           3  2247338
4           4  3840111
   categorymap_id  category_id
0         4162024      1003418
1         4162025      1047470
2         4162026      2819583
3         4162027      1363318
4         4162028      2560150


In [10]:
## will map the item_id to the entity_remap id and the category_id to the entity_remap id as well
taobao_filtered_joined_df = taobao_df_filtered.merge(item_df, how = 'left', on = 'item_id')
print(taobao_filtered_joined_df.shape)
print(taobao_filtered_joined_df.head())

(4163442, 3)
   item_id  category_id  itemmap_id
0  2268318      2520377     2827803
1  2333346      2520771     3261306
2  2576651       149192     2446773
3  3830808      4181361     2281264
4  4365585      2520377     3676007


In [11]:
taobao_filtered_joined_df = taobao_filtered_joined_df.merge(category_df, how = 'left', on = 'category_id')
print(taobao_filtered_joined_df.shape)
print(taobao_filtered_joined_df.head())

(4163442, 4)
   item_id  category_id  itemmap_id  categorymap_id
0  2268318      2520377     2827803         4165957
1  2333346      2520771     3261306         4164386
2  2576651       149192     2446773         4165378
3  3830808      4181361     2281264         4170771
4  4365585      2520377     3676007         4165957


In [12]:
print(taobao_filtered_joined_df.dropna().shape)
### no empty entries!

(4163442, 4)


In [14]:
## here, will create the two knowledge graphs
taobao_kg_itemtocat = taobao_filtered_joined_df.copy(deep = True)
taobao_kg_cattoitem = taobao_filtered_joined_df.copy(deep = True)

taobao_kg_itemtocat = taobao_kg_itemtocat[['itemmap_id', 'categorymap_id']]
taobao_kg_cattoitem = taobao_kg_cattoitem[['categorymap_id', 'itemmap_id']]


taobao_kg_itemtocat['relation'] = 0
taobao_kg_cattoitem['relation'] = 1

taobao_kg_itemtocat = taobao_kg_itemtocat.rename(columns = {'itemmap_id': 'head', 
                                                              'categorymap_id': 'tail'})
taobao_kg_cattoitem = taobao_kg_cattoitem.rename(columns = {'categorymap_id': 'head', 
                                                            'itemmap_id': 'tail'})

print(taobao_kg_itemtocat.head())
print(taobao_kg_cattoitem.head())



      head     tail  relation
0  2827803  4165957         0
1  3261306  4164386         0
2  2446773  4165378         0
3  2281264  4170771         0
4  3676007  4165957         0
      head     tail  relation
0  4165957  2827803         1
1  4164386  3261306         1
2  4165378  2446773         1
3  4170771  2281264         1
4  4165957  3676007         1


In [15]:
final_kg  = pd.concat([taobao_kg_itemtocat, taobao_kg_cattoitem])
print(final_kg.head())
print(final_kg.shape)
print(len(final_kg['head'].unique()))
print(len(final_kg['tail'].unique()))

      head     tail  relation
0  2827803  4165957         0
1  3261306  4164386         0
2  2446773  4165378         0
3  2281264  4170771         0
4  3676007  4165957         0
(8326884, 3)
4171463
4171463


In [17]:
final_kg.to_csv('kg_final.csv', index = False)

In [19]:
kg_trial = pd.read_csv('kg_final.csv')
print(kg_trial.head())
print(kg_trial.shape)
print(len(kg_trial['head'].unique()))
print(len(kg_trial['tail'].unique()))

      head     tail  relation
0  2827803  4165957         0
1  3261306  4164386         0
2  2446773  4165378         0
3  2281264  4170771         0
4  3676007  4165957         0
(8326884, 3)
4171463
4171463


# linh 10/11/24

In [2]:
import pandas as pd

In [3]:
kg_final = pd.read_csv("kg_final.csv")

In [5]:
## Dimensions of the reloaded frame as (rows, columns).
kg_final.shape

(8326884, 3)

In [6]:
kg_final.head()

Unnamed: 0,head,tail,relation
0,2120101,4168529,0
1,1013206,4168935,0
2,2612228,4170188,0
3,1728925,4166371,0
4,34476,4168529,0


In [8]:
kg_final = kg_final[['head', 'relation','tail']]

In [11]:
## Export kg_final as space-separated rows without a header line or index column
## (columns are written in the frame's current order).
## mode='w' overwrites the file: with the previous mode='a', every re-run of
## this cell appended another full copy of the triples to kg_final.txt.
kg_final.to_csv(r'kg_final.txt', header=False, index=False, sep=' ', mode='w')