# This notebook is used to cut down the data that we are using for training the model


## importing the necessary libraries for the Amazon data set


In [1]:
import torch
import numpy as np
from torch_geometric.datasets import AmazonBook
import pandas as pd

In [2]:
## importing in the train and test sets
users_df = pd.read_csv('user_list.csv')
entity_df = pd.read_csv('entity_list.csv')
item_df = pd.read_csv('item_list.csv')


In [3]:
print(users_df.shape)
print(entity_df.shape)
print(item_df.shape)

(987994, 2)
(4171463, 3)
(4162024, 2)


In [4]:
### getting the filtered users list
users_filtered = users_df.sample(n = 50000, replace = False)
print(users_filtered.shape)
print(users_filtered.head())

(50000, 2)
        usermap_id  user_id
166482      166482   983246
821949      821949   748177
49365        49365   657111
453247      453247   376824
804351      804351   399877


In [117]:
## need to get the filtered items list as well

taobao_df = pd.read_csv("taobao/raw/UserBehavior.csv", header = None)
print(taobao_df.head())
taobao_df.columns = ['user_id', 'item_id', 'category_id', 'behavior_type', 'timestamp']
taobao_df.head()


   0        1        2   3           4
0  1  2268318  2520377  pv  1511544070
1  1  2333346  2520771  pv  1511561733
2  1  2576651   149192  pv  1511572885
3  1  3830808  4181361  pv  1511593493
4  1  4365585  2520377  pv  1511596146


Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
0,1,2268318,2520377,pv,1511544070
1,1,2333346,2520771,pv,1511561733
2,1,2576651,149192,pv,1511572885
3,1,3830808,4181361,pv,1511593493
4,1,4365585,2520377,pv,1511596146


In [6]:
## filtering for the entries for the actual users : 

taobao_filtered = taobao_df[taobao_df['user_id'].isin(users_filtered['user_id'])]
print(taobao_filtered.shape)

(5065216, 5)


In [7]:
print(taobao_df.shape)

(100150807, 5)


In [8]:
taobao_filtered = taobao_filtered.merge(item_df, on = 'item_id', how = 'left')
print(taobao_filtered.shape)
print(taobao_filtered.head())

(5065216, 6)
   user_id  item_id  category_id behavior_type   timestamp  itemmap_id
0  1000040  3849409      4756105            pv  1511578189      764448
1  1000040  3386371      2355072            pv  1511578330     1075195
2  1000040  2331370      3607361            pv  1511578403       56983
3  1000040  3381470      4145813            pv  1511596534      765968
4  1000040  3083057      3102419            pv  1511596593     3969547


In [9]:
kg_df = pd.read_csv('kg_final.csv')

In [10]:
print(kg_df.shape)
print(kg_df.head())

(8326884, 3)
      head     tail  relation
0  2827803  4165957         0
1  3261306  4164386         0
2  2446773  4165378         0
3  2281264  4170771         0
4  3676007  4165957         0


In [11]:
kg_filtered = kg_df[(kg_df['head'].isin(taobao_filtered['itemmap_id']) )| (kg_df['tail'].isin(taobao_filtered['itemmap_id']))]

In [12]:
print(kg_filtered.shape)
print(kg_filtered.head())

(2221908, 3)
      head     tail  relation
0  2827803  4165957         0
1  3261306  4164386         0
2  2446773  4165378         0
3  2281264  4170771         0
4  3676007  4165957         0


In [29]:
len(kg_filtered['tail'].unique())

1117779

This essentially eliminates it down to 50,000 users and in total, 1117779 unique entities. 

Now, we will need to go through to redevelop the mappings of the users/items/etc such that they are capped at the total number of values here

In [14]:
print(users_filtered.head())
print(len(users_filtered['usermap_id'].unique()))
print(len(users_filtered['user_id'].unique()))

        usermap_id  user_id
166482      166482   983246
821949      821949   748177
49365        49365   657111
453247      453247   376824
804351      804351   399877
50000
50000


In [15]:
## now, will re-order the users to index appropriately

users_filtered.reset_index(inplace = True)
print(users_filtered.shape)
print(users_filtered.head())

(50000, 3)
    index  usermap_id  user_id
0  166482      166482   983246
1  821949      821949   748177
2   49365       49365   657111
3  453247      453247   376824
4  804351      804351   399877


In [16]:
users_filtered = users_filtered.drop(columns = 'index')

In [17]:
users_filtered.head()

Unnamed: 0,usermap_id,user_id
0,166482,983246
1,821949,748177
2,49365,657111
3,453247,376824
4,804351,399877


In [18]:
users_filtered = users_filtered.reset_index()
users_filtered.head()

Unnamed: 0,index,usermap_id,user_id
0,0,166482,983246
1,1,821949,748177
2,2,49365,657111
3,3,453247,376824
4,4,804351,399877


In [19]:
users_filtered = users_filtered.rename(columns = {'index':'new_usermap_id'})
users_filtered.head()

Unnamed: 0,new_usermap_id,usermap_id,user_id
0,0,166482,983246
1,1,821949,748177
2,2,49365,657111
3,3,453247,376824
4,4,804351,399877


In [20]:
print(len(users_filtered['new_usermap_id'].unique()))

50000


In [21]:
users_filtered.to_csv('users_filtered_list.csv')

In [22]:
## mapping the taobao items list 
print(len(users_filtered['user_id'].unique()))

50000


In [23]:
print(len(taobao_filtered['user_id'].unique()))

50000


In [24]:
taobao_filtered.shape

(5065216, 6)

In [25]:
taobao_filtered[taobao_filtered['user_id'].isin(users_filtered['user_id'])].shape
## just to make sure that it's the right data set

(5065216, 6)

In [27]:
len(taobao_filtered['item_id'].unique()) + len(taobao_filtered['category_id'].unique())

1117778

In [30]:
print(item_df.head())

   itemmap_id  item_id
0           0  4910621
1           1  4847970
2           2  2736814
3           3  2247338
4           4  3840111


In [31]:
item_filtered = item_df[item_df['item_id'].isin(taobao_filtered['item_id'])]
print(item_filtered.head())
print(item_filtered.shape)

    itemmap_id  item_id
6            6  1587638
15          15  4102184
16          16  4823083
24          24  3597637
25          25   884047
(1110292, 2)


In [32]:
item_filtered = item_filtered.reset_index()
item_filtered.head()

Unnamed: 0,index,itemmap_id,item_id
0,6,6,1587638
1,15,15,4102184
2,16,16,4823083
3,24,24,3597637
4,25,25,884047


In [33]:
item_filtered = item_filtered.drop(columns = 'index')
item_filtered = item_filtered.reset_index()
item_filtered.head()

Unnamed: 0,index,itemmap_id,item_id
0,0,6,1587638
1,1,15,4102184
2,2,16,4823083
3,3,24,3597637
4,4,25,884047


In [34]:
item_filtered.head()

Unnamed: 0,index,itemmap_id,item_id
0,0,6,1587638
1,1,15,4102184
2,2,16,4823083
3,3,24,3597637
4,4,25,884047


In [35]:
item_filtered = item_filtered.rename(columns = {'index': 'new_itemmap_id'})
item_filtered.head()

Unnamed: 0,new_itemmap_id,itemmap_id,item_id
0,0,6,1587638
1,1,15,4102184
2,2,16,4823083
3,3,24,3597637
4,4,25,884047


In [36]:
print(item_filtered['new_itemmap_id'].max())
print(item_filtered.shape)

1110291
(1110292, 3)


In [37]:
category_df = pd.read_csv('category_list.csv')
print(category_df.head())
print(category_df.shape)

   categorymap_id  category_id
0         4162024      1003418
1         4162025      1047470
2         4162026      2819583
3         4162027      1363318
4         4162028      2560150
(9439, 2)


In [38]:
category_filtered = category_df[category_df['category_id'].isin(taobao_filtered['category_id'])]
print(category_filtered.shape)
print(category_filtered.head())

(7486, 2)
   categorymap_id  category_id
1         4162025      1047470
2         4162026      2819583
3         4162027      1363318
5         4162029       360595
6         4162030      1234361


In [39]:
category_filtered = category_filtered.reset_index()
category_filtered.head()

Unnamed: 0,index,categorymap_id,category_id
0,1,4162025,1047470
1,2,4162026,2819583
2,3,4162027,1363318
3,5,4162029,360595
4,6,4162030,1234361


In [40]:
category_filtered = category_filtered.drop(columns = 'index')
category_filtered.head()

Unnamed: 0,categorymap_id,category_id
0,4162025,1047470
1,4162026,2819583
2,4162027,1363318
3,4162029,360595
4,4162030,1234361


In [41]:
category_filtered = category_filtered.reset_index()
category_filtered.head()

Unnamed: 0,index,categorymap_id,category_id
0,0,4162025,1047470
1,1,4162026,2819583
2,2,4162027,1363318
3,3,4162029,360595
4,4,4162030,1234361


In [42]:
category_filtered = category_filtered.rename(columns = {'index': 'new_categorymap_id'})

category_filtered['new_categorymap_id'] = category_filtered['new_categorymap_id'] + item_filtered['new_itemmap_id'].max() + 1
print(category_filtered.head())
print(category_filtered.shape)
print(len(category_filtered['new_categorymap_id'].unique()))


   new_categorymap_id  categorymap_id  category_id
0             1110292         4162025      1047470
1             1110293         4162026      2819583
2             1110294         4162027      1363318
3             1110295         4162029       360595
4             1110296         4162030      1234361
(7486, 3)
7486


In [44]:
print(category_filtered['new_categorymap_id'].max())
7486 + 1110291

1117777


1117777

In [49]:
print(item_filtered.head())
print(item_filtered['new_itemmap_id'].max())
print(len(item_filtered['new_itemmap_id'].unique()))
print(item_filtered.shape)

print(category_filtered.head())
print(category_filtered['new_categorymap_id'].max())
print(len(category_filtered['new_categorymap_id'].unique()))

   new_itemmap_id  itemmap_id  item_id
0               0           6  1587638
1               1          15  4102184
2               2          16  4823083
3               3          24  3597637
4               4          25   884047
1110291
1110292
(1110292, 3)
   new_categorymap_id  categorymap_id  category_id
0             1110292         4162025      1047470
1             1110293         4162026      2819583
2             1110294         4162027      1363318
3             1110295         4162029       360595
4             1110296         4162030      1234361
1117777
7486


In [46]:
print(item_filtered.shape)
print(category_filtered.shape)

(1110292, 3)
(7486, 3)


In [47]:
1110291 + 7486

1117777

In [50]:
category_filtered2 = category_filtered.copy(deep = True)
item_filtered2 = item_filtered.copy(deep = True)

category_filtered2['type'] = 'category'
item_filtered2['type'] = 'item'

category_filtered2.rename(columns = {'new_categorymap_id':'new_map_id', 
                               'category_id': 'og_id',
                               'categorymap_id': 'og_map_id'}, 
                    inplace = True)
item_filtered2.rename(columns = {'new_itemmap_id': 'new_map_id', 
                           'item_id': 'og_id', 
                           'itemmap_id': 'og_map_id'}, 
                inplace = True)

entity_filtered = pd.concat([item_filtered2, category_filtered2])

In [51]:
print(entity_filtered.head())
print(entity_filtered.shape)

   new_map_id  og_map_id    og_id  type
0           0          6  1587638  item
1           1         15  4102184  item
2           2         16  4823083  item
3           3         24  3597637  item
4           4         25   884047  item
(1117778, 4)


In [52]:
print(entity_filtered['new_map_id'].max())

1117777


In [None]:
print(len(entity_filtered['new_map_id'].unique()))
print(entity_filtered['new_map_id'].max())
print(entity_filtered.shape)

print(len(category_filtered['new_categorymap_id'].unique()))
print(category_filtered['new_categorymap_id'].max())
print(category_filtered['new_categorymap_id'].min())
print(category_filtered.shape)

print(len(item_filtered['new_itemmap_id'].unique()))

print(item_filtered['new_itemmap_id'].max())
print(item_filtered.shape)

1117778
1117777
(1117778, 4)
7486
1117777
1110292
(7486, 3)
1110292
1110291
(1110292, 3)


In [57]:
## saving the item list and entity list

entity_filtered.to_csv('entity_filtered_list.csv')
item_filtered.to_csv('item_filtered_list.csv')
category_filtered.to_csv('category_filtered_list.csv')

In [94]:
print(taobao_df.head())

   user_id  item_id  category_id behavior_type   timestamp
0        1  2268318      2520377            pv  1511544070
1        1  2333346      2520771            pv  1511561733
2        1  2576651       149192            pv  1511572885
3        1  3830808      4181361            pv  1511593493
4        1  4365585      2520377            pv  1511596146


In [146]:
print(taobao_df.shape)
taobao_filtered2 = taobao_df[['item_id', 'category_id']].drop_duplicates()
print(taobao_filtered2.shape)
print(taobao_filtered2.head())

(100150807, 5)
(4163442, 2)
   item_id  category_id
0  2268318      2520377
1  2333346      2520771
2  2576651       149192
3  3830808      4181361
4  4365585      2520377


In [147]:
item_filtered.head()

Unnamed: 0,new_itemmap_id,itemmap_id,item_id
0,0,6,1587638
1,1,15,4102184
2,2,16,4823083
3,3,24,3597637
4,4,25,884047


In [141]:
trial = taobao_df.groupby('item_id')['category_id'].nunique()

KeyboardInterrupt: 

In [119]:
trial = trial.reset_index()
print(trial.head())
trial[trial['category_id'] != 1]

   item_id  category_id
0        1            1
1        3            1
2        4            1
3        5            1
4        6            1


Unnamed: 0,item_id,category_id
1316,1608,2
2475,3035,2
8324,10307,2
16397,20353,2
20228,25084,2
...,...,...
4147343,5144889,2
4148256,5146010,2
4149316,5147331,2
4158829,5159113,2


In [148]:
taobao_filtered2 = taobao_filtered2[(taobao_filtered2['item_id'].isin(item_filtered['item_id'])) & (taobao_filtered2['category_id'].isin(category_filtered['category_id']))]
print(taobao_filtered2.shape)

(1110953, 2)


In [149]:
len(item_filtered['item_id'].unique())

1110292

In [150]:
print(len(taobao_filtered2['item_id'].unique()))

1110292


In [152]:
print(len(taobao_filtered2['category_id'].unique()))


7486


In [153]:
print(taobao_filtered2.shape)
print(taobao_df.shape)
print(taobao_filtered2.head())

(1110953, 2)
(100150807, 5)
   item_id  category_id
0  2268318      2520377
1  2333346      2520771
2  2576651       149192
3  3830808      4181361
4  4365585      2520377


In [154]:
## making the filtered kg graph
taobao_filtered2 = taobao_filtered2.merge(item_filtered, on = 'item_id', how = 'left')
taobao_filtered2.head()

Unnamed: 0,item_id,category_id,new_itemmap_id,itemmap_id
0,2268318,2520377,754333,2827803
1,2333346,2520771,869675,3261306
2,2576651,149192,652803,2446773
3,3830808,4181361,608696,2281264
4,4365585,2520377,980434,3676007


In [155]:
taobao_filtered2 = taobao_filtered2.merge(category_filtered, on = 'category_id', how = 'left')

In [156]:
taobao_filtered2.head()

Unnamed: 0,item_id,category_id,new_itemmap_id,itemmap_id,new_categorymap_id,categorymap_id
0,2268318,2520377,754333,2827803,1113413,4165957
1,2333346,2520771,869675,3261306,1112174,4164386
2,2576651,149192,652803,2446773,1112954,4165378
3,3830808,4181361,608696,2281264,1117217,4170771
4,4365585,2520377,980434,3676007,1113413,4165957


In [157]:
print(taobao_filtered2.shape)
taobao_filtered2.head()

(1110953, 6)


Unnamed: 0,item_id,category_id,new_itemmap_id,itemmap_id,new_categorymap_id,categorymap_id
0,2268318,2520377,754333,2827803,1113413,4165957
1,2333346,2520771,869675,3261306,1112174,4164386
2,2576651,149192,652803,2446773,1112954,4165378
3,3830808,4181361,608696,2281264,1117217,4170771
4,4365585,2520377,980434,3676007,1113413,4165957


In [158]:
print(len(taobao_filtered2['new_categorymap_id'].unique()))
print(len(taobao_filtered2['new_itemmap_id'].unique()))

7486
1110292


In [159]:
taobao_filtered3 = taobao_filtered2[['new_itemmap_id', 'new_categorymap_id']].drop_duplicates()
print(taobao_filtered3.shape)


(1110953, 2)


In [162]:
print(len(taobao_filtered3['new_categorymap_id'].unique()))
print(len(taobao_filtered3['new_itemmap_id'].unique()))

7486
1110292


In [163]:
1110292 + 7486

1117778

In [164]:
1117778 - 1110504

7274

In [165]:
print(taobao_filtered3.shape)

(1110953, 2)


In [174]:
taobao_kg_itemtocat = taobao_filtered3.copy(deep = True)
taobao_kg_cattoitem = taobao_filtered3.copy(deep = True)

taobao_kg_itemtocat = taobao_kg_itemtocat[['new_itemmap_id', 'new_categorymap_id']]
taobao_kg_cattoitem = taobao_kg_cattoitem[['new_categorymap_id', 'new_itemmap_id']]


taobao_kg_itemtocat['relation'] = 0
taobao_kg_cattoitem['relation'] = 1

taobao_kg_itemtocat = taobao_kg_itemtocat.rename(columns = {'new_itemmap_id': 'head', 
                                                              'new_categorymap_id': 'tail'})
taobao_kg_cattoitem = taobao_kg_cattoitem.rename(columns = {'new_categorymap_id': 'head', 
                                                            'new_itemmap_id': 'tail'})

print(taobao_kg_itemtocat.head())
print(taobao_kg_cattoitem.head())

     head     tail  relation
0  754333  1113413         0
1  869675  1112174         0
2  652803  1112954         0
3  608696  1117217         0
4  980434  1113413         0
      head    tail  relation
0  1113413  754333         1
1  1112174  869675         1
2  1112954  652803         1
3  1117217  608696         1
4  1113413  980434         1


In [175]:
kg_filtered = pd.concat([taobao_kg_itemtocat, taobao_kg_cattoitem])

In [177]:
print(len(kg_filtered['head'].unique()))
print(len(kg_filtered['tail'].unique()))

1117778
1117778


In [178]:
print(kg_filtered.shape)

(2221906, 3)


In [179]:
kg_filtered.head()

Unnamed: 0,head,tail,relation
0,754333,1113413,0
1,869675,1112174,0
2,652803,1112954,0
3,608696,1117217,0
4,980434,1113413,0


In [180]:
kg_filtered.to_csv('kg_filtered.csv')

# ignore down below

In [184]:
len(item_filtered['new_itemmap_id'].unique())

1110292

In [182]:
item_filtered[~item_filtered['new_itemmap_id'].isin(kg_filtered['head'])]

Unnamed: 0,new_itemmap_id,itemmap_id,item_id


In [185]:
category_filtered[~category_filtered['new_categorymap_id'].isin(kg_filtered['head'])]

Unnamed: 0,new_categorymap_id,categorymap_id,category_id


In [186]:
taobao_filtered_trial = taobao_df[(taobao_df['item_id'].isin(item_filtered['item_id'])) & (taobao_df['category_id'].isin(category_filtered['category_id']))]


In [187]:
print(taobao_filtered_trial.head())

   user_id  item_id  category_id behavior_type   timestamp
0        1  2268318      2520377            pv  1511544070
1        1  2333346      2520771            pv  1511561733
2        1  2576651       149192            pv  1511572885
3        1  3830808      4181361            pv  1511593493
4        1  4365585      2520377            pv  1511596146


In [189]:
users_filtered[~users_filtered['user_id'].isin(taobao_filtered_trial['user_id'])]

Unnamed: 0,new_usermap_id,usermap_id,user_id


In [190]:
taobao_filtered_trial[~taobao_filtered_trial['item_id'].isin(kg_filtered['head'])]

Unnamed: 0,user_id,item_id,category_id,behavior_type,timestamp
0,1,2268318,2520377,pv,1511544070
1,1,2333346,2520771,pv,1511561733
2,1,2576651,149192,pv,1511572885
3,1,3830808,4181361,pv,1511593493
4,1,4365585,2520377,pv,1511596146
...,...,...,...,...,...
100150802,999999,4797808,11120,pv,1512293403
100150803,999999,4613472,4602841,pv,1512293766
100150804,999999,3647364,2304296,pv,1512293792
100150805,999999,1903801,2304296,pv,1512293827
