In [2]:
import pandas as pd 

In [3]:
train = pd.read_csv('data/train.txt', header = None).reset_index()
test = pd.read_csv('data/test.txt', header = None).reset_index()
train[0] = train[0].apply(lambda x: ' '.join(x.split())[1:])
test[0] = test[0].apply(lambda x: ' '.join(x.split())[1:])
train = train.rename(columns = {'index': 'user_id', 
                                0:'item_id'})
test = test.rename(columns = {'index':'user_id', 
                              0:'item_id'})

In [None]:
user_list = pd.read_csv('data/user_list.txt', sep = ' ')
item_list = pd.read_csv('data/item_list.txt', sep = ' ')


In [67]:
kg_final = pd.read_csv('data/kg_final.txt', sep = ' ', header = None)
kg_final = kg_final.rename(columns = {0:'head_entity_id',
                           1: 'relation', 
                           2:'tail_entity_id'})

# To Build CKG:

1. Extract User-item interactions
use train.txt to connect (user_list) with (item_list)
example: (user_id, interacts, item_id)
2. Map items to entities 
use item_list to replace item_id with entity_id (from freebase_id)
3. Combine user-entity interactions with kg_final.txt
append user-entity interactions to the existing triplets in kg_final.txt 

In [6]:
train.head()

Unnamed: 0,user_id,item_id
0,0,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 1...
1,1,32 33 34 35
2,2,36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
3,3,52 53 54 55 56 57 58 59 60 61 62 63 64 65 66
4,4,67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 ...


In [7]:
test.head()

Unnamed: 0,user_id,item_id
0,0,4828 774 207 7460 7465 3768 10221 22435
1,1,2825
2,2,10184 3500 10241 10185 1700
3,3,5365 699 3016 6675
4,4,1168 15939 23677 9813 4683 17241 19662 8720 2...


In [8]:
user_list.head()

Unnamed: 0,org_id,remap_id
0,A3RTKL9KB8KLID,0
1,A38LAIK2N83NH0,1
2,A3PPXVR5J6U2JD,2
3,A2ULDDL3MLJPUR,3
4,A2I6MHMAZZDCRX,4


In [9]:
item_list.head()

Unnamed: 0,org_id,remap_id,freebase_id
0,0553092626,0,m.045wq1q
1,0393316041,1,m.03_28m
2,038548254X,2,m.0h2q1cq
3,0385307756,3,m.04y9jxd
4,038531258X,4,m.060c1r


In [68]:
kg_final.head()

Unnamed: 0,head_entity_id,relation,tail_entity_id
0,24915,0,24916
1,24917,1,5117
2,24918,0,24917
3,24919,1,24920
4,24921,2,24922


## Create user-entity interactions 

user_id | relation |entity_id 

user_id from train 

relation: "interacts" 

entity_id: from freebase_id in item_list

In [11]:
#1. need to expand the items in train 
expanded_train = train.set_index('user_id')['item_id'].str.split().explode().reset_index()

In [12]:
expanded_train['item_id'] = expanded_train['item_id'].astype(int)

In [13]:
item_list['remap_id'] = item_list['remap_id'].astype(int)

In [14]:
# Merge with item_list to get freebase_id
user_entity_interactions = expanded_train.merge(item_list, left_on='item_id', right_on='remap_id')



In [15]:

# Step 4: Keep only necessary columns
user_entity_interactions = user_entity_interactions[['user_id', 'freebase_id']].rename(columns={'freebase_id': 'entity_id'})

# Step 5: Add the interaction relation
#there are already 38 relationship types in the relation_list.txt file 
#I add a 39th to indicate an interaction specifically for this use case
user_entity_interactions['relation'] = 39 

# Reorder columns for clarity
user_entity_interactions = user_entity_interactions[['user_id', 'relation', 'entity_id']]



In [16]:
user_entity_interactions

Unnamed: 0,user_id,relation,entity_id
0,0,39,m.045wq1q
1,10,39,m.045wq1q
2,20,39,m.045wq1q
3,30,39,m.045wq1q
4,40,39,m.045wq1q
...,...,...,...
723178,70512,39,m.011c3tgn
723179,70537,39,m.0115wv74
723180,70550,39,m.010s0my0
723181,70610,39,m.01131zx0


In [23]:
entity_list = pd.read_csv("data/entity_list.txt", header = None).reset_index()


In [43]:
entity_list = pd.DataFrame(entity_list[0].str.split().tolist())

In [46]:
entity_list = entity_list[[0,1]]

In [50]:
entity_list.columns = entity_list.iloc[0]

In [55]:
entity_list = entity_list.drop(entity_list.index[0])

In [56]:
entity_list

Unnamed: 0,org_id,remap_id
1,m.045wq1q,0
2,m.03_28m,1
3,m.0h2q1cq,2
4,m.04y9jxd,3
5,m.060c1r,4
...,...,...
83459,m.06388l,113482
83460,m.0dc6ph,113483
83461,m.0j1d6ks,113484
83462,m.06hg41c,113485


In [60]:
user_entity = pd.merge(user_entity_interactions, entity_list, how = 'left',  left_on = ['entity_id'], right_on = ['org_id'])

In [62]:
user_entity = user_entity[['user_id', 'relation', 'entity_id', 'remap_id']]

In [82]:
user_entity['remap_id'] = user_entity['remap_id'].astype(int)

In [83]:
user_entity.head()

Unnamed: 0,user_id,relation,entity_id,remap_id
0,0,39,m.045wq1q,0
1,10,39,m.045wq1q,0
2,20,39,m.045wq1q,0
3,30,39,m.045wq1q,0
4,40,39,m.045wq1q,0


In [65]:
user_entity.shape

(723183, 4)

In [69]:
kg_final.head()

Unnamed: 0,head_entity_id,relation,tail_entity_id
0,24915,0,24916
1,24917,1,5117
2,24918,0,24917
3,24919,1,24920
4,24921,2,24922


In [70]:
kg_final.shape

(2557746, 3)

**Collaborative Knowledge Graph = user_entity + kg_final**

In [86]:
ckg = kg_final.rename(columns = {'head_entity_id':'head', 
                                 'tail_entity_id': 'tail'})
ckg2 = user_entity[['user_id', 'relation', 'remap_id']].rename(columns = {'user_id': 'head',
                                                                          'remap_id':'tail'})

In [89]:
ckg = pd.concat([ckg, ckg2], ignore_index = True)

In [90]:
ckg

Unnamed: 0,head,relation,tail
0,24915,0,24916
1,24917,1,5117
2,24918,0,24917
3,24919,1,24920
4,24921,2,24922
...,...,...,...
3280924,70512,39,24910
3280925,70537,39,24911
3280926,70550,39,24912
3280927,70610,39,24913


In [None]:
#ckg.to_csv("data/ckg.csv")

## Negative Sampling in KGAT
chatgpt 

In [92]:
import random

def generate_negative_samples(ckg, num_neg_samples=1):
    """
    Generate negative samples for the CKG data.
    
    Args:
        ckg (DataFrame): Collaborative Knowledge Graph with columns ['head', 'relation', 'tail'].
        num_neg_samples (int): Number of negative samples to generate per positive sample.
    
    Returns:
        neg_samples (DataFrame): DataFrame containing negative samples.
    """
    neg_samples = []
    entities = list(set(ckg['head']).union(set(ckg['tail'])))  # All entities (users, items, entities)

    for _, row in ckg.iterrows():
        head, relation, tail = row['head'], row['relation'], row['tail']
        
        # Generate negative samples by corrupting the tail
        for _ in range(num_neg_samples):
            neg_tail = random.choice(entities)
            while (head, relation, neg_tail) in ckg.values:
                neg_tail = random.choice(entities)  # Ensure the negative sample is invalid
            neg_samples.append([head, relation, neg_tail])

    neg_samples_df = pd.DataFrame(neg_samples, columns=['head', 'relation', 'tail'])
    return neg_samples_df


# CKG Embedding Layer

In passage 3.1, the KGAT paper discusses how they employe TransR on CKG. This method learns embeds from each entity and relation by optimizing the translation principle $e_h^r + e_r \approx e^r_t$

Our ckg data has the triplet (h,r, t)

``def _build_model_phase_II(self):
        self.h_e, self.r_e, self.pos_t_e, self.neg_t_e = self._get_kg_inference(self.h, self.r, self.pos_t, self.neg_t)
        self.A_kg_score = self._generate_transE_score(h=self.h, t=self.pos_t, r=self.r)
        self.A_out = self._create_attentive_A_out()``