In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import torch
from torch_geometric.data import Data
import numpy as np

In [6]:
df = pd.read_csv('../data/processed_data/data.csv')

In [7]:
df = df.dropna()
df.head()

Unnamed: 0,Id,ASIN,title,group,salesrank,similar,categories,total_reviews,avg_rating
0,1,827229534,Patterns of Preaching,Book,396585,0804215715 156101074X 0687023955 0687074231 08...,|Books[283155]|Subjects[1000]|Religion & Spiri...,2,5.0
1,2,738700797,Candlemas,Book,168596,0738700827 1567184960 1567182836 0738700525 07...,|Books[283155]|Subjects[1000]|Religion & Spiri...,12,4.5
3,4,842328327,Life Application Bible Commentary,Book,631289,0842328130 0830818138 0842330313 0842328610 08...,|Books[283155]|Subjects[1000]|Religion & Spiri...,1,4.0
4,5,1577943082,Prayers That Avail Much for Business,Book,455160,157794349X 0892749504 1577941829 0892749563 15...,|Books[283155]|Subjects[1000]|Religion & Spiri...,0,0.0
5,6,486220125,How the Other Half Lives,Book,188784,0486401960 0452283612 0486229076 0714840343 03...,|Books[283155]|Subjects[1000]|Arts & Photograp...,17,4.0


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 372183 entries, 0 to 542681
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Id             372183 non-null  int64  
 1   ASIN           372183 non-null  object 
 2   title          372183 non-null  object 
 3   group          372183 non-null  object 
 4   salesrank      372183 non-null  int64  
 5   similar        372183 non-null  object 
 6   categories     372183 non-null  object 
 7   total_reviews  372183 non-null  int64  
 8   avg_rating     372183 non-null  float64
dtypes: float64(1), int64(3), object(5)
memory usage: 28.4+ MB


In [9]:
df['text'] = df['title'] + ' ' + df['categories']

vectorizer = TfidfVectorizer(max_features=100)
tfidf_features = vectorizer.fit_transform(df['text']).toarray()

print(tfidf_features.shape)

(372183, 100)


In [28]:
group_encoder = LabelEncoder()
df['group_encoded'] = group_encoder.fit_transform(df['group'])
labels = torch.tensor(df['group_encoded'].values, dtype=torch.long)
df['salesrank_normalized'] = df['salesrank'] / df['salesrank'].max()

other_features = df[['salesrank_normalized', 'group_encoded']].to_numpy()
combined_features = np.hstack([other_features, tfidf_features])


In [29]:
edges = []
asin_to_index = {asin: idx for idx, asin in enumerate(df['ASIN'].unique())}

for _, row in df.iterrows():
    node_idx = asin_to_index[row['ASIN']]
    if pd.isna(row['similar']) or row['similar'] == '':
        continue
    similar_asins = row['similar'].split()
    for asin in similar_asins:
        if asin in asin_to_index:
            similar_idx = asin_to_index[asin]
            edges.append((node_idx, similar_idx))


In [30]:
if edges:
    print(f"Found {len(edges)} edges.")
    edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
else:
    print("No edges found. Check the edge creation process.")


Found 1199750 edges.


In [31]:
x = torch.tensor(combined_features, dtype=torch.float)
data = Data(x=x, edge_index=edge_index, y=labels)
torch.save(data, '../data/processed_data/graph_data.pt')