In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Directory holding the book_crossing dataset files. The path is relative,
# so it resolves against the notebook's current working directory.
DATA_DIR = os.path.join(
    "dataset",
    "book_crossing",
)

In [None]:
# Load the two pickled interaction frames (interact_train / interact_test).
# NOTE(review): unpickling can execute arbitrary code, so only load files
# that come from a trusted source.
train_edge_pkl = os.path.join(DATA_DIR, "interact_train.pkl")
train_edge_df = pd.read_pickle(train_edge_pkl)

test_edge_pkl = os.path.join(DATA_DIR, "interact_test.pkl")
test_edge_df = pd.read_pickle(test_edge_pkl)

# Both shapes on a single line, followed by a preview of each frame.
print(train_edge_df.shape, test_edge_df.shape)
display(train_edge_df.head(), test_edge_df.head())

In [None]:
def plot_item_degree_dist(edge_df, item_col="itemid", title="Item Degree Distribution"):
    """Plot, on log-log axes, how many items have each interaction count.

    An item's degree is the number of rows of ``edge_df`` in which it
    appears in ``item_col``. The figure shows, for every observed degree,
    the number of distinct items having exactly that degree.

    Parameters
    ----------
    edge_df : pandas.DataFrame
        Table with one row per interaction; must contain ``item_col``.
    item_col : str, default "itemid"
        Column identifying the item of each row. Rows with a missing value
        in this column are not counted (``value_counts`` drops NaN by
        default).
    title : str, default "Item Degree Distribution"
        Figure title. Pass a distinct title when plotting several frames so
        the resulting figures can be told apart.

    Returns
    -------
    None
        The figure is rendered through ``plt.show()``.

    Raises
    ------
    KeyError
        If ``item_col`` is not a column of ``edge_df``.
    """
    # Rows per item -> degree of each item.
    item_degree = edge_df[item_col].value_counts()
    # Items per degree, ordered by increasing degree so the line runs left to right.
    degree_distribution = item_degree.value_counts().sort_index()

    # Explicit Axes instead of the implicit pyplot "current figure" state.
    _, ax = plt.subplots(figsize=(10, 6))
    ax.loglog(degree_distribution.index, degree_distribution.values, marker='o')
    ax.set_xlabel('Degree')
    ax.set_ylabel('Number of Items')
    ax.set_title(title)
    ax.grid(True)
    plt.show()

# Degree distribution computed from the training interactions only.
plot_item_degree_dist(edge_df=train_edge_df)

In [None]:
# Stack the train and test interactions into one frame with a fresh
# 0..n-1 index, then plot the degree distribution over the combined rows.
all_edge_df = pd.concat(
    objs=(train_edge_df, test_edge_df),
    ignore_index=True,
)
plot_item_degree_dist(edge_df=all_edge_df)

In [None]:
# Read item_feature.pkl, report its shape, and preview the first five rows.
# NOTE(review): unpickling can execute arbitrary code; load trusted files only.
item_feat_pkl = os.path.join(DATA_DIR, "item_feature.pkl")
item_feat_df = pd.read_pickle(filepath_or_buffer=item_feat_pkl)

print(item_feat_df.shape)
item_feat_df.head(n=5)

In [None]:
# Read encoded_item_feature.pkl, report its shape, and preview the first five rows.
# NOTE(review): unpickling can execute arbitrary code; load trusted files only.
enc_item_feat_pkl = os.path.join(DATA_DIR, "encoded_item_feature.pkl")
enc_item_feat_df = pd.read_pickle(filepath_or_buffer=enc_item_feat_pkl)

print(enc_item_feat_df.shape)
enc_item_feat_df.head(n=5)

In [None]:
# Read user_encoder_map.csv, report its shape, and preview the first five rows.
user_map_csv = os.path.join(DATA_DIR, "user_encoder_map.csv")
user_map_df = pd.read_csv(filepath_or_buffer=user_map_csv)

print(user_map_df.shape)
user_map_df.head(n=5)

In [None]:
# Read item_encoder_map.csv, report its shape, and preview the first five rows.
item_map_csv = os.path.join(DATA_DIR, "item_encoder_map.csv")
item_map_df = pd.read_csv(filepath_or_buffer=item_map_csv)

print(item_map_df.shape)
item_map_df.head(n=5)

In [None]:
# Distinct values of the "item" column in each of the three item tables.
item_ids_feat = {*item_feat_df["item"].tolist()}
item_ids_enc = {*enc_item_feat_df["item"].tolist()}
item_ids_map = {*item_map_df["item"].tolist()}

# First line: size of each id set. Second line: number of ids shared by the
# plain and encoded feature tables.
# NOTE(review): item_ids_map is only sized here, never compared with the others.
print(*(len(ids) for ids in (item_ids_feat, item_ids_enc, item_ids_map)))
print(len(item_ids_feat.intersection(item_ids_enc)))

In [None]:
# Show the dtype of every column in the encoded item-feature frame.
enc_item_feat_df.dtypes

In [None]:
# Longer preview: the first ten rows of the encoded item-feature frame.
enc_item_feat_df.head(n=10)