In [55]:
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from model import Model_R
from sklearn.metrics import roc_auc_score
import itertools
import tqdm
import torch.nn.functional as F

In [56]:
data = HeteroData()
product_df = pd.read_csv("data/processed_product.csv", index_col=0)
user_df = pd.read_csv("./data/processed_user.csv", index_col=0)
purchase_df = pd.read_csv("data/processed_purchase.csv", index_col=0)
product_df = product_df.fillna(product_df.mean())
edge_index_user2product = torch.from_numpy(purchase_df.values[:,:-1].T)

data["user"].node_id = torch.arange(len(user_df))
data["product"].node_id = torch.arange(len(product_df))

  """


In [57]:
ratings = torch.from_numpy(purchase_df.values[:,-1].T)

In [58]:
def preprocess_user_review(df, method=None, num_review_feat = None):
  df = df.drop('review', axis=1)
  if method == None:
    return df

  else:
    review_df = pd.read_csv('./data/{}.csv'.format(method))
    
    num2uid = pd.read_csv("data/dict_num2uid.csv", index_col=0)
    uid2num = {val:key for key,val in num2uid.to_dict()['0'].items()}
    review_df.iloc[:,0] = review_df.iloc[:,0].apply(lambda x:uid2num[x])
    review_df = review_df.sort_values(by='리뷰자')
    review_df = review_df.drop(['리뷰자'], axis=1)

    if num_review_feat != None:
      pca = PCA(n_components=num_review_feat)
      review_df = pca.fit_transform(review_df)

    df = np.concatenate([df,review_df], axis =1)
    return df

def preprocess_product_pid(df, method=None):
  if method == None:
    return df.drop('pid', axis=1)

In [59]:
# method = None
methods = ['kobart_emb_kobart' ,'kobart_tfidf' ,'kobart_word2vec' ,"lexrank_emb_kobart" ,"lexrank_tfidf" ,"lexrank_word2vec" ,"gpt_emb_kobart" ,"gpt_tfidf" ,"gpt_word2vec"]
num_review_feats = [None, 64, 32, 8]

In [60]:
settings = list(itertools.product(methods,num_review_feats))+[(None,None)]

In [65]:
for method, num_review_feat in settings:
# for method, num_review_feat in [('kobart_emb_kobart', 64)]:
    print(method, num_review_feat, end=' ')
    user_df_processed = preprocess_user_review(user_df, method=method, num_review_feat=num_review_feat)
    product_df_processed = preprocess_product_pid(product_df)

    # user_scaler = MinMaxScaler((0,1))
    # product_scaler = MinMaxScaler((0,1))
    user_scaler = StandardScaler()
    product_scaler = StandardScaler()

    user_scaled = user_scaler.fit_transform(user_df_processed)
    product_scaled = product_scaler.fit_transform(product_df_processed)
    num_user_feat = user_scaled.shape[1]
    num_product_feat = product_scaled.shape[1]
    data["product"].x = torch.from_numpy(product_scaled).to(torch.float)
    data["user"].x = torch.from_numpy(user_scaled).to(torch.float)
    data["user", "purchase", "product"].edge_index = edge_index_user2product
    data["user", "purchase", "product"].edge_label = ratings.to(torch.float)  # [num_ratings]
    data["product", "rev_purchase", "user"].edge_index = edge_index_user2product[[1,0]]  # TODO
    # data = T.ToUndirected()(data)
    del data["product", "rev_purchase", "user"].edge_label

    transform = T.RandomLinkSplit(
        num_val=0.1,
        num_test=0.1,
        neg_sampling_ratio=0.0,
        edge_types=("user", "purchase", "product"),
        rev_edge_types=("product", "rev_purchase", "user"), 
    )
    test_loss =  test_total_examples = 0
    for iter in range(10):
        train_data, val_data, test_data = transform(data)

        ## Training a Heterogeneous Link-level GNN

        model = Model_R(hidden_channels=64, data=data)
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        model = model.to(device)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

        train_losses = []
        val_losses = []
        for epoch in range(1, 20):
            model = model.train()
            total_loss = total_examples = 0
            val_loss = val_total_examples=  0
            # for sampled_data in tqdm.tqdm(train_loader):

            optimizer.zero_grad()
            train_data = train_data.to(device)

            pred = model(train_data.x_dict, train_data.edge_index_dict,
                        train_data['user', 'product'].edge_label_index)
            target = train_data['user', 'product'].edge_label
            loss = F.mse_loss(pred, target)

            # TODO: Move `sampled_data` to the respective `device`
            # TODO: Run `forward` pass of the model
            # TODO: Apply binary cross entropy via
            # `F.binary_cross_entropy_with_logits(pred, ground_truth)`
            # raise NotImplementedError

            loss.backward()
            optimizer.step()
            total_loss += float(loss)

            model=model.eval()
            # for sampled_data in tqdm.tqdm(val_loader):
            with torch.no_grad():
                # TODO: Collect predictions and ground-truths and write them into
                # `preds` and `ground_truths`.
                # raise NotImplementedError

                val_data = val_data.to(device)
                pred = model(val_data.x_dict, val_data.edge_index_dict,
                            val_data['user', 'product'].edge_label_index)
                pred = pred.clamp(min=0, max=5)
                # pred = pred.clamp(min=-4, max=0)
                target = val_data['user', 'product'].edge_label.float()
                rmse = F.mse_loss(pred, target).sqrt()

                val_loss += float(rmse)
            train_losses.append(total_loss)
            val_losses.append(val_loss)

            # print(f"Epoch: {epoch:03d}, Train_Loss: {train_loss:.4f}, Val_Loss: {val_loss:.4f}")
        ## Evaluating a Heterogeneous Link-level GNN

        preds = []
        ground_truths = []
        # for sampled_data in tqdm.tqdm(val_loader):
        with torch.no_grad():
            # TODO: Collect predictions and ground-truths and write them into
            # `preds` and `ground_truths`.
            # raise NotImplementedError
            test_data = test_data.to(device)
            pred = model(test_data.x_dict, test_data.edge_index_dict,
                        test_data['user', 'product'].edge_label_index)
            pred = pred.clamp(min=0, max=5)
            # pred = pred.clamp(min=-4, max=0)
            target = test_data['user', 'product'].edge_label.float()
            rmse = F.mse_loss(pred, target).sqrt()

            test_loss += float(rmse)

    print(test_loss/10)

kobart_emb_kobart None 1.1011335134506226
kobart_emb_kobart 64 0.9995232343673706
kobart_emb_kobart 32 1.0505940914154053
kobart_emb_kobart 8 1.0650047779083252
kobart_tfidf None 1.4174956202507019
kobart_tfidf 64 1.0318030893802643
kobart_tfidf 32 1.0684087753295899
kobart_tfidf 8 1.109273773431778
kobart_word2vec None 0.9642676830291748
kobart_word2vec 64 1.113122248649597
kobart_word2vec 32 0.987657779455185
kobart_word2vec 8 1.1227283596992492
lexrank_emb_kobart None 1.200694215297699
lexrank_emb_kobart 64 1.0428575217723846
lexrank_emb_kobart 32 0.9833005726337433
lexrank_emb_kobart 8 1.1566920518875121
lexrank_tfidf None 1.2491906106472015
lexrank_tfidf 64 1.034512597322464
lexrank_tfidf 32 0.9835293292999268
lexrank_tfidf 8 1.1173545241355896
lexrank_word2vec None 0.9643010020256042
lexrank_word2vec 64 0.9485349595546723
lexrank_word2vec 32 0.9794614851474762
lexrank_word2vec 8 0.9966721713542939
gpt_emb_kobart None 1.239701509475708
gpt_emb_kobart 64 1.05324729681015
gpt_emb_ko

In [66]:
with torch.no_grad():
    test_data = test_data.to(device)
    pred = model(test_data.x_dict, test_data.edge_index_dict,
                 test_data['user', 'product'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    # pred = pred.clamp(min=-4, max=0)
    target = test_data['user', 'product'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    print(f'Test RMSE: {rmse:.4f}')

userId = test_data['user', 'product'].edge_label_index[0].cpu().numpy()
productID = test_data['user', 'product'].edge_label_index[1].cpu().numpy()
pred = pred.cpu().numpy()
target = target.cpu().numpy()

print(pd.DataFrame({'userId': userId, 'productId': productID, 'rating': pred, 'target': target}))

Test RMSE: 1.1308
     userId  productId    rating  target
0      2778        224  5.000000     5.0
1      1065         39  3.679582     5.0
2      1567         61  2.728793     5.0
3      1220         43  5.000000     5.0
4      2878        123  2.498407     5.0
5      2588         46  3.637355     5.0
6      1253         19  5.000000     4.0
7      2293         90  3.027956     5.0
8       394         11  3.948282     5.0
9      1352          4  5.000000     5.0
10     3013        128  3.524855     5.0
11     1921        144  3.390735     3.0
12      471         13  5.000000     5.0
13     2994         27  5.000000     5.0
14     1746        216  4.425145     5.0
15      875        377  3.231181     4.0
16      196         30  3.679967     4.0
17     3371        107  3.721126     5.0
18     2706        114  5.000000     5.0
19     1832        195  2.828726     5.0
20      545         14  5.000000     5.0
21     3147         14  5.000000     5.0
22     2991        128  4.697078     5.

In [67]:
result_df = pd.DataFrame({'userId': userId, 'productId': productID, 'rating': pred, 'target': target})
result_df[result_df.target==5.0]

Unnamed: 0,userId,productId,rating,target
0,2778,224,5.0,5.0
1,1065,39,3.679582,5.0
2,1567,61,2.728793,5.0
3,1220,43,5.0,5.0
4,2878,123,2.498407,5.0
5,2588,46,3.637355,5.0
7,2293,90,3.027956,5.0
8,394,11,3.948282,5.0
9,1352,4,5.0,5.0
10,3013,128,3.524855,5.0
