# Import

In [None]:
# Run `nvidia-smi` through the IPython shell escape and keep its output lines.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in the command output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
else:
    print(gpu_info)

In [None]:
import os

# Remember and report the id of the process executing this notebook.
pid = os.getpid()
print("Current Process ID:", pid)

In [None]:
import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import torch.nn.functional as F
# from torch_geometric.nn import GCNConv, GATConv
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

In [None]:
# torch.multiprocessing.set_start_method('forkserver')

In [None]:
torch.__version__

In [None]:
import numpy as np
import pickle
import pandas as pd
import random

# Load Mapping Dict

train / test 的 brand / inf ID，以及我有轉成另一種 node ID，是 train 的 brand / inf 一起編號，test 亦然。

In [None]:
# Load the four pickled train/test brand and influencer index objects.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
_loaded = []
for _path in (
    "discover_train_brand_index.pickle",
    "discover_test_brand_index.pickle",
    "discover_train_inf_index.pickle",
    "discover_test_inf_index.pickle",
):
    with open(_path, 'rb') as f:
        _loaded.append(pickle.load(f))
train_brand_ids, test_brand_ids, train_inf_ids, test_inf_ids = _loaded

In [None]:
# Load the four pickled "node -> brand/influencer id" objects of the train/test split.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
_loaded = []
for _path in (
    "train_test_split/train_node_brand_id.pkl",
    "train_test_split/train_node_inf_id.pkl",
    "train_test_split/test_node_brand_id.pkl",
    "train_test_split/test_node_inf_id.pkl",
):
    with open(_path, 'rb') as f:
        _loaded.append(pickle.load(f))
train_node_brand_id, train_node_inf_id, test_node_brand_id, test_node_inf_id = _loaded

In [None]:
# Load the four pickled "brand/influencer id -> node id" lookups of the train/test split.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
_loaded = []
for _path in (
    "train_test_split/train_brand_node_id.pkl",
    "train_test_split/train_inf_node_id.pkl",
    "train_test_split/test_brand_node_id.pkl",
    "train_test_split/test_inf_node_id.pkl",
):
    with open(_path, 'rb') as f:
        _loaded.append(pickle.load(f))
train_brand_node_id, train_inf_node_id, test_brand_node_id, test_inf_node_id = _loaded

In [None]:
# Load the full brand list, the full influencer list and the category list.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
_loaded = []
for _path in ('sorted_360_brand_list.pkl', 'sorted_3748_inf_list.pkl', 'category_list.pickle'):
    with open(_path, 'rb') as f:
        _loaded.append(pickle.load(f))
all_brands, all_infs, category_list = _loaded

In [None]:
# Sizes of the train/test splits. They size the per-node feature tables below,
# and the *_BRAND_NUM values are subtracted from influencer node ids to get
# zero-based influencer slots.
TRAIN_BRAND_NUM = 286
TRAIN_INF_NUM = 3075
TEST_BRAND_NUM = 74
TEST_INF_NUM = 797

# Negative Sampling

In [None]:
# Training label rows; sampling_on_the_fly reads the values 1, 0 and 2 from them.
train_label_list = np.load('train_test_split/train_label_list_with_category.npy')

In [None]:
# Default number of negatives / hard negatives drawn per positive by sampling_on_the_fly.
neg_num = 3
hard_neg_num = 1

In [None]:
def sampling_on_the_fly(brand_node_id, label_list, neg_num=neg_num, hard_neg_num=hard_neg_num):
    """Draw negative samples for every positive influencer of each brand.

    For each brand the label row ``label_list[brand]`` is read: entries equal
    to 1 are positives, entries equal to 0 or 2 form the negative pool and
    entries equal to 2 form the hard-negative pool.

    Args:
        brand_node_id: 1-D tensor of row indices into ``label_list``.
        label_list: 2-D tensor of labels, one row per brand.
        neg_num: negatives drawn (without replacement) per positive.
        hard_neg_num: hard negatives drawn (without replacement) per positive.

    Returns:
        A list with one 2-D index tensor per brand. Each row is
        ``[pos, neg..., hard_neg...]`` and there is one row per positive.
        If a pool is smaller than the requested count the row is narrower.
        A brand without positives yields an empty ``(0, width)`` tensor
        instead of raising.
    """
    all_brand_samples = []

    for i in range(brand_node_id.shape[0]):
        label = label_list[brand_node_id[i]]
        positive_indices = torch.nonzero(label == 1).squeeze(1)
        negative_indices = torch.nonzero((label == 0) | (label == 2)).squeeze(1)
        hard_neg_indices = torch.nonzero(label == 2).squeeze(1)

        num_negative_samples = len(negative_indices)
        num_hard_neg_samples = len(hard_neg_indices)

        brand_samples = []
        for j in range(len(positive_indices)):
            # Same RNG call order as before: negatives first, then hard negatives.
            negative_indices_sampled = negative_indices[torch.randperm(num_negative_samples)[:neg_num]]
            hard_neg_indices_sampled = hard_neg_indices[torch.randperm(num_hard_neg_samples)[:hard_neg_num]]

            # Slicing keeps the positive index on the labels' own device, so the
            # function no longer depends on a module-level ``device``.
            brand_samples.append(
                torch.cat((positive_indices[j:j + 1], negative_indices_sampled, hard_neg_indices_sampled))
            )

        if brand_samples:
            all_brand_samples.append(torch.stack(brand_samples))
        else:
            # torch.stack([]) raises; return a well-formed empty tensor instead.
            width = 1 + min(neg_num, num_negative_samples) + min(hard_neg_num, num_hard_neg_samples)
            all_brand_samples.append(positive_indices.new_empty((0, width)))

    return all_brand_samples

In [None]:
# Expand every sample row into [brand, pos, (hard) neg] triplets.
# Sampling is repeated n_samples times and the rounds are concatenated row-wise.

n_samples = 5
five_times = np.array([])
for k in range(n_samples):
    tt = sampling_on_the_fly(torch.arange(0, TRAIN_BRAND_NUM).to(device), torch.from_numpy(train_label_list).to(device), neg_num, hard_neg_num)

    round_triplets = []
    for brand_id, brand_samples in enumerate(tt):
        # Column 0 is the positive; every remaining column is a (hard) negative.
        # Using the real width instead of a fixed range(1, 5) avoids an
        # IndexError when a brand has fewer negatives than requested.
        if brand_samples.shape[0] == 0 or brand_samples.shape[1] < 2:
            continue
        negatives = brand_samples[:, 1:]
        positives = brand_samples[:, :1].expand_as(negatives)
        brands = torch.full_like(negatives, brand_id)
        # (n_pos, n_neg, 3) -> (n_pos * n_neg, 3), ordered by positive then negative.
        round_triplets.append(torch.stack((brands, positives, negatives), dim=2).reshape(-1, 3))

    if round_triplets:
        all_hard_samples = torch.cat(round_triplets).cpu().numpy()
    else:
        all_hard_samples = np.empty((0, 3), dtype=np.int64)

    if k == 0:
        five_times = all_hard_samples
    else:
        five_times = np.concatenate((five_times, all_hard_samples), axis=0)
    print(five_times.shape)
        

# Handle Input Data

## Text - LDA

因為 train/test 時已經把 50 篇的字接在一起再斷詞了，所以不需再做 history pooling

In [None]:
# LDA outputs for train and test; the cells below treat the first *_BRAND_NUM
# rows as brands and the remaining rows as influencers.
train_lda = np.load('lda/lda_topic19_it50_train_pred.npy')
test_lda = np.load('lda/lda_topic19_it50_test_pred.npy')

In [None]:
train_lda.shape, test_lda.shape

In [None]:
# Re-order the training LDA rows by node id (brand rows come first in train_lda).
train_brand_lda = [0] * TRAIN_BRAND_NUM
train_inf_lda = [0] * TRAIN_INF_NUM
for row, topic_vec in enumerate(train_lda):
    if row >= TRAIN_BRAND_NUM:   # influencer row
        account = train_inf_ids[row - TRAIN_BRAND_NUM]
        slot = train_inf_node_id[account] - TRAIN_BRAND_NUM
        train_inf_lda[slot] = topic_vec
    else:   # brand row
        account = train_brand_ids[row]
        slot = train_brand_node_id[account]
        train_brand_lda[slot] = topic_vec
train_brand_lda = np.array(train_brand_lda)
train_inf_lda = np.array(train_inf_lda)
len(train_brand_lda), len(train_inf_lda)

In [None]:
train_brand_lda.shape, train_inf_lda.shape

In [None]:
# Sanity check: print every training-brand slot that still holds the int placeholder.
# Uses TRAIN_BRAND_NUM (the size the table was allocated with) instead of a literal 286.
for i in range(TRAIN_BRAND_NUM):
    if isinstance(train_brand_lda[i], int):
        print(i)

In [None]:
# Sanity check: print every training-influencer slot that still holds the int placeholder.
# Uses TRAIN_INF_NUM (the size the table was allocated with) instead of a literal 3075.
for i in range(TRAIN_INF_NUM):
    if isinstance(train_inf_lda[i], int):
        print(i)

In [None]:
# Re-order the test LDA rows by node id (brand rows come first in test_lda).
test_brand_lda = [0] * TEST_BRAND_NUM
test_inf_lda = [0] * TEST_INF_NUM
for row, topic_vec in enumerate(test_lda):
    if row >= TEST_BRAND_NUM:   # influencer row
        account = test_inf_ids[row - TEST_BRAND_NUM]
        slot = test_inf_node_id[account] - TEST_BRAND_NUM
        test_inf_lda[slot] = topic_vec
    else:   # brand row
        account = test_brand_ids[row]
        slot = test_brand_node_id[account]
        test_brand_lda[slot] = topic_vec
test_brand_lda = np.array(test_brand_lda)
test_inf_lda = np.array(test_inf_lda)
len(test_brand_lda), len(test_inf_lda)

In [None]:
test_brand_lda.shape, test_inf_lda.shape

In [None]:
# Print every test-brand slot whose entry is still a plain int placeholder.
_unfilled = [i for i in range(TEST_BRAND_NUM) if type(test_brand_lda[i]) == int]
for i in _unfilled:
    print(i)

In [None]:
# Print every test-influencer slot whose entry is still a plain int placeholder.
_unfilled = [i for i in range(TEST_INF_NUM) if type(test_inf_lda[i]) == int]
for i in _unfilled:
    print(i)

In [None]:
# Persist the node-ordered LDA tables as torch tensors.
for _array, _target in (
    (train_brand_lda, 'train_test_split/train_brand_text_feature_lda.pt'),
    (train_inf_lda, 'train_test_split/train_inf_text_feature_lda.pt'),
    (test_brand_lda, 'train_test_split/test_brand_text_feature_lda.pt'),
    (test_inf_lda, 'train_test_split/test_inf_text_feature_lda.pt'),
):
    torch.save(torch.from_numpy(_array), _target)

## Image

### Upernet

In [None]:
def img_preprocess(img_feat, thres=0.1):
    """Pool per-post class scores into one softmax-normalised vector.

    Entries that are not strictly greater than ``thres`` are replaced by zero,
    the result is summed over dim 0 (the history-post axis in this notebook)
    and a softmax over the summed vector is returned.
    """
    kept = torch.where(img_feat > thres, img_feat, torch.zeros_like(img_feat))
    pooled = kept.sum(dim=0)
    return F.softmax(pooled, dim=0)

In [None]:
# Folders holding one '<name>.npy' file per brand / influencer (read by the cells below).
b_folder = 'image_vgg/unifiedparsing/new_brand_result/'
i_folder = 'image_vgg/unifiedparsing/new_inf_result/'

In [None]:
# Pre-process the UPerNet output of every brand, in all_brands order.
# The table is sized from all_brands (not a literal 360) so the loop below can
# never index past its end; this matches the influencer cell.
all_brand_upernet_pre = [0]*len(all_brands)
for i, a in enumerate(all_brands):
    a_img = np.load(b_folder+a+'.npy')
    a_img_pre = img_preprocess(torch.from_numpy(a_img))
    all_brand_upernet_pre[i] = a_img_pre

In [None]:
# Sanity check: print every brand slot that still holds the int placeholder.
# Iterates the whole table instead of assuming exactly 360 entries.
for i, entry in enumerate(all_brand_upernet_pre):
    if isinstance(entry, int):
        print(i)

In [None]:
# Pre-process the UPerNet output of every influencer, in all_infs order.
all_inf_upernet_pre = [
    img_preprocess(torch.from_numpy(np.load(i_folder+name+'.npy')))
    for name in all_infs
]

In [None]:
len(all_inf_upernet_pre)

In [None]:
# Print every influencer slot whose entry is still a plain int placeholder.
_unfilled = [i for i in range(len(all_infs)) if type(all_inf_upernet_pre[i]) == int]
for i in _unfilled:
    print(i)

In [None]:
# Persist the pre-processed UPerNet lists (brands first, then influencers).
for _obj, _target in (
    (all_brand_upernet_pre, 'train_test_split/all_brand_upernet_preprocessed.pt'),
    (all_inf_upernet_pre, 'train_test_split/all_inf_upernet_preprocessed.pt'),
):
    torch.save(_obj, _target)

In [None]:
# Distribute the brand UPerNet features into node-ordered train / test tables.
train_brand_upernet = [0]*TRAIN_BRAND_NUM
test_brand_upernet = [0]*TEST_BRAND_NUM
for brand_idx in range(len(all_brands)):
    for split_ids, node_of, table in (
        (train_brand_ids, train_brand_node_id, train_brand_upernet),
        (test_brand_ids, test_brand_node_id, test_brand_upernet),
    ):
        if brand_idx in split_ids:
            table[node_of[brand_idx]] = all_brand_upernet_pre[brand_idx]

In [None]:
# Sanity check: print the index of every slot that still holds the int placeholder.
# (Printing the placeholder itself would always show 0 and not identify the slot.)
for i, t in enumerate(train_brand_upernet):
    if isinstance(t, int):
        print(i)
for i, t in enumerate(test_brand_upernet):
    if isinstance(t, int):
        print(i)

In [None]:
# Distribute the influencer UPerNet features into node-ordered train / test tables.
train_inf_upernet = [0]*TRAIN_INF_NUM
test_inf_upernet = [0]*TEST_INF_NUM

for inf_idx in range(len(all_infs)):
    # Train and test influencers overlap, so both splits are checked
    # independently (an elif would skip the test split).
    for split_ids, node_of, brand_count, table in (
        (train_inf_ids, train_inf_node_id, TRAIN_BRAND_NUM, train_inf_upernet),
        (test_inf_ids, test_inf_node_id, TEST_BRAND_NUM, test_inf_upernet),
    ):
        if inf_idx not in split_ids:
            continue
        slot = node_of[inf_idx] - brand_count
        if slot < 0:   # report unexpected node ids that fall below the brand count
            print(inf_idx)
        table[slot] = all_inf_upernet_pre[inf_idx]


In [None]:
# Sanity check: print the index of every slot that still holds the int placeholder.
# The train loop now reports the index too, matching the test loop.
for i, t in enumerate(train_inf_upernet):
    if isinstance(t, int):
        print(i)
for i, t in enumerate(test_inf_upernet):
    if isinstance(t, int):
        print(i)

In [None]:
# Turn each list of per-node vectors into a single stacked tensor.
train_brand_upernet, test_brand_upernet, train_inf_upernet, test_inf_upernet = [
    torch.stack(_table)
    for _table in (train_brand_upernet, test_brand_upernet, train_inf_upernet, test_inf_upernet)
]

In [None]:
train_brand_upernet.shape, test_brand_upernet.shape, train_inf_upernet.shape, test_inf_upernet.shape

In [None]:
# Persist the stacked UPerNet feature tensors.
for _tensor, _target in (
    (train_brand_upernet, 'train_test_split/train_brand_upernet.pt'),
    (test_brand_upernet, 'train_test_split/test_brand_upernet.pt'),
    (train_inf_upernet, 'train_test_split/train_inf_upernet.pt'),
    (test_inf_upernet, 'train_test_split/test_inf_upernet.pt'),
):
    torch.save(_tensor, _target)

### ResNet

In [None]:
# load image features
# Brand features come as one array; the cells below index it by position.
brand_post_images_resnet101 = np.load("image_vgg/brand_post_images_resnet101.npy")

# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("image_vgg/inf_post_images_resnet101_gpu.pickle", 'rb') as f:
    inf_post_images_resnet101 = pickle.load(f) # dict

brand_post_images_resnet101.shape, len(inf_post_images_resnet101)

In [None]:
# Pre-process the ResNet output of every brand.
# Sized from all_brands (not a literal 360) because the split cell below
# indexes this table with range(len(all_brands)).
all_brand_resnet = [0]*len(all_brands)
for i in range(len(all_brands)):
    img_pre = img_preprocess(torch.from_numpy(brand_post_images_resnet101[i]))
    if img_pre.shape[0] != 1000:   # report vectors that are not 1000 entries long
        print(i)
    all_brand_resnet[i] = img_pre

In [None]:
# Sanity check: print the index of every slot that still holds the int placeholder
# (printing the placeholder itself would always show 0).
for i, t in enumerate(all_brand_resnet):
    if isinstance(t, int):
        print(i)

In [None]:
sorted(list(inf_post_images_resnet101.keys())) == list(range(len(all_infs)))

In [None]:
# Pre-process the ResNet output of every influencer, reporting unexpected shapes.
all_inf_resnet = [0]*len(all_infs)
for inf_idx in range(len(all_infs)):
    raw = inf_post_images_resnet101[inf_idx]
    if raw.shape != (50, 1000):
        print(inf_idx)
    pooled = img_preprocess(torch.from_numpy(inf_post_images_resnet101[inf_idx]))
    if pooled.shape[0] != 1000:
        print(inf_idx)
    all_inf_resnet[inf_idx] = pooled

In [None]:
# Persist the pre-processed ResNet lists (brands first, then influencers).
for _obj, _target in (
    (all_brand_resnet, 'train_test_split/all_brand_resnet_preprocessed.pt'),
    (all_inf_resnet, 'train_test_split/all_inf_resnet_preprocessed.pt'),
):
    torch.save(_obj, _target)

In [None]:
# Distribute the brand ResNet features into node-ordered train / test tables.
train_brand_resnet = [0]*TRAIN_BRAND_NUM
test_brand_resnet = [0]*TEST_BRAND_NUM
for brand_idx in range(len(all_brands)):
    for split_ids, node_of, table in (
        (train_brand_ids, train_brand_node_id, train_brand_resnet),
        (test_brand_ids, test_brand_node_id, test_brand_resnet),
    ):
        if brand_idx in split_ids:
            table[node_of[brand_idx]] = all_brand_resnet[brand_idx]

In [None]:
# Sanity check: print the index of every slot that still holds the int placeholder
# (printing the placeholder itself would always show 0).
for i, t in enumerate(train_brand_resnet):
    if isinstance(t, int):
        print(i)
for i, t in enumerate(test_brand_resnet):
    if isinstance(t, int):
        print(i)

In [None]:
# Distribute the influencer ResNet features into node-ordered train / test tables.
train_inf_resnet = [0]*TRAIN_INF_NUM
test_inf_resnet = [0]*TEST_INF_NUM

for inf_idx in range(len(all_infs)):
    # Both splits are checked independently; an influencer may belong to both.
    for split_ids, node_of, brand_count, table in (
        (train_inf_ids, train_inf_node_id, TRAIN_BRAND_NUM, train_inf_resnet),
        (test_inf_ids, test_inf_node_id, TEST_BRAND_NUM, test_inf_resnet),
    ):
        if inf_idx not in split_ids:
            continue
        slot = node_of[inf_idx] - brand_count
        if slot < 0:   # report unexpected node ids that fall below the brand count
            print(inf_idx)
        table[slot] = all_inf_resnet[inf_idx]


In [None]:
# Sanity check: print the index of every slot that still holds the int placeholder.
# The train loop now reports the index too, matching the test loop.
for i, t in enumerate(train_inf_resnet):
    if isinstance(t, int):
        print(i)
for i, t in enumerate(test_inf_resnet):
    if isinstance(t, int):
        print(i)

In [None]:
# Turn each list of per-node vectors into a single stacked tensor.
train_brand_resnet, test_brand_resnet, train_inf_resnet, test_inf_resnet = [
    torch.stack(_table)
    for _table in (train_brand_resnet, test_brand_resnet, train_inf_resnet, test_inf_resnet)
]

In [None]:
train_brand_resnet.shape, test_brand_resnet.shape, train_inf_resnet.shape, test_inf_resnet.shape

In [None]:
# Persist the stacked ResNet feature tensors.
for _tensor, _target in (
    (train_brand_resnet, 'train_test_split/train_brand_resnet.pt'),
    (test_brand_resnet, 'train_test_split/test_brand_resnet.pt'),
    (train_inf_resnet, 'train_test_split/train_inf_resnet.pt'),
    (test_inf_resnet, 'train_test_split/test_inf_resnet.pt'),
):
    torch.save(_tensor, _target)

### concat

In [None]:
# Concatenate ResNet and UPerNet features along dim 1 (ResNet columns first).
(train_brand_mor_image_feature,
 test_brand_mor_image_feature,
 train_inf_mor_image_feature,
 test_inf_mor_image_feature) = [
    torch.cat((_resnet, _upernet), dim=1)
    for _resnet, _upernet in (
        (train_brand_resnet, train_brand_upernet),
        (test_brand_resnet, test_brand_upernet),
        (train_inf_resnet, train_inf_upernet),
        (test_inf_resnet, test_inf_upernet),
    )
]

In [None]:
train_brand_mor_image_feature.shape, test_brand_mor_image_feature.shape

In [None]:
train_inf_mor_image_feature.shape, test_inf_mor_image_feature.shape

In [None]:
# Persist the concatenated image feature tensors.
for _tensor, _target in (
    (train_brand_mor_image_feature, 'train_test_split/train_brand_mor_image_feature.pt'),
    (train_inf_mor_image_feature, 'train_test_split/train_inf_mor_image_feature.pt'),
    (test_brand_mor_image_feature, 'train_test_split/test_brand_mor_image_feature.pt'),
    (test_inf_mor_image_feature, 'train_test_split/test_inf_mor_image_feature.pt'),
):
    torch.save(_tensor, _target)

# Load Data

In [None]:
# Reload the LDA text features saved by the pre-processing section.
(train_brand_text_feature,
 train_inf_text_feature,
 test_brand_text_feature,
 test_inf_text_feature) = [
    torch.load(_source)
    for _source in (
        'train_test_split/train_brand_text_feature_lda.pt',
        'train_test_split/train_inf_text_feature_lda.pt',
        'train_test_split/test_brand_text_feature_lda.pt',
        'train_test_split/test_inf_text_feature_lda.pt',
    )
]

In [None]:
# Reload the concatenated (ResNet + UPerNet) image features.
(train_brand_image_feature,
 train_inf_image_feature,
 test_brand_image_feature,
 test_inf_image_feature) = [
    torch.load(_source)
    for _source in (
        'train_test_split/train_brand_mor_image_feature.pt',
        'train_test_split/train_inf_mor_image_feature.pt',
        'train_test_split/test_brand_mor_image_feature.pt',
        'train_test_split/test_inf_mor_image_feature.pt',
    )
]

In [None]:
# Load the precomputed node embeddings for brands and influencers.
(train_brand_node_emb,
 test_brand_node_emb,
 train_inf_node_emb,
 test_inf_node_emb) = [
    torch.load(_source)
    for _source in (
        'train_test_split/train_brand_hashtag_weighted_nod2vec_0615_r1_sqrt.pt',
        'train_test_split/test_brand_hashtag_weighted_nod2vec_0615_r1_sqrt.pt',
        'train_test_split/train_inf_hashtag_weighted_nod2vec_0615_r1_sqrt.pt',
        'train_test_split/test_inf_hashtag_weighted_nod2vec_0615_r1_sqrt.pt',
    )
]

In [None]:
# train_label_triplet = np.load('train_test_split/train_label_triplet.npy')
# NOTE(review): the visible code never saves the `five_times` array built in the
# negative-sampling section; presumably this file is that array saved separately. Confirm.
train_label_triplet = np.load('train_test_split/train_label_triplet_hard13_five_samebrand.npy')
test_label_list = np.load('train_test_split/test_label_list.npy')

In [None]:
len(test_label_list), train_label_triplet.shape

# Triplet Training Dataset

In [None]:
class BrandInfluencerTripletDataset(Dataset):
    """Dataset with one item per row of ``label_triplet``.

    Column 0 of a row indexes the brand feature tables; the remaining columns
    index the influencer feature tables.
    """

    def __init__(self, brand_node_feat, brand_txt_feat, brand_img_feat,
                 inf_node_feat, inf_txt_feat, inf_img_feat, label_triplet):
        """Store the feature tables (all ordered by node id) and the triplet rows."""
        # brand-side tables
        self.brand_node_feat = brand_node_feat
        self.brand_txt_feat = brand_txt_feat
        self.brand_img_feat = brand_img_feat

        # influencer-side tables
        self.inf_node_feat = inf_node_feat
        self.inf_txt_feat = inf_txt_feat
        self.inf_img_feat = inf_img_feat

        # one sample per row; column 0 = brand index, columns 1.. = influencer indices
        self.label_triplet = label_triplet

    def __len__(self):
        """Number of triplet rows."""
        return self.label_triplet.shape[0]

    def __getitem__(self, index):
        """Return brand features as ``(1, dim)`` rows plus the indexed influencer features.

        The tuple is ``(brand_text, brand_image, brand_node, inf_text,
        inf_image, inf_node, label_triplet, brand_node_id)``.
        """
        row = self.label_triplet[index]
        brand_node_id = row[0]
        # presumably influencer ids relative to the first influencer node
        # (node id minus the brand count); verify against the triplet builder
        pos_neg_inf_ids = row[1:]

        # brand side: each table row is reshaped into a (1, feature_dim) tensor
        brand_text_feature = self.brand_txt_feat[brand_node_id].reshape(1, -1)
        brand_image_feature = self.brand_img_feat[brand_node_id].reshape(1, -1)
        brand_node_feature = self.brand_node_feat[brand_node_id].reshape(1, -1)

        # influencer side: only the influencers referenced by this row
        inf_text_feature = self.inf_txt_feat[pos_neg_inf_ids]
        inf_image_feature = self.inf_img_feat[pos_neg_inf_ids]
        inf_node_feature = self.inf_node_feat[pos_neg_inf_ids]

        # NOTE(review): the whole label_triplet array is returned with every
        # item, so a DataLoader will collate one copy of it per sample in the
        # batch. Consider returning only `row` if no caller needs the full array.
        return (brand_text_feature, brand_image_feature, brand_node_feature,
                inf_text_feature, inf_image_feature, inf_node_feature,
                self.label_triplet, brand_node_id)
            # , brand_node_id, self.brand_num, self.inf_num


In [None]:
# fix random seed
def same_seeds(seed):
    """Seed every RNG this notebook uses and force deterministic cuDNN.

    Seeds Python's ``random``, NumPy and torch (CPU and, when available, all
    CUDA devices), then disables cuDNN autotuning and enables cuDNN
    deterministic mode.

    Args:
        seed: integer seed applied to all generators.
    """
    # Python's own RNG was previously left unseeded although the notebook
    # imports and seeds `random` elsewhere (see seed_worker).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
def seed_worker(worker_id):
    """Seed NumPy's and Python's RNGs from torch's initial seed (mod 2**32).

    ``worker_id`` is accepted but not used.
    """
    derived_seed = torch.initial_seed() % (1 << 32)
    for seeder in (np.random.seed, random.seed):
        seeder(derived_seed)

In [None]:
# Build the training dataset. Argument order is node / text / image features,
# first for brands and then for influencers, followed by the triplet rows.
train_dataset = BrandInfluencerTripletDataset(train_brand_node_emb, train_brand_text_feature, train_brand_image_feature,\
                                      train_inf_node_emb, train_inf_text_feature, train_inf_image_feature,\
                                       train_label_triplet)


In [None]:
len(train_dataset)

In [None]:
# Settings passed to the training DataLoader below.
train_batch_size = 32
train_num_workers = 0
train_shuffle = True

In [None]:
seed = 24
same_seeds(seed)

# `g` is created and seeded but is not passed to the DataLoader below.
g = torch.Generator()
g.manual_seed(seed)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=train_shuffle,
    num_workers=train_num_workers,
    pin_memory=True,
)
# Alternative kept for reference (per-worker seeding with the generator above):
#                               num_workers=train_num_workers, worker_init_fn=seed_worker, generator=g, pin_memory=True)


# Test Dataset

In [None]:
class BrandInfluencerDataset(Dataset):
    """Dataset with one item per entry of ``label_list``.

    Every item pairs one brand's features with the complete influencer
    feature tables.
    """

    def __init__(self, brand_node_feat, brand_txt_feat, brand_img_feat,
                 inf_node_feat, inf_txt_feat, inf_img_feat, label_list):
        """Store the feature tables (all ordered by node id) and the labels."""
        # brand-side tables
        self.brand_node_feat = brand_node_feat
        self.brand_txt_feat = brand_txt_feat
        self.brand_img_feat = brand_img_feat

        # influencer-side tables
        self.inf_node_feat = inf_node_feat
        self.inf_txt_feat = inf_txt_feat
        self.inf_img_feat = inf_img_feat

        # presumably rows are brands and columns are influencers with 1/0 values;
        # verify against the file that produces label_list
        self.label_list = label_list

    def __len__(self):
        """Number of entries in ``label_list``."""
        return len(self.label_list)

    def __getitem__(self, index):
        """Return brand ``index`` as ``(1, dim)`` rows plus all influencer tables.

        The tuple is ``(brand_text, brand_image, brand_node, inf_text,
        inf_image, inf_node, label_list, brand_node_id)``; ``brand_node_id``
        equals ``index``.
        """
        brand_node_id = index

        # brand side: each table row is reshaped into a (1, feature_dim) tensor
        brand_text_feature = self.brand_txt_feat[index].reshape(1, -1)
        brand_image_feature = self.brand_img_feat[index].reshape(1, -1)
        brand_node_feature = self.brand_node_feat[index].reshape(1, -1)

        # NOTE(review): the complete influencer tables and the whole label_list
        # are returned with every item, so a DataLoader collates one copy of
        # them per sample in the batch.
        return (brand_text_feature, brand_image_feature, brand_node_feature,
                self.inf_txt_feat, self.inf_img_feat, self.inf_node_feat,
                self.label_list, brand_node_id)
            # , brand_node_id, self.brand_num, self.inf_num


In [None]:
# Build the test dataset. Argument order is node / text / image features,
# first for brands and then for influencers, followed by the label list.
test_dataset = BrandInfluencerDataset(test_brand_node_emb, test_brand_text_feature, test_brand_image_feature,\
                                      test_inf_node_emb, test_inf_text_feature, test_inf_image_feature,\
                                      test_label_list)

In [None]:
# Settings passed to the test DataLoader below.
test_batch_size = 32
test_num_workers = 0

In [None]:
same_seeds(24)
# if __name__ == '__main__':
# Test batches are read in dataset order (shuffle=False).
# NOTE(review): the visible training cell evaluates on the full test tensors
# directly and does not iterate this loader.
test_dataloader = DataLoader(test_dataset, batch_size=test_batch_size, shuffle=False, 
                             num_workers=test_num_workers, pin_memory=True)

# Model

In [None]:
def init_weights(module):
    """Draw ``nn.Linear`` weights and biases from a normal with std 0.1.

    Modules that are not ``nn.Linear`` are left untouched; a linear layer
    without a bias only has its weight initialised.
    """
    if not isinstance(module, nn.Linear):
        return
    init.normal_(module.weight, std=0.1)
    if module.bias is None:
        return
    init.normal_(module.bias, std=0.1)
# self.apply(init_weights)

## Text

In [None]:
# Layer sizes used as the default arguments of TextEncoder.
text_input_size = 19
text_layer1_size = 128
text_layer2_size = 512

In [None]:
class TextEncoder(nn.Module):
    """Two linear layers with dropout and LeakyReLU in between, for text features."""

    def __init__(self, input_size=text_input_size, text_layer1_size=text_layer1_size, text_layer2_size=text_layer2_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, text_layer1_size)
        # dropout_prob is a module-level hyperparameter read when the encoder is built
        self.dropout1 = nn.Dropout(dropout_prob)
        self.relu1 = nn.LeakyReLU()
        self.linear2 = nn.Linear(text_layer1_size, text_layer2_size)

        # normal(std=0.1) initialisation for every linear layer
        self.apply(init_weights)

    def forward(self, x):
        """Encode ``x`` and drop every size-1 dimension of the result."""
        # match the input dtype to the layer weights
        hidden = self.linear1(x.to(self.linear1.weight.dtype))
        # dropout is applied before the activation
        hidden = self.relu1(self.dropout1(hidden))
        encoded = self.linear2(hidden)
        # NOTE(review): squeeze() without a dim also removes a batch axis of
        # size 1, so a batch holding a single sample loses its batch dimension.
        return encoded.squeeze()

## Image

In [None]:
# Layer sizes used as the default arguments of ImageEncoder.
image_input_size = 1365
image_layer1_size = 1024
image_layer2_size = 1024
image_layer3_size = 1024
image_layer4_size = 512

In [None]:
class ImageEncoder(nn.Module):
    """Four linear layers with LeakyReLU activations, for image features.

    Dropout is applied after the first linear layer only.
    """

    def __init__(self, input_size=image_input_size, image_layer1_size=image_layer1_size, image_layer2_size=image_layer2_size, image_layer3_size=image_layer3_size, image_layer4_size=image_layer4_size):
        super().__init__()
        self.linear1 = nn.Linear(input_size, image_layer1_size)
        # dropout_prob is a module-level hyperparameter read when the encoder is built
        self.dropout1 = nn.Dropout(dropout_prob)
        self.relu1 = nn.LeakyReLU()
        self.linear2 = nn.Linear(image_layer1_size, image_layer2_size)
        self.relu2 = nn.LeakyReLU()
        self.linear3 = nn.Linear(image_layer2_size, image_layer3_size)
        self.relu3 = nn.LeakyReLU()
        self.linear4 = nn.Linear(image_layer3_size, image_layer4_size)

        # normal(std=0.1) initialisation for every linear layer
        self.apply(init_weights)

    def forward(self, x):
        """Encode ``x`` and drop every size-1 dimension of the result."""
        # match the input dtype to the layer weights
        hidden = self.linear1(x.to(self.linear1.weight.dtype))
        # dropout is applied before the activation
        hidden = self.relu1(self.dropout1(hidden))
        hidden = self.relu2(self.linear2(hidden))
        hidden = self.relu3(self.linear3(hidden))
        encoded = self.linear4(hidden)
        # NOTE(review): squeeze() without a dim also removes a batch axis of
        # size 1, so a batch holding a single sample loses its batch dimension.
        return encoded.squeeze()

## Node (Attribute)

In [None]:
# Layer sizes used as the default arguments of NodeEncoder.
node_input_size = 128
node_layer1_size = 1024
node_layer2_size = 512

In [None]:
class NodeEncoder(nn.Module):
    """Two linear layers with a LeakyReLU in between, for node embeddings (no dropout)."""

    def __init__(self, input_size=node_input_size, layer1_size=node_layer1_size, layer2_size=node_layer2_size):
        super().__init__()

        self.linear1 = nn.Linear(input_size, layer1_size)
        self.relu1 = nn.LeakyReLU()
        self.linear2 = nn.Linear(layer1_size, layer2_size)

        # normal(std=0.1) initialisation for every linear layer
        self.apply(init_weights)

    def forward(self, x):
        """Encode ``x`` and drop every size-1 dimension of the result."""
        # match the input dtype to the layer weights
        hidden = self.relu1(self.linear1(x.to(self.linear1.weight.dtype)))
        encoded = self.linear2(hidden)
        # NOTE(review): squeeze() without a dim also removes a batch axis of
        # size 1, so a batch holding a single sample loses its batch dimension.
        return encoded.squeeze()

## Influencer Ranker

In [None]:
class InfluencerRanker(nn.Module):
    """Score influencers for brands from text, image and node features.

    Brands and influencers share the same three encoders. Text and image
    embeddings are multiplied element-wise into a "content" embedding; node
    embeddings are scored separately.
    """

    def __init__(self):
        super(InfluencerRanker, self).__init__()

        self.text_encoder = TextEncoder()
        self.image_encoder = ImageEncoder()
        self.node_encoder = NodeEncoder()

    def forward(self, brand_text_feature, brand_image_feature, brand_node_feature,\
                inf_text_feature, inf_image_feature, inf_node_feature, is_train=True):
        """Return ``(scores_content, scores_node)``.

        Args:
            brand_*_feature: brand text / image / node inputs.
            inf_*_feature: influencer text / image / node inputs.
            is_train: when True, each brand is scored only against its own
                influencers (brand embedding unsqueezed at dim 1, product
                summed over dim 2). When False, every brand is scored against
                every influencer with a matrix product.

        Returns:
            Two score tensors of identical layout: one from the content
            embeddings and one from the node embeddings.
        """
        # text embeddings (the encoder squeezes singleton dimensions)
        brand_text_emb = self.text_encoder(brand_text_feature)
        inf_text_emb = self.text_encoder(inf_text_feature)

        # image embeddings
        brand_image_emb = self.image_encoder(brand_image_feature)
        inf_image_emb = self.image_encoder(inf_image_feature)

        # node embeddings
        brand_node_emb = self.node_encoder(brand_node_feature)
        inf_node_emb = self.node_encoder(inf_node_feature)

        # fuse text and image by element-wise multiplication
        brand_content = torch.mul(brand_text_emb, brand_image_emb)
        inf_content = torch.mul(inf_text_emb, inf_image_emb)

        if is_train:
            # Dot product per (brand, influencer) pair. The training loop reads
            # column 0 as the positive score and column 1 as the negative score.
            scores_content = torch.sum(brand_content.unsqueeze(1) * inf_content, 2)
            scores_node = torch.sum(brand_node_emb.unsqueeze(1) * inf_node_emb, 2)
        else:
            # all brands against all influencers
            scores_content = torch.matmul(brand_content, inf_content.transpose(0, 1))
            scores_node = torch.matmul(brand_node_emb, inf_node_emb.transpose(0, 1))

        return scores_content, scores_node


# Loss

In [None]:
# Defaults for triplet_ranking_loss_fixed; the training loop passes its own margins.
margin = 4
valid_margin = 1e-16

In [None]:
def triplet_ranking_loss_fixed(all_positive_scores, all_negative_scores, margin=margin, valid_margin=valid_margin, times=None):
    """Summed hinge loss ``sum(max(neg - pos + margin, 0))`` over all pairs.

    Args:
        all_positive_scores: 1-D tensor with the score of each positive.
        all_negative_scores: 1-D tensor with the score of the paired negative.
        margin: required gap between positive and negative scores.
        valid_margin: accepted for backward compatibility; unused because the
            loss is summed, not averaged over valid triplets.
        times: accepted for backward compatibility; unused.

    Returns:
        0-dim float32 tensor holding the summed loss.
    """
    # Build the margin row on the scores' own device rather than on a
    # module-level `device`, so the loss works wherever its inputs live.
    margin_row = torch.full(
        (1, all_negative_scores.shape[0]),
        float(margin),
        device=all_negative_scores.device,
    )

    hinge = torch.clamp(all_negative_scores - all_positive_scores + margin_row, min=0)

    # The previous implementation also counted "valid" triplets and computed an
    # average, but discarded both and returned the sum; only the sum is kept.
    return torch.sum(hinge.float())


In [None]:
def triplet_cross_entropy(all_positive_scores, all_negative_scores, times=None):
    """Return the mean negative log of ``all_positive_scores``.

    ``all_negative_scores`` and ``times`` are accepted but not used.
    """
    log_positive = torch.log(all_positive_scores)
    return -log_positive.mean()


# Training

In [None]:
# check device
def get_device():
    """Return ``'cuda'`` when torch reports CUDA as available, otherwise ``'cpu'``."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

In [None]:
# fix random seed for reproducibility
# same_seeds(24)

# get device 
# NOTE(review): `device` is also read by cells higher up in the notebook (the
# negative-sampling section), so this cell has to be executed before them.
device = get_device()
# device = 'cuda:1'
print(f'DEVICE: {device}')

In [None]:
# Training hyperparameters.
num_epochs = 100
lr = 0.002
# weight_decayx = 0.001
# dropout_prob is read by TextEncoder / ImageEncoder when they are constructed,
# so it must be set before the model is built.
dropout_prob = 0.5
# weight of the L1 penalty added to the loss in the training loop
l1_lambda = 0.001

In [None]:
# Define the model and loss function
# The ranker is built with its default encoders and moved to `device`.
model = InfluencerRanker()
model = model.to(device)

In [None]:
# Ranking loss used for both the content scores and the node scores.
criterion = triplet_ranking_loss_fixed

In [None]:
# Adam over all model parameters with the learning rate set above.
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Multiply the learning rate by 0.95 on every scheduler step.
scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.95)

In [None]:
# Tensor copy of the training triplets on `device`.
# NOTE(review): not referenced by the visible training loop.
train_triplet_label = torch.from_numpy(train_label_triplet).to(device)

In [None]:
# List the parameters whose name contains 'weight'; the training loop applies
# its L1 penalty to exactly these.
for name, param in model.named_parameters():
    if 'weight' not in name:
        continue
    print(name)

## train

In [None]:
# Base name of the files written under models/; `save` toggles all file output.
# NOTE(review): the training loop slices model_name[:-2], which for "bamir"
# yields "bam". Confirm the intended naming scheme.
model_name = "bamir"
save = True

In [None]:
# Put the model in training mode (the training loop does this again every epoch).
model.train()

In [None]:
# Also evaluate on the test set after every epoch and keep the results.
# All test inputs are moved to `device` once, up front.
test_fixed_label_list = torch.from_numpy(test_label_list).to(device)
test_inf_text_feature = test_inf_text_feature.to(device)
test_inf_image_feature = test_inf_image_feature.to(device)
test_inf_node_feature = test_inf_node_emb.to(device)
test_brand_text_feature = test_brand_text_feature.to(device)
test_brand_image_feature = test_brand_image_feature.to(device)
test_brand_node_feature = test_brand_node_emb.to(device)

In [None]:
# Per-epoch histories of the values returned by cal_metrics.
auc_list = []
rec10_list = []
rec50_list = []
mrr_list = []
map_list = []
medr_list = []

In [None]:
# Train the model
# Each epoch: one pass over train_dataloader, one scheduler step, one full
# evaluation on the test tensors, and (when `save` is set) one checkpoint.
train_loss_values = []
all_all_scores = []
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    # NOTE(review): [:-2] drops the last two characters of model_name; with the
    # value "bamir" set above this gives "bam100". Looks like a leftover from a
    # name that ended in a two-character suffix. Confirm the intended name.
    if epoch == 50: # save 100 epochs as another file
        model_name = model_name[:-2]+'100'
    pbar = tqdm(enumerate(train_dataloader), total=len(train_dataloader))
    for batch_id, batch in pbar:

        # the last two dataset outputs (label array, brand node id) are not used here
        brand_text_feature, brand_image_feature, brand_node_feature,\
            inf_text_feature, inf_image_feature, inf_node_feature, _, _ = batch

        optimizer.zero_grad()

        # Move input data to GPU
        brand_text_feature = brand_text_feature.to(device)
        brand_image_feature = brand_image_feature.to(device)
        brand_node_feature = brand_node_feature.to(device)
        inf_text_feature = inf_text_feature.to(device)
        inf_image_feature = inf_image_feature.to(device)
        inf_node_feature = inf_node_feature.to(device)

        scores_content, scores_node = model(brand_text_feature, brand_image_feature, brand_node_feature,\
                       inf_text_feature, inf_image_feature, inf_node_feature)
        
        # column 0 is passed as the positive score, column 1 as the negative score
        batch_loss_content = criterion(scores_content[:, 0], scores_content[:, 1], margin=4, times=None)
        batch_loss_node = criterion(scores_node[:, 0], scores_node[:, 1], margin=2, times=None)

        # global cross entropy loss
        # softmax over the summed scores; epsilon keeps the later log away from log(0)
        global_score = torch.softmax((scores_content + scores_node), dim=1)
        epsilon = 1e-16
        global_score = global_score + epsilon
        
        # even columns are passed as positives, odd columns as negatives
        global_loss = triplet_cross_entropy(global_score[:, 0::2], global_score[:, 1::2], times=None)

        # define batch loss
        batch_loss = batch_loss_content + batch_loss_node + global_loss

        # L1 regularization
        # summed over every parameter whose name contains 'weight' (biases excluded)
        l1_reg = torch.tensor(0., requires_grad=True)
        for name, param in model.named_parameters():
            if 'weight' in name:
                l1_reg = l1_reg + torch.norm(param, p=1)
        batch_loss += l1_lambda * l1_reg

        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

        description = f'Epoch {epoch+1}/{num_epochs}, Batch {batch_id+1}/{len(train_dataloader)}, Gloabl Loss: {batch_loss:.4f}, Separate Loss: {batch_loss_content:.4f}, {batch_loss_node:.4f}, {global_loss:.4f} L1: {l1_lambda * l1_reg}'
        pbar.set_description(description)

    scheduler.step()
    train_loss = running_loss / len(train_dataloader)    # average loss of this epoch (/ number of batch)
    train_loss_values.append(train_loss)
    torch.cuda.empty_cache()


    ################ Test the model
    model.eval()
    with torch.no_grad():
        # is_train=False scores every test brand against every test influencer
        scores_content, scores_node = model(test_brand_text_feature, test_brand_image_feature, test_brand_node_feature,\
                           test_inf_text_feature, test_inf_image_feature, test_inf_node_feature, is_train=False)
        all_scores = scores_content + scores_node
        all_all_scores.append(all_scores)
        print(all_scores.shape)
    # NOTE(review): cal_metrics is defined further down in the notebook, so its
    # cell must have been executed before this loop runs.
    auc, rec10, rec50, mrr, map_, medr = cal_metrics(test_fixed_label_list.cpu(), all_scores.cpu())
    auc_list.append(auc)
    rec10_list.append(rec10)
    rec50_list.append(rec50)
    mrr_list.append(mrr)
    map_list.append(map_)
    medr_list.append(medr)

    print(f"***** Epoch {epoch+1}: Train Loss={train_loss:.4f}, lr={scheduler.get_last_lr()[0]} *****")
    # NOTE(review): the checkpoint embeds all_all_scores, which gains one score
    # tensor per epoch, so the checkpoint file grows every epoch.
    checkpoint = {
            'epoch': epoch+1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'losses': train_loss_values,
            'scheduler_state_dict': scheduler.state_dict(),
            'all_all_scores': all_all_scores
        }
    if save:
        # the same file is overwritten every epoch
        torch.save(checkpoint, 'models/'+model_name+'.pth')
        # extra snapshot at epoch 40; uses the same [:-2] slice as above
        if (epoch+1) == 40:
            torch.save(checkpoint, 'models/'+model_name[:-2]+'40.pth')

# one stacked tensor holding the test scores of every epoch
all_all_scores = torch.stack(all_all_scores)
if save:
    with open('models/'+model_name+"_architecture.txt", "w") as file:
        print(model, file=file)
    torch.save(all_all_scores, 'models/'+model_name+"predict.pt")
# Create a plot
plt.plot(train_loss_values)
plt.xlabel('Epochs')
plt.ylabel('Training Loss')
plt.title('Training Loss over Epochs')

# Show or save the plot
if save:
    plt.savefig('models/'+model_name+'.png')
plt.show()  # Show the plot


# Testing

## Evaluation Metrics

In [None]:
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

In [None]:
# Pairwise evaluation table; it is passed to cal_auc as `auc_df` further down.
test_auc_df = pd.read_csv('train_test_split/test_auc.csv')

In [None]:
# Define a function to calculate Recall@k
def recall_at_k(scores, labels, k):
    """
    Calculate the mean Recall@k over all rows that have at least one positive label.

    For every row, the k highest-scored columns are selected and the fraction
    of that row's positive labels found among them is computed; the result is
    the mean of these per-row fractions.

    Args:
        scores (torch.Tensor or array-like): 2D, shape (num_samples, num_classes),
            predicted scores (higher means more relevant).
        labels (torch.Tensor or array-like): 2D, same shape as ``scores``,
            binary labels (non-zero / True marks a positive).
        k (int): Number of top-scored columns considered per row. Values larger
            than the number of columns simply use every column.

    Returns:
        torch.Tensor: 0-dim tensor holding the mean Recall@k (callers use
        ``.item()``). Rows without any positive label are excluded from the
        mean, because their recall (0 / 0) is undefined; the result is NaN
        only when no row has a positive label.
    """
    # Accept NumPy arrays as well as tensors; tensors pass through untouched.
    scores = torch.as_tensor(scores)
    labels = torch.as_tensor(labels)

    # Column indices of the k best-scored entries of every row.
    top_k_indices = torch.argsort(scores, dim=1, descending=True)[:, :k]
    top_k_indices = top_k_indices.to(labels.device)

    # Positives recovered within the top k, and positives available, per row.
    true_positives = torch.gather(labels, 1, top_k_indices).sum(dim=1)
    total_positives = labels.sum(dim=1)

    # Skip rows with no positives so a single 0 / 0 cannot turn the mean into NaN.
    has_positive = total_positives > 0
    return torch.mean(true_positives[has_positive] / total_positives[has_positive])

In [None]:
def cal_metrics(label_list, scores): # input should be cpu
    """
    Compute and display ranking metrics for a score matrix.

    Both arguments are indexed by row and need ``.shape`` and ``.ravel()``;
    the callers in this notebook pass CPU tensors (``.cpu()``).

    Args:
        label_list: 2D binary labels, one row per sample.
        scores: 2D predicted scores, same layout as ``label_list``.

    Returns:
        tuple: ``(auc, rec10, rec50, mrr, map_, medr)`` where ``auc`` is
        computed over the flattened inputs, ``rec10`` / ``rec50`` are
        Recall@10 / Recall@50, ``mrr`` is the mean reciprocal rank, ``map_``
        the mean of the per-row average precision and ``medr`` the median rank.
        A one-row DataFrame with the same values is displayed as a side effect.
    """
    num_rows = label_list.shape[0]

    # AUC over every (row, column) entry at once.
    auc = roc_auc_score(label_list.ravel(), scores.ravel())

    # Recall@k for the two cut-offs reported; converted to Python floats once.
    recall_10 = recall_at_k(scores, label_list, 10).item()
    recall_50 = recall_at_k(scores, label_list, 50).item()

    # Rank (1-based) of the first positive column in each row's descending order.
    # NOTE: np.argmax returns 0 when a row has no positive, so such a row gets rank 1.
    order = np.argsort(-scores, axis=1)
    first_hit_ranks = []
    for row in range(num_rows):
        positive_cols = np.where(label_list[row] == 1)
        hit_mask = np.isin(order[row], positive_cols)
        first_hit_ranks.append(np.argmax(hit_mask)+1)
    ranks = np.array(first_hit_ranks)

    # MRR (Mean Reciprocal Rank) and MedR (Median Rank) share the same ranks.
    mrr = np.mean(1 / ranks)

    # MAP (Mean Average Precision): average precision per row, then the mean.
    per_row_ap = []
    for row in range(num_rows):
        per_row_ap.append(average_precision_score(label_list[row], scores[row]))
    map_ = np.mean(per_row_ap)

    medr = np.median(ranks)

    summary = {
        'AUC': [auc],
        'R@10': [recall_10],
        'R@50': [recall_50],
        'MRR': [mrr],
        'MAP': [map_],
        'MedR': [medr],
    }
    display(pd.DataFrame(summary))

    return auc, recall_10, recall_50, mrr, map_, medr


In [None]:
def cal_auc(score, brand_node_id, inf_node_id, auc_df):
    """
    Compute pairwise AUC and cAUC from a brand-by-influencer score matrix.

    Every row of ``auc_df`` is one comparison: the pair counts as correct when
    the score of (``b1``, ``i1``) is strictly greater than the score of
    (``b1``, ``i2``). cAUC is the same ratio restricted to rows where
    ``c1 == c2`` (presumably category labels; verify against the pair file).

    NOTE(review): both scores are read from the row of brand ``b1``; column
    ``b2`` is not consulted. Confirm that ``b1 == b2`` holds in the pair file.

    Args:
        score: 2D matrix with ``.shape``, indexed as ``score[brand_row][inf_col]``.
        brand_node_id: Mapping from brand ID to the brand's row index in ``score``.
        inf_node_id: Mapping from influencer ID to node ID; the column index is
            the node ID minus ``score.shape[0]``.
        auc_df: DataFrame with columns ``b1``, ``i1``, ``i2``, ``c1``, ``c2``.
            Rows are read in positional order, so any index is accepted.

    Returns:
        tuple[float, float]: ``(AUC, cAUC)``. A ratio is ``nan`` when it has
        no pairs to average over (empty table, or no row with ``c1 == c2``).
    """
    brand_num = score.shape[0]

    total_pairs = 0
    correct_pairs = 0
    same_cat_pairs = 0
    same_cat_correct = 0
    zero_score_pairs = 0  # pairs where either score is exactly 0.0

    # Iterate the needed columns in lockstep instead of label-indexing per row.
    columns = (auc_df[name] for name in ('b1', 'i1', 'i2', 'c1', 'c2'))
    for b1, i1, i2, c1, c2 in zip(*columns):
        brand_row = score[brand_node_id[b1]]  # brand_to_node[brand_id]
        score1 = brand_row[inf_node_id[i1] - brand_num]
        score2 = brand_row[inf_node_id[i2] - brand_num]
        same_category = (c1 == c2)

        total_pairs += 1
        if score1 == 0.0 or score2 == 0.0:
            zero_score_pairs += 1
        if same_category:
            same_cat_pairs += 1
        if score1 > score2:
            correct_pairs += 1
            if same_category:
                same_cat_correct += 1

    # Undefined ratios become NaN instead of raising ZeroDivisionError.
    auc = correct_pairs / total_pairs if total_pairs else float('nan')
    cauc = same_cat_correct / same_cat_pairs if same_cat_pairs else float('nan')

    print('AUC:', auc)
    print('cAUC:', cauc)
    print(zero_score_pairs)
    # Counts are printed as floats to keep the existing log format.
    print(float(total_pairs), float(same_cat_pairs))
    return auc, cauc

## test

In [None]:
# Load the fixed feature
# Move the test labels and the influencer-side features onto `device`; the
# evaluation loop below passes these same influencer tensors to every model call.
test_fixed_label_list = torch.from_numpy(test_label_list).to(device)
test_inf_text_feature = test_inf_text_feature.to(device)
test_inf_image_feature = test_inf_image_feature.to(device)
# The node feature is taken from `test_inf_node_emb` (note the different source name).
test_inf_node_feature = test_inf_node_emb.to(device)

In [None]:
# Evaluate the model: score every test brand against the fixed influencer
# features and assemble one score row per brand.
model.eval()
with torch.no_grad():
    # One slot per test brand, filled by brand node ID so the final row order
    # does not depend on batch order.
    # Assumes brand node IDs are exactly 0..len(test_dataset)-1; TODO confirm.
    all_scores = [0 for _ in range(len(test_dataset))]
    all_labels = []  # not filled in this cell
    for (brand_text_feature, brand_image_feature, brand_node_feature,
         _, _, _, _, brand_node_id) in test_dataloader:

        brand_text_feature = brand_text_feature.to(device)
        brand_image_feature = brand_image_feature.to(device)
        brand_node_feature = brand_node_feature.to(device)
        brand_node_id = brand_node_id.to(device)

        # Labels of the brands in this batch (not used further in this cell).
        labels = test_fixed_label_list[brand_node_id]
        scores_content, scores_node = model(
            brand_text_feature, brand_image_feature, brand_node_feature,
            test_inf_text_feature, test_inf_image_feature, test_inf_node_feature,
            is_train=False)

        # The final score is the sum of the content and node scores; a distinct
        # index name avoids shadowing any outer loop variable.
        for row in range(brand_node_id.shape[0]):
            bid = brand_node_id[row].item()
            all_scores[bid] = scores_content[row] + scores_node[row]

    all_scores = torch.stack(all_scores, dim=0)
    print(all_scores.shape)
    # Influencer column indices per brand, best score first.
    ranked_influencers = torch.argsort(all_scores, dim=1, descending=True)

In [None]:
# Ranking metrics on CPU copies of the labels and scores (cal_metrics expects CPU inputs).
cal_metrics(test_fixed_label_list.cpu(), all_scores.cpu())

In [None]:
# Pairwise AUC / cAUC over the pairs listed in test_auc_df.
cal_auc(all_scores, test_brand_node_id, test_inf_node_id, test_auc_df)

## Load from checkpoint

In [None]:
# Base name (no directory, no extension) of the checkpoint to load.
model_name = 'bamir'

In [None]:
# Checkpoint location: models/<model_name>.pth
model_path = 'models/'+ model_name +'.pth'

In [None]:
# Load the saved checkpoint. map_location remaps the stored tensors onto
# `device`, so a checkpoint written on a GPU also loads on a machine where
# that GPU is not available.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(model_path, map_location=device)

In [None]:
# Build a new InfluencerRanker with its default constructor arguments and place it on `device`.
model = InfluencerRanker().to(device)

In [None]:
# Restore the weights stored under the checkpoint's 'model_state_dict' key.
model.load_state_dict(checkpoint['model_state_dict'])