<a href="https://colab.research.google.com/github/leonswl/ntu-msds-ai6103/blob/daeun/Sequential.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
def gen_seq(corpus, train_size, test_size, window_size, word_id_map, row_tfidf, col_tfidf, weight_tfidf, vocab):
    """Build the sliding-window word graph weighted by products of TF-IDF scores.

    Each document is split on whitespace and cut into sliding windows of
    ``window_size`` tokens; a document no longer than ``window_size`` forms a
    single window. Within a window every ordered pair of distinct words
    yields one edge whose weight is the product of the two words'
    window-level TF-IDF scores. The caller-supplied edges are appended after
    the word-word edges and everything is packed into one sparse matrix.

    Args:
        corpus: iterable of documents, each a whitespace-separated string.
        train_size: offset added to a word id to obtain its node index
            (presumably word nodes follow the training documents -- confirm
            against the caller).
        test_size: only used to size the adjacency matrix.
        window_size: number of tokens per sliding window.
        word_id_map: mapping from word to integer id. A word missing from
            the map raises ``KeyError``.
        row_tfidf, col_tfidf, weight_tfidf: row indices, column indices and
            weights of extra edges appended after the word-word edges.
        vocab: only its length is used, to size the adjacency matrix.

    Returns:
        A 4-tuple ``(placeholder, adj, row, col)``: an empty list in the
        first position, the ``scipy.sparse.csr_matrix`` adjacency of shape
        ``(train_size + len(vocab) + test_size,) * 2``, and the row / column
        index lists including the appended edges.
    """
    t = time.time()
    vocab_size = len(vocab)
    print("Generating sequential graph...")
    print("windows generating...")
    windows = []
    for doc_words in corpus:
        words = doc_words.split()
        if len(words) <= window_size:
            windows.append(words)
        else:
            windows.extend(
                words[start: start + window_size]
                for start in range(len(words) - window_size + 1)
            )

    print("Calculating word frequency...")
    # Number of windows each word occurs in (a word counts once per window).
    word_freq = Counter()
    for window in tqdm(windows):
        word_freq.update(set(window))

    print("Calculating TF-IDF scores...")
    # The frequencies above are counted per window, so the population size
    # must be the window count as well. Dividing the document count by a
    # window frequency makes the IDF of common words negative, which in turn
    # flips the sign of edge weights.
    num_windows = len(windows)
    # Adding 1 to the frequency avoids division by zero.
    idf = {word: log(num_windows / (freq + 1)) for word, freq in word_freq.items()}

    print("Creating TF-IDF graph...")
    row, col, weight = [], [], []
    for window in windows:
        window_len = len(window)
        # Score each distinct word once per window instead of calling
        # window.count() again for every pair. Counter keeps first-occurrence
        # order, so the edge order is deterministic across runs.
        scored = [
            (train_size + word_id_map[word], idf[word] * count / window_len)
            for word, count in Counter(window).items()
        ]
        for pos_i, (node_i, tf_idf_i) in enumerate(scored):
            for pos_j, (node_j, tf_idf_j) in enumerate(scored):
                if pos_i == pos_j:
                    continue
                row.append(node_i)
                col.append(node_j)
                weight.append(tf_idf_i * tf_idf_j)

    print("Sequential graph finish! Time spent {:.2f} seconds.".format(time.time() - t))

    row.extend(row_tfidf)
    col.extend(col_tfidf)
    weight.extend(weight_tfidf)

    node_size = train_size + vocab_size + test_size
    # Repeated (row, col) pairs are summed by the sparse constructor, so a
    # word pair seen in several windows accumulates its weights.
    adj = sp.csr_matrix((weight, (row, col)), shape=(node_size, node_size))

    return [], adj, row, col

In [None]:
import numpy as np
from math import log

def gen_seq(corpus, train_size, test_size, window_size, word_id_map, row_tfidf, col_tfidf, weight_tfidf, vocab):
    """Build the sliding-window word graph weighted by normalised PMI.

    Each document is split on whitespace and cut into sliding windows of
    ``window_size`` tokens; a document no longer than ``window_size`` forms a
    single window. Co-occurrences of distinct words inside a window are
    counted in both directions, turned into NPMI scores, and only pairs with
    a strictly positive score become edges. The caller-supplied edges are
    appended after the word-word edges.

    Args:
        corpus: iterable of documents, each a whitespace-separated string.
        train_size: offset added to a word id to obtain its node index
            (presumably word nodes follow the training documents -- confirm
            against the caller).
        test_size: only used to size the adjacency matrix.
        window_size: number of tokens per sliding window.
        word_id_map: mapping from word to integer id. A word missing from
            the map raises ``KeyError``.
        row_tfidf, col_tfidf, weight_tfidf: row indices, column indices and
            weights of extra edges appended after the word-word edges.
        vocab: sequence indexed by word id that returns the word; its length
            also sizes the adjacency matrix.

    Returns:
        A 4-tuple ``(npmi_dict, adj, row, col)``: a dict mapping the string
        ``"<id_i>,<id_j>"`` to the pair's NPMI score, the
        ``scipy.sparse.csr_matrix`` adjacency of shape
        ``(train_size + len(vocab) + test_size,) * 2``, and the row / column
        index lists including the appended edges.
    """
    t = time.time()
    vocab_size = len(vocab)
    print("Generating sequential graph...")
    print("windows generating...")
    windows = []
    for doc_words in corpus:
        words = doc_words.split()
        length = len(words)
        if length <= window_size:
            windows.append(words)
        else:
            for start in range(length - window_size + 1):
                windows.append(words[start: start + window_size])

    print("calculating word frequency...")
    # Number of windows each word occurs in (a word counts once per window).
    word_window_freq = {}
    for window in tqdm(windows):
        for word in set(window):
            word_window_freq[word] = word_window_freq.get(word, 0) + 1

    print("calculating word pair frequency...")
    # Keyed by (id_i, id_j) tuples; both orders are recorded so the resulting
    # graph is symmetric.
    word_pair_count = {}
    for window in windows:
        if len(window) < 2:
            # A single-token window has no pairs; skipping it also avoids
            # looking up ids that would never be needed.
            continue
        ids = [word_id_map[word] for word in window]
        for pos in range(1, len(ids)):
            id_i = ids[pos]
            for id_j in ids[:pos]:
                if id_i == id_j:
                    continue
                forward = (id_i, id_j)
                backward = (id_j, id_i)
                word_pair_count[forward] = word_pair_count.get(forward, 0) + 1
                word_pair_count[backward] = word_pair_count.get(backward, 0) + 1

    num_window = len(windows)
    npmi_dict = {}
    row, col, weight = [], [], []
    print("calculating npmi...")
    for (i, j), count in word_pair_count.items():
        word_freq_i = word_window_freq[vocab[i]]
        word_freq_j = word_window_freq[vocab[j]]
        pmi = log((1.0 * count / num_window) /
                  (1.0 * word_freq_i * word_freq_j / (num_window * num_window)))
        pmi_max = -log((1.0 * count) / num_window)
        if pmi_max == 0:
            # Normalising would divide by zero; such a pair scores 0 and is
            # therefore dropped together with the other non-positive scores.
            continue
        npmi = pmi / pmi_max
        if npmi <= 0:
            continue
        row.append(train_size + i)
        col.append(train_size + j)
        weight.append(npmi)
        npmi_dict[str(i) + ',' + str(j)] = npmi

    print("create npmi graph...")
    # Count the word-word edges before the caller-supplied ones are appended.
    num_edges = len(row)
    row.extend(row_tfidf)
    col.extend(col_tfidf)
    weight.extend(weight_tfidf)
    node_size = train_size + vocab_size + test_size
    adj = sp.csr_matrix(
        (weight, (row, col)), shape=(node_size, node_size))
    # "{:.2f}" limits the precision; "{:2f}" only sets a minimum width and
    # prints six decimals.
    print("Sequential graph finish! Time spent {:.2f} number of edges {}".format(time.time() - t, num_edges))
    return npmi_dict, adj, row, col


In [None]:
import argparse

def parse_args(args):
    """Return the fixed graph-generation configuration as a namespace.

    The ``args`` parameter is accepted but not read; every setting is
    hard-coded below and exposed as an attribute of the returned
    ``argparse.Namespace``.
    """
    return argparse.Namespace(
        # Which graphs to generate.
        gen_syn=False,
        gen_sem=False,
        gen_seq=True,
        # Data and windowing.
        dataset='R8',
        window_size=7,
        # Model / optimisation settings.
        lr=1e-3,
        batch_size=32,
        embed_size=200,
        max_len=512,
        hidden_size=200,
        dropout=0,
        weight_decay=1e-6,
        epochs=20,
        seed=32,
        # External resources and thresholds.
        corenlp='/content/stanford',
        thres=0.05,
    )

# Build the configuration namespace; parse_args does not read its argument,
# so an empty list is passed as a placeholder.
args = parse_args([])


In [None]:
# Rebuild the configuration and run the pipeline with it.
# NOTE(review): `main` is not defined in the visible part of this notebook --
# confirm which definition is in scope when this cell runs.
args = parse_args([])
main(args)

1
148
cuda is being used
Generating sequential graph...
windows generating...
calculating word frequency...


100%|██████████| 458343/458343 [00:01<00:00, 347054.73it/s]


calculating word pair frequency...
calculating npmi...
create npmi graph...
Sequential graph finish! Time spent 22.440358 number of edges 1465792


New: GCN training runs on the sequential graphs (R8 and mr datasets, NPMI and TF-IDF weighting)


In [None]:
def parse_args(args):
    """Return the fixed training configuration as a namespace.

    The ``args`` parameter is accepted but not read; every setting is
    hard-coded below and exposed as an attribute of the returned
    ``argparse.Namespace``.
    """
    # Which phases to run and how the model input is prepared.
    switches = {
        'do_train': True,
        'do_valid': True,
        'do_test': True,
        'no_sparse': True,
        'load_ckpt': False,
        'featureless': True,
    }
    # Where results are saved and what is trained.
    target = {
        'save_path': './saved_model',
        'dataset': 'mr',
        'model': 'gcn',
    }
    # Optimisation hyper-parameters.
    hyper_params = {
        'lr': 0.00002,
        'epochs': 300,
        'hidden': 200,
        'layers': 2,
        'dropout': 0.8,
        'weight_decay': 0.000001,
        'early_stop': 2000,
        'max_degree': 3,
    }
    # Bookkeeping for the run.
    run_info = {
        "model_name": 'model',
        'run_id': 2,
    }

    # Merging in this order keeps the attribute order of the namespace.
    return argparse.Namespace(**switches, **target, **hyper_params, **run_info)

from datetime import datetime
import pytz

In [None]:
# R8-npmi run.
# NOTE(review): nothing in this cell selects the dataset or the graph
# weighting; the label presumably reflects how parse_args and the saved graphs
# were set up before the cell was executed -- confirm.
# The timestamp is taken in Singapore time and handed to both setup_logging
# and main.
sgt = pytz.timezone('Asia/Singapore')
timestamp = datetime.now(sgt).strftime("%Y-%m-%d_%H-%M-%S")


log_path = '/content/saved_model'
# NOTE(review): the captured output prints each log record several times, with
# one more copy after every further run of a cell like this one, which
# suggests setup_logging attaches a new handler on each call -- confirm.
logger = setup_logging(log_path=log_path, log_name='training_log', timestamp=timestamp)

args = parse_args([])
main(args,timestamp)

2024-04-27 06:16:13,512 - INFO - Training is running on cuda
2024-04-27 06:16:13,512 - INFO - Training is running on cuda
INFO:training_log:Training is running on cuda
2024-04-27 06:16:13,517 - INFO - Seed used: 147
2024-04-27 06:16:13,517 - INFO - Seed used: 147
INFO:training_log:Seed used: 147
2024-04-27 06:16:13,523 - INFO - Loading seq graph
2024-04-27 06:16:13,523 - INFO - Loading seq graph
INFO:training_log:Loading seq graph
2024-04-27 06:16:13,537 - INFO - Successfully loaded seq graph
2024-04-27 06:16:13,537 - INFO - Successfully loaded seq graph
INFO:training_log:Successfully loaded seq graph
2024-04-27 06:16:13,542 - INFO - Loading sem graph
2024-04-27 06:16:13,542 - INFO - Loading sem graph
INFO:training_log:Loading sem graph
2024-04-27 06:16:13,609 - INFO - Successfully loaded sem graph
2024-04-27 06:16:13,609 - INFO - Successfully loaded sem graph
INFO:training_log:Successfully loaded sem graph
2024-04-27 06:16:13,612 - INFO - Loading syn graph
2024-04-27 06:16:13,612 - IN

15362


2024-04-27 06:16:14,332 - DEBUG - adj:
   (0, 6327)	3.711032870061612
  (0, 6439)	2.0141214652074444
  (0, 6455)	5.446034005471058
  (0, 6910)	2.8408000383919125
  (0, 7497)	5.013767638082572
  (0, 7579)	6.643008177812852
  (0, 7615)	1.8219204856022904
  (0, 7636)	0.686135075474489
  (0, 7774)	1.93593634404461
  (0, 7844)	6.643008177812852
  (0, 7862)	3.1404583018904093
  (0, 8001)	2.9541287236989158
  (0, 8300)	1.8765698442286383
  (0, 8544)	1.2102745157814285
  (0, 8554)	2.639317983858882
  (0, 8584)	6.0055877913603934
  (0, 8698)	0.8247072497140591
  (0, 8864)	4.209394822412402
  (0, 8932)	5.084038145744166
  (0, 9443)	1.3557517586242402
  (0, 9710)	3.516247641852457
  (0, 9724)	4.868055826901179
  (0, 9775)	5.256713816692962
  (0, 10502)	3.045695917224406
  (0, 10576)	5.901070833083475
  :	:
  (15361, 9089)	3.948380997042783
  (15361, 9177)	4.563566636133016
  (15361, 9204)	6.380643913345361
  (15361, 9381)	12.993999137205973
  (15361, 9467)	5.095445669096839
  (15361, 9469)	17.180

./saved_model/run_2024-04-27_14-16-13


2024-04-27 06:16:25,294 - INFO - 
 Epoch: 0070 train_loss= 65.02930 train_acc= 0.61353 val_loss= 9.70459 val_acc= 0.76825 test_loss= 8.34115 test_acc= 0.82321 time= 0.14684
2024-04-27 06:16:25,294 - INFO - 
 Epoch: 0070 train_loss= 65.02930 train_acc= 0.61353 val_loss= 9.70459 val_acc= 0.76825 test_loss= 8.34115 test_acc= 0.82321 time= 0.14684
INFO:training_log:
 Epoch: 0070 train_loss= 65.02930 train_acc= 0.61353 val_loss= 9.70459 val_acc= 0.76825 test_loss= 8.34115 test_acc= 0.82321 time= 0.14684
2024-04-27 06:16:25,300 - INFO - Configurations for training:
2024-04-27 06:16:25,300 - INFO - Configurations for training:
INFO:training_log:Configurations for training:
2024-04-27 06:16:25,304 - DEBUG - {'do_train': True, 'do_valid': True, 'do_test': True, 'no_sparse': True, 'load_ckpt': False, 'featureless': True, 'save_path': './saved_model', 'dataset': 'R8', 'model': 'gcn', 'lr': 2e-05, 'epochs': 300, 'hidden': 200, 'layers': 2, 'dropout': 0.8, 'weight_decay': 1e-06, 'early_stop': 2000,

In [None]:
# mr-npmi run.
# NOTE(review): nothing in this cell selects the dataset or the graph
# weighting; the label presumably reflects how parse_args and the saved graphs
# were set up before the cell was executed -- confirm.
# The timestamp is taken in Singapore time and handed to both setup_logging
# and main.
sgt = pytz.timezone('Asia/Singapore')
timestamp = datetime.now(sgt).strftime("%Y-%m-%d_%H-%M-%S")


log_path = '/content/saved_model'
# NOTE(review): the captured output prints each log record several times, with
# one more copy after every further run of a cell like this one, which
# suggests setup_logging attaches a new handler on each call -- confirm.
logger = setup_logging(log_path=log_path, log_name='training_log', timestamp=timestamp)

args = parse_args([])
main(args,timestamp)

2024-04-27 06:18:24,956 - INFO - Training is running on cuda
2024-04-27 06:18:24,956 - INFO - Training is running on cuda
2024-04-27 06:18:24,956 - INFO - Training is running on cuda
INFO:training_log:Training is running on cuda
2024-04-27 06:18:24,961 - INFO - Seed used: 147
2024-04-27 06:18:24,961 - INFO - Seed used: 147
2024-04-27 06:18:24,961 - INFO - Seed used: 147
INFO:training_log:Seed used: 147
2024-04-27 06:18:24,966 - INFO - Loading seq graph
2024-04-27 06:18:24,966 - INFO - Loading seq graph
2024-04-27 06:18:24,966 - INFO - Loading seq graph
INFO:training_log:Loading seq graph
2024-04-27 06:18:24,978 - INFO - Successfully loaded seq graph
2024-04-27 06:18:24,978 - INFO - Successfully loaded seq graph
2024-04-27 06:18:24,978 - INFO - Successfully loaded seq graph
INFO:training_log:Successfully loaded seq graph
2024-04-27 06:18:24,982 - INFO - Loading sem graph
2024-04-27 06:18:24,982 - INFO - Loading sem graph
2024-04-27 06:18:24,982 - INFO - Loading sem graph
INFO:training_l

29426


2024-04-27 06:18:25,284 - DEBUG - adj:
   (0, 8914)	9.240961894450367
  (0, 10588)	3.1674184096404523
  (0, 12378)	4.774631627052441
  (0, 12452)	1.5195310253612764
  (0, 14339)	6.229918859659283
  (0, 15036)	2.030213781779356
  (0, 15040)	6.709491939921169
  (0, 15974)	3.3800384631178555
  (0, 16916)	4.997775178366651
  (0, 16965)	1.6673331910012603
  (0, 17016)	1.9258537664551136
  (0, 19907)	6.016344759361225
  (0, 20145)	6.229918859659283
  (0, 20955)	4.983981856234315
  (0, 21056)	2.764182956859557
  (0, 22371)	1.240134361043218
  (0, 22798)	0.8349934545913218
  (0, 23113)	1.5008468300225124
  (0, 24356)	3.374543943800215
  (1, 8212)	3.0841258915295593
  (1, 8369)	7.665003384948606
  (1, 9418)	1.0930021499776208
  (1, 11045)	6.9718562043886605
  (1, 12694)	4.7971044829044995
  (1, 14619)	5.011761420341391
  :	:
  (29425, 12973)	5.040334792785447
  (29425, 13272)	3.258284137684353
  (29425, 13286)	5.719093235893292
  (29425, 13334)	4.29083467567437
  (29425, 13808)	9.27444129738270

./saved_model/run_2024-04-27_14-18-24


2024-04-27 06:18:26,032 - INFO - Current best loss 2.07318
2024-04-27 06:18:26,032 - INFO - Current best loss 2.07318
2024-04-27 06:18:26,032 - INFO - Current best loss 2.07318
INFO:training_log:Current best loss 2.07318
2024-04-27 06:18:26,137 - INFO - 
 Epoch: 0002 train_loss= 5.39500 train_acc= 0.49562 val_loss= 1.71610 val_acc= 0.50986 test_loss= 1.61633 test_acc= 0.50732 time= 0.09844
2024-04-27 06:18:26,137 - INFO - 
 Epoch: 0002 train_loss= 5.39500 train_acc= 0.49562 val_loss= 1.71610 val_acc= 0.50986 test_loss= 1.61633 test_acc= 0.50732 time= 0.09844
2024-04-27 06:18:26,137 - INFO - 
 Epoch: 0002 train_loss= 5.39500 train_acc= 0.49562 val_loss= 1.71610 val_acc= 0.50986 test_loss= 1.61633 test_acc= 0.50732 time= 0.09844
INFO:training_log:
 Epoch: 0002 train_loss= 5.39500 train_acc= 0.49562 val_loss= 1.71610 val_acc= 0.50986 test_loss= 1.61633 test_acc= 0.50732 time= 0.09844
2024-04-27 06:18:26,145 - INFO - Configurations for training:
2024-04-27 06:18:26,145 - INFO - Configurati

In [None]:
# R8-tf-idf run.
# NOTE(review): nothing in this cell selects the dataset or the graph
# weighting; the label presumably reflects how parse_args and the saved graphs
# were set up before the cell was executed -- confirm.
# The timestamp is taken in Singapore time and handed to both setup_logging
# and main.
sgt = pytz.timezone('Asia/Singapore')
timestamp = datetime.now(sgt).strftime("%Y-%m-%d_%H-%M-%S")


log_path = '/content/saved_model'
# NOTE(review): the captured output prints each log record several times, with
# one more copy after every further run of a cell like this one, which
# suggests setup_logging attaches a new handler on each call -- confirm.
logger = setup_logging(log_path=log_path, log_name='training_log', timestamp=timestamp)

args = parse_args([])
main(args,timestamp)

2024-04-27 06:20:47,815 - INFO - Training is running on cuda
2024-04-27 06:20:47,815 - INFO - Training is running on cuda
2024-04-27 06:20:47,815 - INFO - Training is running on cuda
2024-04-27 06:20:47,815 - INFO - Training is running on cuda
INFO:training_log:Training is running on cuda
2024-04-27 06:20:47,822 - INFO - Seed used: 147
2024-04-27 06:20:47,822 - INFO - Seed used: 147
2024-04-27 06:20:47,822 - INFO - Seed used: 147
2024-04-27 06:20:47,822 - INFO - Seed used: 147
INFO:training_log:Seed used: 147
2024-04-27 06:20:47,833 - INFO - Loading seq graph
2024-04-27 06:20:47,833 - INFO - Loading seq graph
2024-04-27 06:20:47,833 - INFO - Loading seq graph
2024-04-27 06:20:47,833 - INFO - Loading seq graph
INFO:training_log:Loading seq graph
2024-04-27 06:20:47,853 - INFO - Successfully loaded seq graph
2024-04-27 06:20:47,853 - INFO - Successfully loaded seq graph
2024-04-27 06:20:47,853 - INFO - Successfully loaded seq graph
2024-04-27 06:20:47,853 - INFO - Successfully loaded seq

15362


2024-04-27 06:20:48,718 - DEBUG - adj:
   (0, 6327)	3.711032870061612
  (0, 6439)	2.0141214652074444
  (0, 6455)	5.446034005471058
  (0, 6910)	2.8408000383919125
  (0, 7497)	5.013767638082572
  (0, 7579)	6.643008177812852
  (0, 7615)	1.8219204856022904
  (0, 7636)	0.686135075474489
  (0, 7774)	1.93593634404461
  (0, 7844)	6.643008177812852
  (0, 7862)	3.1404583018904093
  (0, 8001)	2.9541287236989158
  (0, 8300)	1.8765698442286383
  (0, 8544)	1.2102745157814285
  (0, 8554)	2.639317983858882
  (0, 8584)	6.0055877913603934
  (0, 8698)	0.8247072497140591
  (0, 8864)	4.209394822412402
  (0, 8932)	5.084038145744166
  (0, 9443)	1.3557517586242402
  (0, 9710)	3.516247641852457
  (0, 9724)	4.868055826901179
  (0, 9775)	5.256713816692962
  (0, 10502)	3.045695917224406
  (0, 10576)	5.901070833083475
  :	:
  (15361, 9089)	3.948380997042783
  (15361, 9177)	4.563566636133016
  (15361, 9204)	6.380643913345361
  (15361, 9381)	12.993999137205973
  (15361, 9467)	5.095445669096839
  (15361, 9469)	17.180

./saved_model/run_2024-04-27_14-20-47


2024-04-27 06:21:36,829 - INFO - Optimization Finished!
INFO:training_log:Optimization Finished!
2024-04-27 06:21:36,843 - INFO - Successfully pickled file 'model_train_results.pkl' with loss and accuracy metrics to ./saved_model
2024-04-27 06:21:36,843 - INFO - Successfully pickled file 'model_train_results.pkl' with loss and accuracy metrics to ./saved_model
2024-04-27 06:21:36,843 - INFO - Successfully pickled file 'model_train_results.pkl' with loss and accuracy metrics to ./saved_model
2024-04-27 06:21:36,843 - INFO - Successfully pickled file 'model_train_results.pkl' with loss and accuracy metrics to ./saved_model
INFO:training_log:Successfully pickled file 'model_train_results.pkl' with loss and accuracy metrics to ./saved_model
2024-04-27 06:21:36,851 - INFO - Starting validation
2024-04-27 06:21:36,851 - INFO - Starting validation
2024-04-27 06:21:36,851 - INFO - Starting validation
2024-04-27 06:21:36,851 - INFO - Starting validation
INFO:training_log:Starting validation
202

In [None]:
# mr-tf-idf run.
# NOTE(review): nothing in this cell selects the dataset or the graph
# weighting; the label presumably reflects how parse_args and the saved graphs
# were set up before the cell was executed -- confirm.
# The timestamp is taken in Singapore time and handed to both setup_logging
# and main.
sgt = pytz.timezone('Asia/Singapore')
timestamp = datetime.now(sgt).strftime("%Y-%m-%d_%H-%M-%S")


log_path = '/content/saved_model'
# NOTE(review): the captured output prints each log record several times, with
# one more copy after every further run of a cell like this one, which
# suggests setup_logging attaches a new handler on each call -- confirm.
logger = setup_logging(log_path=log_path, log_name='training_log', timestamp=timestamp)

args = parse_args([])
main(args,timestamp)

2024-04-27 06:22:31,551 - INFO - Training is running on cuda
2024-04-27 06:22:31,551 - INFO - Training is running on cuda
2024-04-27 06:22:31,551 - INFO - Training is running on cuda
2024-04-27 06:22:31,551 - INFO - Training is running on cuda
2024-04-27 06:22:31,551 - INFO - Training is running on cuda
INFO:training_log:Training is running on cuda
2024-04-27 06:22:31,559 - INFO - Seed used: 147
2024-04-27 06:22:31,559 - INFO - Seed used: 147
2024-04-27 06:22:31,559 - INFO - Seed used: 147
2024-04-27 06:22:31,559 - INFO - Seed used: 147
2024-04-27 06:22:31,559 - INFO - Seed used: 147
INFO:training_log:Seed used: 147
2024-04-27 06:22:31,567 - INFO - Loading seq graph
2024-04-27 06:22:31,567 - INFO - Loading seq graph
2024-04-27 06:22:31,567 - INFO - Loading seq graph
2024-04-27 06:22:31,567 - INFO - Loading seq graph
2024-04-27 06:22:31,567 - INFO - Loading seq graph
INFO:training_log:Loading seq graph
2024-04-27 06:22:31,581 - INFO - Successfully loaded seq graph
2024-04-27 06:22:31,58

29426


2024-04-27 06:22:31,903 - DEBUG - adj:
   (0, 8914)	9.240961894450367
  (0, 10588)	3.1674184096404523
  (0, 12378)	4.774631627052441
  (0, 12452)	1.5195310253612764
  (0, 14339)	6.229918859659283
  (0, 15036)	2.030213781779356
  (0, 15040)	6.709491939921169
  (0, 15974)	3.3800384631178555
  (0, 16916)	4.997775178366651
  (0, 16965)	1.6673331910012603
  (0, 17016)	1.9258537664551136
  (0, 19907)	6.016344759361225
  (0, 20145)	6.229918859659283
  (0, 20955)	4.983981856234315
  (0, 21056)	2.764182956859557
  (0, 22371)	1.240134361043218
  (0, 22798)	0.8349934545913218
  (0, 23113)	1.5008468300225124
  (0, 24356)	3.374543943800215
  (1, 8212)	3.0841258915295593
  (1, 8369)	7.665003384948606
  (1, 9418)	1.0930021499776208
  (1, 11045)	6.9718562043886605
  (1, 12694)	4.7971044829044995
  (1, 14619)	5.011761420341391
  :	:
  (29425, 12973)	5.040334792785447
  (29425, 13272)	3.258284137684353
  (29425, 13286)	5.719093235893292
  (29425, 13334)	4.29083467567437
  (29425, 13808)	9.27444129738270

./saved_model/run_2024-04-27_14-22-31


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:training_log:
 Epoch: 0007 train_loss= 7.92649 train_acc= 0.51016 val_loss= 0.89216 val_acc= 0.51408 test_loss= 0.84282 test_acc= 0.53489 time= 0.09579
2024-04-27 06:22:35,680 - INFO - Configurations for training:
2024-04-27 06:22:35,680 - INFO - Configurations for training:
2024-04-27 06:22:35,680 - INFO - Configurations for training:
2024-04-27 06:22:35,680 - INFO - Configurations for training:
2024-04-27 06:22:35,680 - INFO - Configurations for training:
INFO:training_log:Configurations for training:
2024-04-27 06:22:35,688 - DEBUG - {'do_train': True, 'do_valid': True, 'do_test': True, 'no_sparse': True, 'load_ckpt': False, 'featureless': True, 'save_path': './saved_model', 'dataset': 'mr', 'model': 'gcn', 'lr': 2e-05, 'epochs': 300, 'hidden': 200, 'layers': 2, 'dropout': 0.8, 'weight_decay': 1e-06, 'early_stop': 2000, 'max_degree': 3, 'model_name': 'model', 'run_id': 2}
2024-04-27 06:22:35,688 - DEBUG - {'do_tra