# OPOSUM dataset processing

- Build the vocabulary from a pretrained **word2vec** or **GloVe** model.
- Transform each sentence string into a list of vocabulary indices and save it for easy access as `data_idx`.
- Also save the length of each indexed sentence as `data_length`.

In [1]:
# Reload edited project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

# Mount Google Drive so the project folder is reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Alternative project locations, kept for reference:
# %cd /content/drive/MyDrive/group-1.3-master/group-1.3-master/LeverageJustAFewKeywords/
# %cd /content/drive/MyDrive/LeverageJustAFewKeywords/
# Work from the project root so the relative paths used below ('./data/', '../wv/...') resolve.
%cd /content/drive/MyDrive/group-1.3/LeverageJustAFewKeywords/


Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1n0oSoMBR4TlxDwAce51xBgon3LxJjCkE/group-1.3/LeverageJustAFewKeywords


In [2]:
import nltk
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
import os
import json
import gensim
from utils import *
from tqdm import tqdm
import numpy as np
import pickle

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


# Parameter settings and function definitions

In [3]:
def pickle_load(filename):
    """Read a single pickled object from ``filename`` and return it.

    NOTE: unpickling executes code embedded in the file, so only use this on
    files produced by this project.
    """
    with open(filename, mode='rb') as stream:
        loaded = pickle.load(stream)
    return loaded

In [4]:
# OPOSUM product domains; the batch cells below loop over all of them.
oposum_domains = ['bags_and_cases', 'bluetooth', 'boots', 'keyboards', 'tv', 'vacuums']
domain = oposum_domains[0]  # single domain used by the debug cells
dataset_mode = 'train'  # split name; inserted into the data and supplement file names
w2v_folder = '../wv/oposum_w2v/'  # folder holding the per-domain embedding model files
wv_mode =  'pretrained'  # embedding variant used in the model file name; alternative: 'tuned'
data_folder = './data/'  # input JSON files are read from here; vocab/supplement files are written here
processed_folder = './data/'  # NOTE(review): not referenced by any cell visible in this notebook
pretrained = 'glove'  # embedding family that selects the vocab builder; alternative: 'word2vec'

In [5]:
def load_data(file):
    """Load a dataset JSON file and return its sentences as one flat list.

    Only the ``'original'`` entry of the JSON object is read; the vocabulary
    is built separately in this notebook, so the raw text is all that is
    needed here.

    Args:
        file: Path to a JSON file whose top-level object has an ``'original'``
            key holding a list of lists (presumably one inner list of
            sentence strings per review -- confirm against the data files).

    Returns:
        A list with every item of every inner list, in file order.

    Raises:
        KeyError: If the JSON object has no ``'original'`` key.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, encoding='utf-8') as f:
        data = json.load(f)
    return [sentence for group in data['original'] for sentence in group]

def build_shift_vocab_word2vec(model_file, num_tag=2):
    """Build a token -> index vocabulary from a binary word2vec model file.

    Model tokens are numbered in the model's own order, starting at
    ``num_tag`` so that the lowest indices stay free for the special tokens
    ``'<PAD>'`` (0) and ``'<UNK>'`` (1).

    Args:
        model_file: Path to a word2vec file in the binary word2vec format.
        num_tag: Offset added to every model token index. With a value below
            2 the first model tokens would share an index with the special
            tokens.

    Returns:
        dict mapping each token to its integer index, with ``'<PAD>'`` and
        ``'<UNK>'`` assigned last (overriding same-named model tokens).
    """
    emb = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)
    # The loaded object already is a KeyedVectors, so no `.wv` indirection is
    # needed. gensim >= 4.0 names the ordered token list `index_to_key`;
    # older releases name it `index2word`.
    try:
        tokens = emb.index_to_key
    except AttributeError:
        tokens = emb.index2word
    vocab = {token: i + num_tag for i, token in enumerate(tokens)}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

def build_shift_vocab_glove(glove_filename, num_tag=2):
    """Build a token -> index vocabulary from a pickled GloVe mapping.

    The pickle is expected to hold an object with ``.keys()`` (the tokens).
    Tokens are numbered in key order starting at ``num_tag``; afterwards
    ``'<PAD>'`` is set to 0 and ``'<UNK>'`` to 1.

    NOTE: unpickling executes code embedded in the file, so only point this
    at embedding files produced by this project.
    """
    with open(glove_filename, mode='rb') as handle:
        embeddings = pickle.load(handle)

    vocab = {}
    for position, token in enumerate(embeddings.keys()):
        vocab[token] = position + num_tag
    # Special tokens go in last, exactly as many entries as there are tags.
    vocab.update({'<PAD>': 0, '<UNK>': 1})
    return vocab

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stopword set, building it only on first use."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = set(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def build_text_index(sentence, vocab):
    """Transform a sentence string into a list of vocabulary indices.

    The sentence is split on whitespace, stopwords are removed, the remaining
    tokens are lemmatized, and each token is looked up in ``vocab``.

    Args:
        sentence: Sentence text; the OPOSUM data only needs ``.split()``,
            not a tokenizer.
        vocab: dict mapping token -> index; must contain ``'<UNK>'``.

    Returns:
        list of indices, one per remaining token; tokens missing from
        ``vocab`` map to ``vocab['<UNK>']``.
    """
    # `stopwords`, `remove_wordlist` and `lemmatize_sentence` are defined
    # outside this cell (presumably via `from utils import *` -- confirm).
    tokens = sentence.split()
    # The stopword set is identical for every sentence, so it is built once
    # and reused instead of being rebuilt on each call.
    # NOTE(review): assumes remove_wordlist does not mutate the set it is given.
    tokens = remove_wordlist(tokens, _english_stopwords())
    tokens = lemmatize_sentence(tokens)
    return [vocab.get(token, vocab['<UNK>']) for token in tokens]

def write_vocab(vocab, file_name):
    """Write the vocabulary to ``file_name`` as ``token<TAB>index`` lines.

    Args:
        vocab: dict mapping token -> index; entries are written in the
            dict's iteration order.
        file_name: Destination path; an existing file is overwritten.
    """
    # Tokens may contain non-ASCII characters; pin UTF-8 so the write neither
    # fails nor changes with the platform's default encoding.
    with open(file_name, 'w', encoding='utf-8') as f:
        f.writelines(f"{token}\t{idx}\n" for token, idx in vocab.items())

# Example for debugging (single domain, step by step)

In [None]:
# Debug run on the single `domain` selected in the settings cell.
# The split comes from `dataset_mode` (not a hard-coded 'train') so the data
# loaded here always matches the `{domain}_{dataset_mode}_supplement_*` file
# name used when the result is saved.
data_file = os.path.join(data_folder, f'{domain}_{dataset_mode}.json')
data_orig = load_data(data_file)

In [None]:
# Build the vocab from the embedding model that matches `pretrained`.
if pretrained == 'word2vec':
    model_file = os.path.join(w2v_folder, f"{domain}_{wv_mode}.bin")
    vocab = build_shift_vocab_word2vec(model_file)
elif pretrained == 'glove':
    model_file = os.path.join(w2v_folder, f"{domain}_glove_{wv_mode}.bin")
    vocab = build_shift_vocab_glove(model_file)
else:
    # Fail loudly instead of silently keeping a vocab left over from an
    # earlier cell run.
    raise ValueError(f"unsupported pretrained embedding type: {pretrained!r}")

In [None]:
# Index every sentence; tqdm wraps the input so a progress bar is still shown.
data_idx = [build_text_index(sentence, vocab) for sentence in tqdm(data_orig)]

100%|██████████| 588229/588229 [01:52<00:00, 5250.66it/s]


In [None]:
# Number of indices kept for each sentence, in the same order as `data_idx`.
data_length = list(map(len, data_idx))

In [None]:
# Quick sanity checks; uncomment one of the lines to inspect that value instead.
# data_orig
vocab['<PAD>']  # shows the padding index (both vocab builders in this notebook assign 0)
# print(vocab)
# np.max(data_length)
# vocab['happy']

12184

In [None]:
# Pick the vocab file name for the embedding family, then write the vocab.
if pretrained == 'word2vec':
    vocab_file = os.path.join(data_folder, f'{domain}_vocab_w2v.txt')
elif pretrained == 'glove':
    vocab_file = os.path.join(data_folder, f'{domain}_vocab_glove.txt')
else:
    # Without this, an unknown value would write to whatever `vocab_file`
    # an earlier cell run left behind.
    raise ValueError(f"unsupported pretrained embedding type: {pretrained!r}")
write_vocab(vocab, vocab_file)

In [None]:
# Bundle the indexed sentences with their lengths and choose the output path
# for the current domain, split and embedding family.
supplement_data = {'data_idx': data_idx, 'data_length': data_length}
if pretrained == 'word2vec':
    supplement_data_path = os.path.join(data_folder, f'{domain}_{dataset_mode}_supplement_w2v.pkl')
elif pretrained == 'glove':
    supplement_data_path = os.path.join(data_folder, f'{domain}_{dataset_mode}_supplement_glove.pkl')
else:
    # Without this, an unknown value would keep a path left over from an
    # earlier cell run and overwrite the wrong file.
    raise ValueError(f"unsupported pretrained embedding type: {pretrained!r}")


In [None]:
# Persist the indexed sentences and their lengths as a single pickle file.
with open(supplement_data_path, mode='wb') as sink:
    pickle.dump(supplement_data, sink)

# Putting it together: process every domain

In [6]:
# Index every OPOSUM domain for the split selected by `dataset_mode`.
# For each domain: load the sentences, rebuild the vocab from that domain's
# embedding model, map every sentence to vocab indices, and pickle the indices
# together with their lengths. The vocab text file is written only for the
# 'train' split.

# Resolve the file-name tag once and reject unknown embedding types up front,
# so no iteration can silently reuse values left over from an earlier run.
if pretrained == 'word2vec':
    emb_tag = 'w2v'
elif pretrained == 'glove':
    emb_tag = 'glove'
else:
    raise ValueError(f"unsupported pretrained embedding type: {pretrained!r}")

for domain in oposum_domains:
    print(f'process {domain} ...\n')
    print(f'type {pretrained}...\n')
    data_file = os.path.join(data_folder, f'{domain}_{dataset_mode}.json')
    data_orig = load_data(data_file)
    if pretrained == 'word2vec':
        model_file = os.path.join(w2v_folder, f"{domain}_{wv_mode}.bin")
        vocab = build_shift_vocab_word2vec(model_file)
    else:  # 'glove' -- the only other value accepted above
        model_file = os.path.join(w2v_folder, f"{domain}_glove_{wv_mode}.bin")
        vocab = build_shift_vocab_glove(model_file)
    print(f'data file: {data_file}\nmodel file: {model_file}')
    print(f'vocab length: {len(vocab)}')
    print('transforming to index ...')
    # tqdm wraps the input so the progress bar is still shown.
    data_idx = [build_text_index(s, vocab) for s in tqdm(data_orig)]
    data_length = [len(s) for s in data_idx]

    vocab_file = os.path.join(data_folder, f'{domain}_vocab_{emb_tag}.txt')
    if dataset_mode == 'train':
        write_vocab(vocab, vocab_file)
        print('finish writing vocab file')

    supplement_data = {'data_idx': data_idx, 'data_length': data_length}
    supplement_data_path = os.path.join(
        data_folder, f'{domain}_{dataset_mode}_supplement_{emb_tag}.pkl')
    with open(supplement_data_path, 'wb') as f:
        pickle.dump(supplement_data, f)
    print(f'finish processing {domain}\n\n')

process bags_and_cases ...

type glove pretrained...



  0%|          | 0/588229 [00:00<?, ?it/s]

data file: ./data/bags_and_cases_train.json
model file: ../wv/oposum_w2v/bags_and_cases_glove_pretrained.bin
vocab length: 15431
transforming to index ...


100%|██████████| 588229/588229 [01:53<00:00, 5167.96it/s]


finish writing vocab file
finish processing bags_and_cases


process bluetooth ...

type glove pretrained...



  0%|          | 534/1431839 [00:00<04:28, 5339.22it/s]

data file: ./data/bluetooth_train.json
model file: ../wv/oposum_w2v/bluetooth_glove_pretrained.bin
vocab length: 23616
transforming to index ...


100%|██████████| 1431839/1431839 [04:27<00:00, 5356.43it/s]


finish writing vocab file
finish processing bluetooth


process boots ...

type glove pretrained...



  0%|          | 623/963866 [00:00<02:34, 6226.19it/s]

data file: ./data/boots_train.json
model file: ../wv/oposum_w2v/boots_glove_pretrained.bin
vocab length: 16088
transforming to index ...


100%|██████████| 963866/963866 [02:59<00:00, 5377.11it/s]


finish writing vocab file
finish processing boots


process keyboards ...

type glove pretrained...



  0%|          | 624/608801 [00:00<01:37, 6239.65it/s]

data file: ./data/keyboards_train.json
model file: ../wv/oposum_w2v/keyboards_glove_pretrained.bin
vocab length: 16542
transforming to index ...


100%|██████████| 608801/608801 [01:54<00:00, 5313.96it/s]


finish writing vocab file
finish processing keyboards


process tv ...

type glove pretrained...



  0%|          | 612/1432384 [00:00<03:54, 6114.09it/s]

data file: ./data/tv_train.json
model file: ../wv/oposum_w2v/tv_glove_pretrained.bin
vocab length: 27449
transforming to index ...


100%|██████████| 1432384/1432384 [04:31<00:00, 5281.65it/s]


finish writing vocab file
finish processing tv


process vacuums ...

type glove pretrained...



  0%|          | 450/1465525 [00:00<05:26, 4492.25it/s]

data file: ./data/vacuums_train.json
model file: ../wv/oposum_w2v/vacuums_glove_pretrained.bin
vocab length: 22645
transforming to index ...


100%|██████████| 1465525/1465525 [04:33<00:00, 5353.16it/s]


finish writing vocab file
finish processing vacuums




In [7]:
# Switch to the test split; the next cell repeats the per-domain processing
# for '{domain}_test.json' (the vocab file is only written in 'train' mode).
dataset_mode = 'test'

In [8]:
# Index every OPOSUM domain for the split selected by `dataset_mode`.
# For each domain: load the sentences, rebuild the vocab from that domain's
# embedding model, map every sentence to vocab indices, and pickle the indices
# together with their lengths. The vocab text file is written only for the
# 'train' split.

# Resolve the file-name tag once and reject unknown embedding types up front,
# so no iteration can silently reuse values left over from an earlier run.
if pretrained == 'word2vec':
    emb_tag = 'w2v'
elif pretrained == 'glove':
    emb_tag = 'glove'
else:
    raise ValueError(f"unsupported pretrained embedding type: {pretrained!r}")

for domain in oposum_domains:
    print(f'process {domain} ...\n')
    print(f'pretrained {pretrained} ...\n')
    data_file = os.path.join(data_folder, f'{domain}_{dataset_mode}.json')
    data_orig = load_data(data_file)
    if pretrained == 'word2vec':
        model_file = os.path.join(w2v_folder, f"{domain}_{wv_mode}.bin")
        vocab = build_shift_vocab_word2vec(model_file)
    else:  # 'glove' -- the only other value accepted above
        model_file = os.path.join(w2v_folder, f"{domain}_glove_{wv_mode}.bin")
        vocab = build_shift_vocab_glove(model_file)
    print(f'data file: {data_file}\nmodel file: {model_file}')
    print(f'vocab length: {len(vocab)}')
    print('transforming to index ...')
    # tqdm wraps the input so the progress bar is still shown.
    data_idx = [build_text_index(s, vocab) for s in tqdm(data_orig)]
    data_length = [len(s) for s in data_idx]

    vocab_file = os.path.join(data_folder, f'{domain}_vocab_{emb_tag}.txt')
    if dataset_mode == 'train':
        write_vocab(vocab, vocab_file)
        print('finish writing vocab file')

    supplement_data = {'data_idx': data_idx, 'data_length': data_length}
    supplement_data_path = os.path.join(
        data_folder, f'{domain}_{dataset_mode}_supplement_{emb_tag}.pkl')
    with open(supplement_data_path, 'wb') as f:
        pickle.dump(supplement_data, f)
    print(f'finish processing {domain}\n\n')

process bags_and_cases ...

pretrained glove ...



100%|██████████| 653/653 [00:00<00:00, 4636.39it/s]

data file: ./data/bags_and_cases_test.json
model file: ../wv/oposum_w2v/bags_and_cases_glove_pretrained.bin
vocab length: 15431
transforming to index ...





finish processing bags_and_cases


process bluetooth ...

pretrained glove ...



100%|██████████| 667/667 [00:00<00:00, 4939.77it/s]

data file: ./data/bluetooth_test.json
model file: ../wv/oposum_w2v/bluetooth_glove_pretrained.bin
vocab length: 23616
transforming to index ...
finish processing bluetooth


process boots ...

pretrained glove ...




100%|██████████| 631/631 [00:00<00:00, 5435.00it/s]

data file: ./data/boots_test.json
model file: ../wv/oposum_w2v/boots_glove_pretrained.bin
vocab length: 16088
transforming to index ...
finish processing boots


process keyboards ...

pretrained glove ...




100%|██████████| 698/698 [00:00<00:00, 4973.29it/s]

data file: ./data/keyboards_test.json
model file: ../wv/oposum_w2v/keyboards_glove_pretrained.bin
vocab length: 16542
transforming to index ...
finish processing keyboards


process tv ...

pretrained glove ...




100%|██████████| 764/764 [00:00<00:00, 5411.54it/s]

data file: ./data/tv_test.json
model file: ../wv/oposum_w2v/tv_glove_pretrained.bin
vocab length: 27449
transforming to index ...
finish processing tv


process vacuums ...

pretrained glove ...




100%|██████████| 741/741 [00:00<00:00, 5227.84it/s]

data file: ./data/vacuums_test.json
model file: ../wv/oposum_w2v/vacuums_glove_pretrained.bin
vocab length: 22645
transforming to index ...
finish processing vacuums





