# organic dataset processing

Read and process the `.json` data file to produce auxiliary files (e.g. index-encoded sentences) so that later data loading is fast.

- build the vocab from the tokens of the embedding model (**word2vec** or GloVe, selected by `wv_type`)
- transform each sentence string into a list of vocab indices and save it for easy access: `data_idx`
- also save the length of each indexed sentence: `data_length`

In [1]:
# Reload edited modules (e.g. utils.py) automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Colab-only setup: mount Google Drive and change into the project folder so
# that the relative paths used below ('../wv/', '../processed/') resolve.
from google.colab import drive
drive.mount('/content/drive')
# Alternative project locations, kept for reference:
# %cd /content/drive/MyDrive/group-1.3-master/group-1.3-master/LeverageJustAFewKeywords/
# %cd /content/drive/MyDrive/LeverageJustAFewKeywords/
%cd /content/drive/MyDrive/group-1.3/LeverageJustAFewKeywords/


Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1n0oSoMBR4TlxDwAce51xBgon3LxJjCkE/group-1.3/LeverageJustAFewKeywords


In [2]:
import nltk
nltk.download('words')
nltk.download('stopwords')
nltk.download('wordnet')
import os
import json
import gensim
from utils import *
from tqdm import tqdm
import numpy as np
import pickle

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [3]:
 import os
import json
import gensim
from tqdm import tqdm
import numpy as np
import pickle

%cd ../LeverageJustAFewKeywords/
from utils import *

/content/drive/.shortcut-targets-by-id/1n0oSoMBR4TlxDwAce51xBgon3LxJjCkE/group-1.3/LeverageJustAFewKeywords


# parameter setting and function definition

In [47]:
# Dataset name; used as the prefix of every input/output file name below.
domain = 'organic'
# Current split; part of the input JSON name and of the supplement pickle name.
dataset_mode = 'train'
# Folder that holds the word-vector files.
w2v_folder = '../wv/'
# Variant of the word vectors; part of the embedding file name.
wv_mode = 'tuned'# alternative: 'pretrained'
# Folder the dataset JSON is read from and the vocab / supplement files are written to.
data_folder = '../processed/'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
processed_folder = './processed/'
# Embedding family; selects the vocab loader and tags the output file names.
wv_type = 'glove' # alternative: 'w2v'

# wv_file = "../wv/w2v_corpus_wotf1_wostw_tuned.bin"

In [48]:
def load_data(file):
    '''Read a JSON dataset file and return its ``'original'`` entry.

    Only the ``'original'`` field is used because the vocabulary is built
    separately in this notebook.

    Args:
        file: path to a UTF-8 encoded JSON file whose top-level object has
            an ``'original'`` key.

    Returns:
        The value stored under ``'original'``, unchanged.
    '''
    with open(file, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)
    return payload['original']

def build_shift_vocab_word2vec(model_file, num_tag=2):
    '''Build a token -> index vocabulary from a binary word2vec file.

    Tokens keep the order in which the model stores them and receive the
    ids ``num_tag, num_tag + 1, ...``. The special tokens ``'<PAD>'`` and
    ``'<UNK>'`` are then set to ids 0 and 1 (overwriting any model token
    with the same spelling).

    Args:
        model_file: path to a word2vec file in the binary word2vec format.
        num_tag: id given to the first model token; the default of 2 leaves
            room for the two special tokens.

    Returns:
        dict mapping each token to its integer id.
    '''
    emb = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)
    # gensim 4 renamed `index2word` to `index_to_key` and removed the
    # `KeyedVectors.wv` alias; gensim 3 only has `index2word`. Read the token
    # list directly from the KeyedVectors object in a way that works on both.
    tokens = getattr(emb, 'index_to_key', None)
    if tokens is None:
        tokens = emb.index2word
    shift = num_tag
    vocab = {token: i + shift for i, token in enumerate(tokens)}
    vocab['<PAD>'] = 0
    vocab['<UNK>'] = 1
    return vocab

def build_shift_vocab_glove(glove_filename, num_tag=2):
    '''Build a token -> index vocabulary from a pickled GloVe mapping.

    The pickle is expected to hold an object with ``keys()`` (its keys are
    the tokens). Tokens are numbered in key order starting at ``num_tag``;
    ``'<PAD>'`` and ``'<UNK>'`` are then set to ids 0 and 1.

    Args:
        glove_filename: path to the pickle file.
        num_tag: id given to the first token.

    Returns:
        dict mapping each token to its integer id.
    '''
    # NOTE: pickle.load executes arbitrary code; only open trusted files.
    with open(glove_filename, 'rb') as handle:
        embeddings = pickle.load(handle)
    vocab = {token: idx for idx, token in enumerate(embeddings.keys(), start=num_tag)}
    vocab.update({'<PAD>': 0, '<UNK>': 1})
    return vocab
    
def build_text_index(sentence, vocab):
    '''Transform a sentence string into a list of vocabulary indices.

    The sentence is tokenized, lemmatized and stripped of English stop
    words (helpers come from ``utils``); every remaining token is looked up
    in ``vocab``.

    Args:
        sentence: raw sentence string.
        vocab: dict mapping token -> id; must contain ``'<UNK>'``.

    Returns:
        List of ids, never empty: tokens missing from ``vocab`` become the
        ``'<UNK>'`` id, and a sentence with no remaining tokens is returned
        as ``[vocab['<UNK>']]``.
    '''
    senc = tokenize_sentence(sentence)
    # For the OPOSUM dataset no tokenizer is needed; `sentence.split()` is enough.
    senc = lemmatize_sentence(senc)
    # The stop-word set is built once and reused; this function is called
    # once per sentence, so rebuilding it on every call is wasted work.
    senc = remove_wordlist(senc, _english_stopword_set())
    unk = vocab['<UNK>']
    idx = [vocab.get(token, unk) for token in senc]  # unknown token -> <UNK>
    if len(idx) == 0:
        idx = [unk]
    return idx


def _english_stopword_set():
    '''Return the English stop-word set, building it on first use only.'''
    cached = getattr(_english_stopword_set, '_cache', None)
    if cached is None:
        cached = set(stopwords.words('english'))
        _english_stopword_set._cache = cached
    return cached

def write_vocab(vocab, file_name):
    '''Write the vocabulary to a UTF-8 text file, one entry per line.

    Each line has the form ``token<TAB>index`` and entries follow the
    iteration order of ``vocab``. An existing file is overwritten.

    Args:
        vocab: dict mapping token -> id.
        file_name: destination path.
    '''
    rows = (f"{token}\t{idx}\n" for token, idx in vocab.items())
    with open(file_name, 'w', encoding='utf-8') as out:
        out.writelines(rows)

# train dataset

In [49]:
# Load the sentences of the current split from '<domain>_<dataset_mode>.json'
# (dataset_mode is 'train' in the configuration cell).
data_file = os.path.join(data_folder, f'{domain}_{dataset_mode}.json')
data_orig = load_data(data_file)
# Sanity check: number of entries and a peek at the first five.
print(len(data_orig))
print(data_orig[:5])

438241
['That was another administration.', 'Why is the ex CEO of Monsanto appointed head of the FDA ?', 'because he greased the palms of our "elected" leaders.', 'and judges too', 'the photo shown above is from a feed lot in California.it is labeled as such in the article.the claims are that the environmental impact of grass fed cows and feed lot cows are the same.which of course is not.just like you said- animals that graze on pastures continue the growth/life cycle of the forage if managed well']


vocab

In [50]:
# Locate the embedding file selected by `wv_type` / `wv_mode` and build the
# token -> id vocab with the matching loader.
model_file = os.path.join(w2v_folder, f"{wv_type}_corpus_wotf1_wostw_{wv_mode}.bin")
if wv_type == 'w2v':
  vocab = build_shift_vocab_word2vec(model_file)
elif wv_type == 'glove':
  vocab = build_shift_vocab_glove(model_file)
else:
  # Fail loudly: without this branch `vocab` would be undefined on a fresh
  # kernel, or silently keep the value from a previous run.
  raise ValueError(f"unsupported wv_type: {wv_type!r} (expected 'w2v' or 'glove')")

print(len(vocab))
# 25658 -> 25696 after fixing issue of missing punctuation marks
# NOTE(review): the sizes above presumably refer to an earlier vocabulary - confirm.

44082


index representation

In [51]:
# Encode every training sentence as a list of vocab ids; tqdm shows progress.
data_idx = [build_text_index(sentence, vocab) for sentence in tqdm(data_orig)]

100%|██████████| 438241/438241 [01:49<00:00, 3994.87it/s]


In [None]:
# sorted(data_idx, key=len)

data length

In [52]:
# Number of vocab ids in each indexed sentence, aligned with data_idx.
data_length = list(map(len, data_idx))

In [53]:
len(data_length)

438241

In [54]:
# Find the sentence with the most indexed tokens and show it.
# np.max(data_length)
max_id = np.argmax(data_length)  # position of the first maximum
print(data_length[max_id])
data_orig[max_id]  # last expression: displayed as the cell output

211


'In which of the following fields would you actually suggest that no progress, no improvement in fallibility, has been made over the last few centuries:Agricultural EconomicsAgricultural Animal BreedingAnimal NutritionPoultry ScienceAnimal ScienceAgronomy and Crop ScienceAgricultural and Horticultural Plant BreedingPlant Pathology/PhytopathologyPlant SciencesFood ScienceFood Science and Food TechnologySoil Chemistry/MicrobiologySoil SciencesHorticulture ScienceFishing and Fisheries Sciences/Fisheries managementForest Sciences and BiologyForest/Resources ManagementWood Science and Pulp/Paper TechnologyNatural resources/ConservationForestry and Related ScienceWildlife/Range ManagementEnvironmental ScienceAgriculture, GeneralAgricultural ScienceBiochemistryBiomedical sciencesBiophysicsBiotechnologyBacteriologyPlant geneticsPlant Pathology/PhytopathologyPlant PhysiologyBotany/Plant BiologyAnatomyBiometrics and BiostatisticsCell/Cellular Biology and HistologyEcologyDevelopmental biology/Emb

In [55]:
# data_orig
vocab['<PAD>']
# print(vocab)
# np.max(data_length)

0

In [56]:
# Persist the vocab as '<domain>_vocab_<wv_type>.txt' inside data_folder
# (one 'token<TAB>index' line per entry, see write_vocab).
vocab_file = os.path.join(data_folder, f'{domain}_vocab_{wv_type}.txt')
write_vocab(vocab, vocab_file)

# Earlier variant with a hard-coded 'w2v' suffix, kept for reference:
# vocab_file = os.path.join(data_folder, f'{domain}_vocab_w2v.txt')
# write_vocab(vocab, vocab_file)

In [57]:
# Bundle the indexed sentences and their lengths, and build the output path
# '<domain>_<dataset_mode>_supplement_<wv_type>.pkl' inside data_folder.
supplement_data = {'data_idx': data_idx, 'data_length': data_length}
supplement_data_path = os.path.join(data_folder, f'{domain}_{dataset_mode}_supplement_{wv_type}.pkl')

# Earlier variant with a hard-coded 'w2v' suffix, kept for reference:
# supplement_data = {'data_idx': data_idx, 'data_length': data_length}
# supplement_data_path = os.path.join(data_folder, f'{domain}_{dataset_mode}_supplement_w2v.pkl')

In [58]:
# Write the supplement dict to disk; an existing file at this path is overwritten.
with open(supplement_data_path, 'wb') as f:
    pickle.dump(supplement_data, f)

# test dataset

In [59]:
dataset_mode = 'test'

In [60]:
# NOTE: this re-assigns the notebook-wide `data_folder` (same folder as in the
# configuration cell, written without the trailing slash).
data_folder = '../processed'
data_file_test = os.path.join(data_folder, f'annotated_test_coarse.json')
with open(data_file_test, 'r', encoding='utf-8') as f:
    data = json.load(f)
# Unlike load_data, the 'label' entry is kept alongside the 'original' sentences.
data_orig_test = data['original']
data_label_test = data['label']
print(len(data_orig_test))

4687


In [61]:
# Scratch experiment, unrelated to the pipeline: shape/size of a nested list
# and the effect of list.extend with a nested list.
a = [[2,3], [1,3]]
b = [4]
print(np.shape(a))
print(np.size(a))
# extend appends each inner list as one element; it does not flatten them.
b.extend(a)
print(b)

(2, 2)
4
[4, [2, 3], [1, 3]]


In [62]:
# Scratch check of how np.shape treats (nested) lists of strings.
print(np.shape(data_orig_test))
print(np.shape([['str1', 'str2'], ['str3', 'str4']]))
# Rows of unequal length: build an explicit object array first. Implicit
# ragged conversion only warned on older NumPy and raises ValueError from
# NumPy 1.24 on; with dtype=object the result is still (2,).
print(np.shape(np.array([['str1', 'str2'], ['str3', 'str4', 'str5']], dtype=object)))

(4687,)
(2, 2)
(2,)


  return array(a, dtype, copy=False, order=order)


In [63]:
np.shape(data_label_test)

(4687, 6)

In [64]:
data_orig_test[0]

'Industrialization is everything about productivity and efficiency.'

In [65]:
# Encode every test sentence as a list of vocab ids; tqdm shows progress.
data_idx_test = [build_text_index(sentence, vocab) for sentence in tqdm(data_orig_test)]

100%|██████████| 4687/4687 [00:01<00:00, 3374.85it/s]


In [66]:
# Number of vocab ids in each indexed test sentence, aligned with data_idx_test.
data_length_test = list(map(len, data_idx_test))

In [67]:
# Same payload layout as for the train split. NOTE: this re-binds
# `supplement_data` / `supplement_data_path`, replacing the train values.
# The file name uses the current `dataset_mode`.
supplement_data = {'data_idx': data_idx_test, 'data_length': data_length_test}
supplement_data_path = os.path.join(data_folder, f'{domain}_{dataset_mode}_supplement_{wv_type}.pkl')

# Earlier variant with a '_coarse_supplement_w2v' suffix, kept for reference:
# supplement_data = {'data_idx': data_idx_test, 'data_length': data_length_test}
# supplement_data_path = os.path.join(data_folder, f'{domain}_{dataset_mode}_coarse_supplement_w2v.pkl')

In [68]:
# Write the supplement dict to disk; an existing file at this path is overwritten.
with open(supplement_data_path, 'wb') as f:
    pickle.dump(supplement_data, f)

# data inspection

not useful

In [72]:
# Read the train supplement pickle back from disk (hard-coded path; pickle_load
# is not defined in this notebook - presumably provided by utils, TODO confirm).
assist_data = pickle_load("../processed/organic_train_supplement_glove.pkl")

In [73]:
# NOTE: re-binds `data_idx` / `data_length` to the values loaded from disk,
# replacing the in-memory lists computed earlier in the notebook.
data_idx = assist_data['data_idx']
data_length = assist_data['data_length']

In [74]:
# sorted(data_idx, key=len)
for i, idx in enumerate(data_idx):
    if len(idx) == 0:
        print(data_orig[i])

In [77]:
build_text_index('administration', vocab)

[32640]

In [76]:
comment = pickle_load('../processed/processed_comments.pkl')

In [78]:
len(comment)

130516