In [None]:
from google.colab import drive
from google.colab import files
import pandas as pd
import json
import os
import tensorflow as tf
import torch
import codecs
import logging
from tqdm import tqdm
import re

In [None]:
# Mount Google Drive into the Colab runtime, then switch the working directory
# to the project folder so the relative paths used below resolve against it.
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/TA/Sem 8")

Mounted at /content/drive


In [None]:
# List the current working directory to check which project folders are present.
os.listdir()

['toy-ende',
 'dataset',
 'indonlu-master',
 'notebook',
 'Draft Bab III.docx',
 '13517031_Karina Iswara_Laporan M1.docx',
 '13517031_Karina Iswara_Resume TA.gdoc',
 'question-generator',
 '13517031_Karina Iswara_Laporan M2.docx',
 'models',
 'view.1',
 'test_ner.txt',
 'test_dict_ner.txt',
 'BERT-KPE',
 'Multilingual-BERT-KPE']

In [None]:
def load_json(data_type):
  """Load the reconstructed SQuAD v2.0 file for `data_type` into a DataFrame.

  The file read is 'dataset/SQuAD/v2.0/<data_type>_reconstructed.json',
  relative to the current working directory. The path is printed before
  reading.
  """
  file_name = data_type + '_reconstructed.json'
  path = 'dataset/SQuAD/v2.0/' + file_name
  print("Reading file at", path)
  return pd.read_json(path)

def save_to_json(df, data_type, description):
  """Write `df` as JSON to 'dataset/SQuAD/v2.0/<data_type>_<description>.json'.

  The path is relative to the current working directory and is printed once
  the DataFrame has been written.
  """
  file_name = data_type + '_' + description + '.json'
  path = 'dataset/SQuAD/v2.0/' + file_name
  df.to_json(path)
  print('Saving file to', path)

def load_df_from_json(data_type):
  """Load 'dataset/SQuAD/v2.0/<data_type>-v2.0.json' into a DataFrame.

  The path is relative to the current working directory and is printed before
  pandas reads the file.
  """
  path = ''.join(('dataset/SQuAD/v2.0/', data_type, '-v2.0.json'))
  print("Reading file at", path)
  frame = pd.read_json(path)
  return frame

def save_df_to_json(df, data_type, description):
  """Dump `df` to 'dataset/SQuAD/v2.0/<data_type>_<description>.json'.

  Behaves the same as `save_to_json`: the DataFrame is written with
  `DataFrame.to_json` and the destination path is printed afterwards.
  """
  path = ''.join(('dataset/SQuAD/v2.0/', data_type, '_', description, '.json'))
  df.to_json(path)
  print('Saving file to', path)

def save_list_to_txt(path, list_obj, overwrite=False):
  """Write the strings in `list_obj` to `path`, one per line.

  Newlines inside an element are replaced by a space so every element stays on
  a single line. An existing file is left untouched unless `overwrite` is
  true; either outcome is reported with a printed message.
  """
  if os.path.exists(path) and not overwrite:
    print(f'Cancel saving, Path already exist : {path}')
    return
  with open(path, 'w') as f:
    for element in list_obj:
      f.write("%s\n" % element.replace("\n", " "))
  print(f'Save to {path}')

def load_list_from_txt(path, strip_newline=False):
  """Read a text file into a list with one entry per line.

  Args:
    path: Path of the text file to read.
    strip_newline: When True, remove the trailing newline from each entry, so
      a list written by `save_list_to_txt` is read back with the same values.
      Defaults to False, which keeps every line exactly as read (line
      terminator included).

  Returns:
    A list of strings, in file order.
  """
  print("loading file at", path)
  # `with` closes the handle even if reading fails part-way through.
  with open(path, 'r') as infile:
    if strip_newline:
      return [line.rstrip('\n') for line in infile]
    return list(infile)

def save_listDict_to_txt(path, list_obj, overwrite=False):
  """Serialize `list_obj` with `json.dumps` as a single line in `path`.

  An existing file is kept as-is unless `overwrite` is true; either outcome is
  reported with a printed message.
  """
  already_there = os.path.exists(path)
  if already_there and not overwrite:
    print(f'Cancel saving, Path already exist : {path}')
    return
  with open(path, 'w') as f:
    f.write(json.dumps(list_obj) + "\n")
  print(f'Save to {path}')

def load_listDict_from_txt(path):
  """Load the JSON value stored on the first non-blank line of `path`.

  This is the reader for files written by `save_listDict_to_txt`, which stores
  the whole object as one JSON line.

  Args:
    path: Path of the text file to read.

  Returns:
    The decoded JSON value of the first non-blank line.

  Raises:
    IndexError: If the file contains no non-blank line.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  print("loading file at", path)
  # `with` closes the handle even if decoding fails; blank lines (e.g. a
  # trailing empty line) are skipped instead of crashing json.loads.
  with open(path, 'r') as infile:
    records = [json.loads(line) for line in infile if line.strip()]
  if not records:
    # Same exception type as indexing an empty list, with a clearer message.
    raise IndexError(f'No JSON record found in {path}')
  return records[0]

In [None]:
# Root logger used by the loader functions below (they call logger.info);
# info-level messages only show up if logging is configured to emit them.
logger = logging.getLogger()

In [None]:
def kp20k_loader(mode, source_dataset_dir,
                 src_fields = ['title', 'abstract'],
                 trg_fields = ['keyword'], trg_delimiter=';',
                 max_row=None):
    ''' Load (source text, keyphrases) pairs from a KP20k JSON-lines file.

    The file read is '<source_dataset_dir>/kp20k_<mode>.json'; each non-blank
    line must hold one JSON object.

    mode : split name used to build the file name
    source_dataset_dir : directory that contains the file
    src_fields : record fields joined with '.' to build the source string
    trg_fields : record fields split into target strings
    trg_delimiter : regular expression passed to re.split to cut each target
        field (the default ';' is a plain literal)
    max_row : optional cap on the number of records to load; None loads all

    return : list of tuples (src_string, list of trg_strings) '''

    logger.info("start loading %s data ..." % mode)
    source_path = os.path.join(source_dataset_dir, 'kp20k_%s.json' % mode)

    data_pairs = []
    with codecs.open(source_path, "r", "utf-8") as corpus_file:
        for line in tqdm(corpus_file):
            # Honour the optional cap before doing any work on the next line.
            if max_row is not None and len(data_pairs) >= max_row:
                break
            if not line.strip():
                continue
            json_ = json.loads(line)
            src_str = '.'.join(json_[f] for f in src_fields)
            trg_strs = []
            for f in trg_fields:
                trg_strs.extend(re.split(trg_delimiter, json_[f]))
            data_pairs.append((src_str, trg_strs))
    return data_pairs

In [None]:
# Load the KP20k "training" split and display its first (source, keyphrases) pair.
dataset = "kp20k"
# Directory only: kp20k_loader appends 'kp20k_<data_type>.json' to it.
path = "BERT-KPE/dataset/kp20k"
data_type = "training"
data_kp20k = kp20k_loader(data_type, path)
data_kp20k[0]

0it [00:00, ?it/s]

{"abstract": "This paper proposes using virtual reality to enhance the perception of actions by distant users on a shared application. Here, distance may refer either to space ( e.g. in a remote synchronous collaboration) or time ( e.g. during playback of recorded actions). Our approach consists in immersing the application in a virtual inhabited 3D space and mimicking user actions by animating avatars. We illustrate this approach with two applications, the one for remote collaboration on a shared application and the other to playback recorded sequences of user actions. We suggest this could be a low cost enhancement for telepresence.", "keyword": "telepresence;animation;avatars;application sharing;collaborative virtual environments", "title": "virtually enhancing the perception of user actions"}

<class 'str'>





('virtually enhancing the perception of user actions.This paper proposes using virtual reality to enhance the perception of actions by distant users on a shared application. Here, distance may refer either to space ( e.g. in a remote synchronous collaboration) or time ( e.g. during playback of recorded actions). Our approach consists in immersing the application in a virtual inhabited 3D space and mimicking user actions by animating avatars. We illustrate this approach with two applications, the one for remote collaboration on a shared application and the other to playback recorded sequences of user actions. We suggest this could be a low cost enhancement for telepresence.',
 ['telepresence',
  'animation',
  'avatars',
  'application sharing',
  'collaborative virtual environments'])

In [None]:
# Check the type of the source text in the first loaded pair.
print(type(data_kp20k[0][0]))

<class 'str'>


In [None]:
def squad_loader(mode, source_dataset_dir, max_row,
                 src_fields = ['title', 'abstract'],
                 trg_fields = ['keyword'], trg_delimiter=';'):
    ''' Load (source text, keyphrases) pairs from a reconstructed SQuAD file.

    The file read is '<source_dataset_dir>/<mode>_reconstruct_for_bert_kpe.json';
    each line must hold one JSON object.

    mode : split name used to build the file name
    source_dataset_dir : directory that contains the file
    max_row : stop once this many records have been loaded; a value that is
        never reached (e.g. None or 0) loads the whole file
    src_fields : record fields joined with '.' to build the source string
    trg_fields : record fields whose items are collected as target strings
    trg_delimiter : unused here; kept so the signature matches kp20k_loader

    return : list of tuples (src_string, list of trg_strings) '''

    logger.info("start loading %s data ..." % mode)
    source_path = os.path.join(source_dataset_dir, '%s_reconstruct_for_bert_kpe.json' % mode)
    data_pairs = []
    with codecs.open(source_path, "r", "utf-8") as corpus_file:
        for line in tqdm(corpus_file):
            json_ = json.loads(line)
            src_str = '.'.join(json_[f] for f in src_fields)
            trg_strs = []
            for f in trg_fields:
                trg_strs.extend(json_[f])
            data_pairs.append((src_str, trg_strs))
            # Stop only when the requested number of rows is reached.
            if len(data_pairs) == max_row:
                break
    return data_pairs

In [None]:
# Load up to 100 records of the reconstructed SQuAD "test" split and display the first pair.
dataset = "squad"
# Directory only: squad_loader appends '<data_type>_reconstruct_for_bert_kpe.json' to it.
path = "dataset/SQuAD/v2.0"
data_type = "test"
data_squad = squad_loader(data_type, path, 100)
data_squad[0]

0it [00:00, ?it/s]

{"title":"Beyonc\u00e9","abstract":"Beyonc\u00e9 Giselle Knowles-Carter (\/bi\u02d0\u02c8j\u0252nse\u026a\/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyonc\u00e9's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles \"Crazy in Love\" and \"Baby Boy\".","keyword":["Houston, Texas","Dangerously in Love","singing and dancing","Mathew Knowles","in the late 1990s","late 1990s","Destiny's Child","lead singer","2003","five"]}

{'title': 'Beyoncé', 'abstract': 'Beyoncé Giselle Knowles-Car




('Beyoncé.Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 ['Houston, Texas',
  'Dangerously in Love',
  'singing and dancing',
  'Mathew Knowles',
  'in the late 1990s',
  'late 1990s',
  "Destiny's Child",
  'lead singer',
  '2003',
  'five'])

In [None]:
def openkp_loader(mode, source_dataset_dir):
    ''' Read '<source_dataset_dir>/OpenKP<mode>.jsonl' and return its records.

    Every line of the file is decoded with json.loads and the decoded objects
    are returned as a list, in file order. Per the original note, records
    carry 'url', 'VDOM', 'text' and 'KeyPhrases' fields (not checked here). '''

    logger.info("start loading %s data ..." % mode)
    file_name = 'OpenKP%s.jsonl' % mode
    source_path = os.path.join(source_dataset_dir, file_name)
    with codecs.open(source_path, "r", "utf-8") as corpus_file:
        return [json.loads(raw_line) for raw_line in tqdm(corpus_file)]

In [None]:
# Load the OpenKP "Dev" split from the 'openkp' directory and display its first record.
dataset = "openkp"
# Directory only: openkp_loader builds the 'OpenKP<data_type>.jsonl' file name itself.
path = dataset 
data_type = "Dev"
data_openkp = openkp_loader(data_type, path)
data_openkp[0]

6616it [00:08, 776.67it/s] 


{'KeyPhrases': [['NBA', '2K17'], ['Key', 'Generator'], ['Xbox']],
 'VDOM': '[{"Id":0,"text":"Home","feature":[32.0,34.0,199.0,15.0,0.0,0.0,0.0,0.0,12.0,0.0,32.0,610.0,194.0,62.0,1.0,0.0,0.0,0.0,12.0,0.0],"start_idx":0,"end_idx":1},{"Id":0,"text":"Keygen","feature":[79.0,41.0,199.0,15.0,0.0,0.0,0.0,0.0,12.0,0.0,32.0,610.0,194.0,62.0,1.0,0.0,0.0,0.0,12.0,0.0],"start_idx":1,"end_idx":2},{"Id":0,"text":"NBA 2K17","feature":[129.0,56.0,199.0,15.0,0.0,0.0,0.0,0.0,12.0,0.0,32.0,610.0,194.0,62.0,1.0,0.0,0.0,0.0,12.0,0.0],"start_idx":2,"end_idx":4},{"Id":0,"text":"nba 2k17 activation key generator","feature":[194.0,182.0,199.0,15.0,0.0,0.0,0.0,0.0,12.0,0.0,32.0,610.0,194.0,62.0,1.0,0.0,0.0,0.0,12.0,0.0],"start_idx":4,"end_idx":9},{"Id":0,"text":"nba 2k17 beta keygen free","feature":[385.0,144.0,199.0,15.0,0.0,0.0,0.0,0.0,12.0,0.0,32.0,610.0,194.0,62.0,1.0,0.0,0.0,0.0,12.0,0.0],"start_idx":9,"end_idx":14},{"Id":0,"text":"nba 2k17 cd codes free","feature":[32.0,573.0,199.0,31.0,0.0,0.0,0.0,0.0,12

In [None]:
# Read the raw KP20k training file directly.
# The file holds one JSON object per line (the same layout kp20k_loader parses),
# so json.load() on the whole file raises JSONDecodeError; decode it line by
# line instead.
dataset = "kp20k"
data_type = "training"
path = dataset + "/" + dataset + "_" + data_type + ".json"
with open(path, 'r', encoding='utf-8') as json_file:
  data = [json.loads(line) for line in json_file if line.strip()]
# Display only the first few records rather than dumping the whole list.
data[:3]

JSONDecodeError: ignored