In [None]:
from google.colab import drive
from google.colab import files
import pandas as pd
import json
import os
import requests
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/TA/Sem 8/")

Mounted at /content/drive


In [None]:
os.listdir()

['toy-ende',
 'dataset',
 'indonlu-master',
 'notebook',
 'Draft Bab III.docx',
 '13517031_Karina Iswara_Laporan M1.docx',
 '13517031_Karina Iswara_Resume TA.gdoc',
 'question-generator',
 '13517031_Karina Iswara_Laporan M2.docx',
 'models',
 'view.1',
 'test_ner.txt',
 'test_dict_ner.txt',
 'BERT-KPE',
 'Multilingual-BERT-KPE']

In [None]:
def load_df_from_json(data_type, description):
  path = 'dataset/SQuAD/v2.0/' + data_type + description + '.json'
  print("Reading file at", path)
  return pd.read_json(path)

def save_df_to_json(df, data_type, description):
  path = 'dataset/SQuAD/v2.0/' + data_type + '_' + description + '.json'
  df.to_json(path)
  print('Saving file to', path)

def save_to_json(df, data_type, description):
  path = 'dataset/SQuAD/v2.0/' + data_type + '_' + description + '.json'
  with open(path, 'w') as f:
    df.apply(lambda row: f.write('%s\n' % row.to_json()), axis=1)
  print('Saving file to', path)

def save_list_to_txt(path, list_obj):
  if os.path.exists(path):
    with open(path, 'w') as f:
      f.writelines("%s\n" % element.replace("\n", " ") for element in list_obj)
    print(f'Save to {path}')
  else :
    print(f'Cancel saving, Path already exist : {path}')

def load_list_from_txt(path):
  print("loading file at", path)
  mainlist = []
  infile = open(path,'r')
  for line in infile:
    mainlist.append(line)
  infile.close()
  return mainlist

def save_listDict_to_txt(path, list_obj):
  if not os.path.exists(path):
    with open(path, 'w') as f:
      f.write(json.dumps(list_obj))
      f.write("\n")
    print(f'Save to {path}')
  else :
    print(f'Cancel saving, Path already exist : {path}')

def load_listDict_from_txt(path):
  print("loading file at", path)
  mainlist = []
  infile = open(path,'r')
  for line in infile:
    mainlist.append(json.loads(line))
  infile.close()
  return mainlist[0]

In [None]:
df_squad = load_df_from_json('train', '-v2.0')
print(df_squad.shape)
df_squad.head(5)

Reading file at dataset/SQuAD/v2.0/train-v2.0.json
(442, 2)


Unnamed: 0,version,data
0,v2.0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,v2.0,{'title': 'Sino-Tibetan_relations_during_the_M...
3,v2.0,"{'title': 'IPod', 'paragraphs': [{'qas': [{'qu..."
4,v2.0,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...


In [None]:
def preprocess_ori_for_bert_kpe(df):
  temp_title = []
  temp_abstract = []
  for row in df['data']:
    temp_title.append(row['title'])
    temp_abstract.append(row['paragraphs'])
  df['title'] = temp_title
  df['abstract'] = temp_abstract
  df.drop(columns=['data', 'version'], inplace=True)
  return df

def reconstruct_ori_for_bert_kpe(df_squad):
  data = {'title': [], 'abstract' : [], 'keyword':[]}

  no_keyword = [0, []]
  no_answer = [0, []]
  impossible_keywords = 0
  for index in range(len(df_squad['abstract'])):
    title = df_squad['title'][index]
    for index_abstract in  range(len(df_squad['abstract'][index])):
      keyword = []
      for qa in df_squad['abstract'][index][index_abstract]['qas']:
          answers = qa.get('answers') or qa.get('plausible_answers')
          if not answers:
            no_answer[0] += 1
            no_answer[1].append([index, index_abstract])
            continue

          keyword += answers

      if not keyword :
        no_keyword[0] += 1
        no_keyword[1].append([index, index_abstract])
        continue

      keyword = list({(x['answer_start']):x for x in keyword}.values())

      data['title'].append(title)
      data['abstract'].append(df_squad['abstract'][index][index_abstract]['context'])
      keyword = sorted(keyword, key = lambda x: x['answer_start'])
      keyword = list(set([x['text'] for x in keyword]))
      data['keyword'].append(keyword)
  print(f'{impossible_keywords} answers not found in context abstract')
  print(f'{no_keyword[0]} row(s) have no answers index : {no_keyword[1]}')
  print(f'{no_answer[0]} question(s) have no answers : {no_answer[1]}')
  return pd.DataFrame(data)

In [None]:
df_squad = preprocess_ori_for_bert_kpe(df_squad)
df_squad = reconstruct_ori_for_bert_kpe(df_squad)
print(df_squad.shape)
display(df_squad.head(5))

0 answers not found in context abstract
0 row(s) have no answers index : []
0 question(s) have no answers : []
(19035, 3)


Unnamed: 0,title,abstract,keyword
0,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,"[singing and dancing, lead singer, Mathew Know..."
1,Beyoncé,Following the disbandment of Destiny's Child i...,"[Beyoncé, acting, Sasha Fierce, Cadillac Recor..."
2,Beyoncé,"A self-described ""modern-day feminist"", Beyonc...","[2013 and 2014, 60 million, modern-day feminis..."
3,Beyoncé,"Beyoncé Giselle Knowles was born in Houston, T...","[Joseph Broussard., Solange, her mother's maid..."
4,Beyoncé,Beyoncé attended St. Mary's Elementary School ...,"[dance instructor Darlette Johnson, St. John's..."


In [None]:
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.2.0-py2.py3-none-any.whl (241 kB)
[?25l[K     |█▍                              | 10 kB 24.6 MB/s eta 0:00:01[K     |██▊                             | 20 kB 28.0 MB/s eta 0:00:01[K     |████                            | 30 kB 12.8 MB/s eta 0:00:01[K     |█████▍                          | 40 kB 9.3 MB/s eta 0:00:01[K     |██████▉                         | 51 kB 5.2 MB/s eta 0:00:01[K     |████████▏                       | 61 kB 5.3 MB/s eta 0:00:01[K     |█████████▌                      | 71 kB 5.4 MB/s eta 0:00:01[K     |██████████▉                     | 81 kB 6.1 MB/s eta 0:00:01[K     |████████████▏                   | 92 kB 4.6 MB/s eta 0:00:01[K     |█████████████▋                  | 102 kB 5.0 MB/s eta 0:00:01[K     |███████████████                 | 112 kB 5.0 MB/s eta 0:00:01[K     |████████████████▎               | 122 kB 5.0 MB/s eta 0:00:01[K     |█████████████████▋              | 133 kB 5.0 MB/s eta 0:0

In [None]:
import re
from unidecode import unidecode

def unicode_to_ascii(s):
    return unidecode(s)

non_ascii_regex = re.compile(r"[^\x00-\x7F\u2013]")

# Complete punctuation from string.punctuation: !"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~

def normalize_string(s, lower=False):
    s = unicode_to_ascii(s)
    if lower:
        s = s.lower()
    s = re.sub(' {2,}|\n{1,}', ' ', s)
    s = s.lstrip()
    s = s.rstrip()
    s = s.replace('`', '')
    return s

punctuations = '!"#$%&\'()*+-/;<=>@?[\\]^_`{|}~'
punctuations_regex = re.compile(r"([%s])" % punctuations)
real_separator_regex = re.compile(r"(([\.,:][^a-zA-Z0-9])|([\.,:]$))")
def tokenize(s):
    s = re.sub(punctuations_regex, r" \1 ", s)
    s = re.sub(real_separator_regex, r" \1", s)
    s = s.split()
    return s

In [None]:
def get_max_paragraph_length(df_squad, quantile = 1):
  paragraph_lengths = []
  for paragraph in df_squad['abstract']:
    sentences = nltk.tokenize.sent_tokenize(paragraph)
    length = 0
    for sentence in sentences:
      length += len(tokenize(normalize_string(sentence)))
    paragraph_lengths.append(length)
  df_paragraph_lengths = pd.DataFrame(paragraph_lengths)
  paragraph_lengths_desc = df_paragraph_lengths.describe()
  print(paragraph_lengths_desc, end='\n\n')
  if (quantile < 1):
    return df_paragraph_lengths.quantile(quantile)[0].astype(int)
  return df_paragraph_lengths.max()[0].astype(int)
  
def delete_over_limit_paragraph(df_squad, max_length):
  deleted_row = 0
  for index_paragraph in range(len(df_squad['abstract'])):
    paragraph = df_squad['abstract'][index_paragraph]
    sentences = nltk.tokenize.sent_tokenize(paragraph)
    length = 0
    for sentence in sentences:
      length += len(tokenize(normalize_string(sentence)))
    if (length > max_length):
      print
      df_squad.drop(index_paragraph, inplace=True)
      deleted_row += 1
  print(f'{deleted_row} rows deleted')
  return df_squad

### Remove outlier by paragraph length 

In [None]:
QUANTILE = 0.99
PARAGRAPH_MAX_LENGTH = get_max_paragraph_length(df_squad, QUANTILE)
# PARAGRAPH_MAX_LENGTH_TEST = get_max_paragraph_length(df_squad_kpe_test)
print(PARAGRAPH_MAX_LENGTH)
# print(PARAGRAPH_MAX_LENGTH_TEST)

                  0
count  19035.000000
mean     139.647229
std       59.703494
min       23.000000
25%      103.000000
50%      129.000000
75%      167.000000
max      815.000000

333


In [None]:
df_squad = delete_over_limit_paragraph(df_squad, PARAGRAPH_MAX_LENGTH)
df_squad.reset_index(drop=True, inplace=True)
print(df_squad.shape)
print(df_squad.index)

190 rows deleted
(18845, 3)
RangeIndex(start=0, stop=18845, step=1)


In [None]:
train_data, val_data = train_test_split(df_squad, test_size=0.1, random_state=42)

In [None]:
display(train_data.shape)
train_data.reset_index(drop=True, inplace=True)
train_data.head()

(16960, 3)

Unnamed: 0,title,abstract,keyword
0,Police,Colquhoun's utilitarian approach to the proble...,"[stipendiary system, full-time, Henry and John..."
1,Institute_of_technology,"In Japan, an institute of technology (工業大学, kō...","[Imperial College of Engineering, sciences]"
2,History_of_science,"Further studies, e.g. Jerome Ravetz 1971 Scien...","[scientism, a social construct, settling endur..."
3,Saint_Helena,The island of Saint Helena has a total area of...,"[volcanic, 1996, 2,684, 47, Millennium Forest ..."
4,Pacific_War,In an effort to discourage Japanese militarism...,"[""ABCD line"", stopped selling oil, iron ore, a..."


In [None]:
save_to_json(train_data, 'train', 'reconstruct_0.99_for_bert_kpe')

Saving file to dataset/SQuAD/v2.0/train_reconstruct_0.99_for_bert_kpe.json


In [None]:
display(val_data.shape)
val_data.reset_index(drop=True, inplace=True)
val_data.head()

(1885, 3)

Unnamed: 0,title,abstract,keyword
0,Light-emitting_diode,Assistive listening devices in many theaters a...,"[to send sound to listeners' receivers, theate..."
1,2008_Sichuan_earthquake,"The AP reported that ""The state-controlled med...","[propaganda bureau, state-controlled media, Th..."
2,Supreme_court,With respect to Pakistan's territories (i.e. F...,"[ICT, Azad Kashmir, appeals only of a constitu..."
3,Seven_Years%27_War,"During the war, the Seven Nations of Canada we...","[The Iroquois, dominant in what is now Upstate..."
4,Immunology,Maternal factors also play a role in the body’...,[distinct time frames found in vaccination sch...


In [None]:
save_to_json(val_data, 'val', 'reconstruct_0.99_for_bert_kpe')

Saving file to dataset/SQuAD/v2.0/val_reconstruct_0.99_for_bert_kpe.json


In [None]:
df_squad_test = load_df_from_json('dev', '-v2.0')
print(df_squad_test.shape)
df_squad_test.head(5)

Reading file at dataset/SQuAD/v2.0/dev-v2.0.json
(35, 2)


Unnamed: 0,version,data
0,v2.0,"{'title': 'Normans', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Computational_complexity_theory', '..."
2,v2.0,"{'title': 'Southern_California', 'paragraphs':..."
3,v2.0,"{'title': 'Sky_(United_Kingdom)', 'paragraphs'..."
4,v2.0,"{'title': 'Victoria_(Australia)', 'paragraphs'..."


In [None]:
df_squad_test = preprocess_ori_for_bert_kpe(df_squad_test)
df_squad_test = reconstruct_ori_for_bert_kpe(df_squad_test)
print(df_squad_test.shape)
display(df_squad_test.head(5))

0 answers not found in context abstract
0 row(s) have no answers index : []
15 question(s) have no answers : [[7, 14], [7, 14], [7, 17], [7, 21], [7, 24], [7, 25], [7, 25], [7, 25], [7, 26], [7, 27], [7, 30], [7, 31], [7, 34], [7, 36], [7, 41]]
(1204, 3)


Unnamed: 0,title,abstract,keyword
0,Normans,The Normans (Norman: Nourmands; French: Norman...,"[Denmark, Iceland and Norway, Normandy, the fi..."
1,Normans,"The Norman dynasty had a major political, cult...","[Richard I, political, cultural and military, ..."
2,Normans,"The English name ""Normans"" comes from the Fren...","[9th century, Norseman, Viking, ""Normans"", Vik..."
3,Normans,"In the course of the 10th century, the initial...","[Epte, treaty of Saint-Clair-sur-Epte, 911, fu..."
4,Normans,"Before Rollo's arrival, its populations did no...","[Danes, Norwegians, Norse–Gaels, Orkney Viking..."


In [None]:
df_squad_test = delete_over_limit_paragraph(df_squad_test, PARAGRAPH_MAX_LENGTH)
df_squad_test.reset_index(drop=True, inplace=True)
print(df_squad_test.shape)
print(df_squad_test.index)

25 rows deleted
(1179, 3)
RangeIndex(start=0, stop=1179, step=1)


In [None]:
save_to_json(df_squad_test, 'test', 'reconstruct_0.99_for_bert_kpe')

Saving file to dataset/SQuAD/v2.0/test_reconstruct_0.99_for_bert_kpe.json
