In [1]:
import os
import json
import pandas as pd
import tqdm

In [2]:
import nltk
from nltk import pos_tag, word_tokenize, RegexpParser

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def extract_verb_phrases(text):
    words = word_tokenize(text)
    tagged_words = pos_tag(words)

    # extract VP and NP with regexp
    grammar = r"""
        VP: {<MD>?<VB.*><NP|PP>*}
        NP: {<DT>?<JJ.*>*<NN.*>+}
        PP: {<IN><NP>}
        RP: {<RP><NP>}
    """
    chunk_parser = RegexpParser(grammar)
    tree = chunk_parser.parse(tagged_words)
    
    vc_list = []
    vc = ""
    vc_c = 0
    for subtree in tree:
        if type(subtree) is not tuple:
            if subtree.label() == "VP":
                if len(vc) > 0: ## If there is a verb phrase before it, add it to the list
                    if vc_c == 1: ## Exclude when verbs appear consecutively
                        pass
                    else:
                        vc_list.append(vc)
                    vc_c = 0
                    vc = ""
                
                if subtree[0][1] != "VBZ" and subtree[0][0] != "happened": ## Exclude Be verb, happened   
                    vc = " ".join([word for word, tag in subtree.leaves()])
                    vc_c += 1
            else:
                if len(vc) > 0:
                    vc += " " + " ".join([word for word, tag in subtree.leaves()])
                    vc_c += 1
        else:
            if subtree[1] in ["RP", "IN"] and vc_c == 1: # Adverb immediately after verb (Cannot be detected by chunking)
                vc += " " + subtree[0]
                vc_c += 1
            elif len(vc) > 0:
                if vc_c == 1: ## Exclude single verb
                    pass
                else:
                    vc_list.append(vc)
                vc = ""
                vc_c = 0
    if vc_c > 1:
        vc_list.append(vc)
    
    #print(tree)
    return vc_list

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ubuntu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# create folder for each dataset first    

In [3]:
def save_json(content, save_path):
    with open(save_path, 'w') as f:
        f.write(json.dumps(content))
def load_jsonl(filename):
    with open(filename, "r") as f:
        return [json.loads(l.strip("\n")) for l in f.readlines()]

In [None]:
# STAR

In [7]:
train_path = '/data/VQA/data/star/STAR_train.json'
val_path = '/data/VQA/data/star/STAR_val.json'

In [8]:
train = json.load(open(train_path))
val = json.load(open(val_path))

In [9]:
new_train = []
new_val = []
for qa in train:
    qa_dict = {}
    qa_dict['video'] = qa['video_id']
    qa_dict['num_option'] = 4
    qa_dict['qid'] = qa['question_id']
    for i, choice in enumerate(qa['choices']):
        qa_dict['a{}'.format(str(i))] = choice['choice']
        if choice['choice'] == qa['answer']:
            answer = i
    qa_dict['answer'] = answer
    qa_dict['question'] = qa['question']
    qa_dict['q_events'] = extract_verb_phrases(qa['question'])
    
    qa_dict['start'] = qa['start']
    qa_dict['end'] = qa['end']
    new_train.append(qa_dict)

for qa in val:
    qa_dict = {}
    qa_dict['video'] = qa['video_id']
    qa_dict['num_option'] = 4
    qa_dict['qid'] = qa['question_id']
    for i, choice in enumerate(qa['choices']):
        qa_dict['a{}'.format(str(i))] = choice['choice']
        if choice['choice'] == qa['answer']:
            answer = i
    qa_dict['answer'] = answer
    qa_dict['question'] = qa['question']
    qa_dict['q_events'] = extract_verb_phrases(qa['question'])
    qa_dict['start'] = qa['start']
    qa_dict['end'] = qa['end']
    new_val.append(qa_dict)

In [10]:
len(new_train)

45731

In [44]:
save_json(new_train, '/data/VQA/MELA/prep_json/star/train.json')
save_json(new_val, '/data/VQA/MELA/prep_json/star/val.json')

In [36]:
new_train

[{'video': 'TJZ0P',
  'num_option': 4,
  'qid': 'Interaction_T1_4',
  'a0': 'The sandwich.',
  'a1': 'The medicine.',
  'a2': 'The blanket.',
  'a3': 'The box.',
  'answer': 0,
  'question': 'Which object was eaten by the person?',
  'q_events': ['eaten by the person'],
  'start': 7.7,
  'end': 15.7},
 {'video': 'TJZ0P',
  'num_option': 4,
  'qid': 'Interaction_T1_5',
  'a0': 'The book.',
  'a1': 'The refrigerator.',
  'a2': 'The closet/cabinet.',
  'a3': 'The window.',
  'answer': 0,
  'question': 'Which object was closed by the person?',
  'q_events': ['closed by the person'],
  'start': 6.7,
  'end': 11.5},
 {'video': 'B82GJ',
  'num_option': 4,
  'qid': 'Interaction_T1_7',
  'a0': 'The food.',
  'a1': 'The laptop.',
  'a2': 'The towel.',
  'a3': 'The broom.',
  'answer': 2,
  'question': 'Which object was taken by the person?',
  'q_events': ['taken by the person'],
  'start': 17.7,
  'end': 22.5},
 {'video': 'DUZDL',
  'num_option': 4,
  'qid': 'Interaction_T1_8',
  'a0': 'The sof