In [2]:
import pandas as pd
import tokenizers
import requests 
import json 
import os
import spacy

In [4]:
final_df = pd.DataFrame()
for relation_file in os.listdir("data/relations"):
    df = pd.read_parquet(f"data/relations/{relation_file}")
    final_df = pd.concat([final_df, df])

In [5]:
final_df.index = range(len(final_df))

In [6]:
final_df.head()

Unnamed: 0,id,text,entity,object,relation,title
0,7.0,"Finance Minister Mark\nEyskens, who currently ...",Minister_Mark_Eyskens,issue,has_called,"BELGIAN ECU COIN ISSUE PRICED, SALE DATE SET"
1,1.0,Bank governor Chang Chi-Cheng told reporters t...,governor_Chang_Chi_Cheng,reporters,told,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH
2,2.0,"He said the rise showed signs of slowing, howe...",Taiwan,import_policy,has_liberalised,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH
3,3.0,Chang declined to predict how high the reserve...,Chang,to_predict_how_high_the_reserves_might_rise,declined,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH
4,4.0,"In January, Taiwan reduced import tariffs of u...",Taiwan,import_tariffs,reduced,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH


In [7]:
nlp = spacy.load("en_core_web_lg")

In [8]:
noun_indexes = []
new_objs = []
for index, row in final_df.iterrows():
    cleaned = [c for c in row["object"].split("_") if c not in ["Reuter", "Reuters", "REUTER", "REUTERS", "reuter", "reuters"]]
    cleaned = " ".join(cleaned)
    doc = nlp(cleaned)
    nouns = []
    for d in doc:
        if d.tag_ in ["NNP", "NNPS"]:
            if index not in noun_indexes:
                noun_indexes.append(index)
            noun = str(d.text)
            if noun not in nouns:
                nouns.append(noun)
    if nouns:
        new_objs.append(" ".join(nouns))

In [9]:
clean_df = final_df[final_df.index.isin(noun_indexes)]

In [10]:
clean_entities = []

for index, row in clean_df.iterrows():
    cleaned = " ".join([x for x in row["entity"].split("_") if x not in ["Reuter", "Reuters", "REUTER", "REUTERS", "reuter", "reuters"] ])
    clean_entities.append(cleaned)

In [11]:
len(noun_indexes)

4196

In [12]:
clean_df["clean_entities"] = clean_entities

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["clean_entities"] = clean_entities


In [13]:
len(clean_df)

4196

In [14]:
clean_df["object_noun"] = new_objs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["object_noun"] = new_objs


In [15]:
def change_verb_tense(pair, new_tense):
    try:
        verb = pair.split("_")[-1]
        assert new_tense in ["past", "present", "future"]
        json_obj = {"verb": verb, "tense": new_tense}

        url = "http://localhost:5600/change" 
    
        res = requests.post(url, json=json_obj)
        new_verb = res.json()
    except AssertionError:
        print(f"Tense is 'past', 'present' or 'future'\nYou entered: {new_tense}")
        new_verb = None 
    except:
        new_verb = "error"
    return new_verb
    


In [16]:
past_verb_obj = []

for verb in clean_df['relation']:

    new_verb = change_verb_tense(verb, "past")
    past_verb_obj.append(new_verb)

In [17]:
past_verb_obj[0:10]

['backed',
 'asked',
 'appeared',
 'had',
 'launched',
 'issued',
 'told',
 'held',
 'decided',
 'hailed']

In [18]:
present_verb_obj = []

for verb in clean_df['relation']:

    new_verb = change_verb_tense(verb, "present")
    present_verb_obj.append(new_verb)

In [19]:
future_verb_obj = []

for verb in clean_df['relation']:

    new_verb = change_verb_tense(verb, "future")
    future_verb_obj.append(new_verb)

In [20]:
clean_df["relation_past"] = past_verb_obj
clean_df["relation_present"] = present_verb_obj
clean_df["relation_future"] = future_verb_obj

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["relation_past"] = past_verb_obj
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["relation_present"] = present_verb_obj
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["relation_future"] = future_verb_obj


In [21]:
clean_df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df.drop_duplicates(inplace=True)


In [22]:
len(clean_df)

4050

In [24]:
clean_df = clean_df[clean_df['relation_past'] != "error"]
clean_df = clean_df[clean_df['relation_present'] != "error"]

In [25]:
def get_relation_action(text, rel):
    r_value = rel.split("_")[-1]
    t_list = text.split(" ")
    t_index = t_list.index(r_value)+1
    action = " ".join(t_list[t_index: ])
    return action

def build_context(text, rel, ent):
    action = get_relation_action(text, rel)
    print(f"{' '.join(ent.split('_'))} {rel} {action}")

def build_answer(text, rel, obj, ent):
    action = get_relation_action(text, rel)
    action = action\
        .replace(obj, " ")\
        .strip()\
        .replace("\n", " ")\
        .replace("...", "")
    for ent in ent.split("_"):
        action = action.replace(ent, " ")
    print(f"{action}")

build_context(text="Wang Chang-Ming, Vice Chairman of the Council for Economic\nPlanning and Development, told Reuters the government is\nplanning another round of deep tariff cuts in the second half\nof this year.\n...", ent="Wang_Chang_Ming", rel="told")

Wang Chang Ming told Reuters the government is
planning another round of deep tariff cuts in the second half
of this year.
...


In [26]:
build_answer(text="Wang Chang-Ming, Vice Chairman of the Council for Economic\nPlanning and Development, told Reuters the government is\nplanning another round of deep tariff cuts in the second half\nof this year.\n...", rel="told", obj="Reuters", ent="Wang_Chang_Ming")

the government is planning another round of deep tariff cuts in the second half of this year. 


In [27]:
len(past_verb_obj)

4196

In [28]:
clean_df['relation_future'] = ["pay" if x == "paid" else x for x in clean_df["relation_future"]]

In [29]:
clean_df['relation_future'] = ["rumor" if x == "rumore" else x for x in clean_df["relation_future"]]

In [30]:
change_verb_tense("pay", "future")

'pay'

In [31]:
verb_wh_pairs  = pd.read_csv("data/future_token_counts.csv", index_col=False)

In [32]:
verb_wh_pairs["token"] = [x.strip() for x in verb_wh_pairs["token"]]

In [33]:
verb_wh_pairs = verb_wh_pairs[verb_wh_pairs["token"] != "remove"]

In [34]:
len(verb_wh_pairs)

510

In [35]:
clean_df[clean_df["relation_future"] == "stop"]

Unnamed: 0,id,text,entity,object,relation,title,clean_entities,object_noun,relation_past,relation_present,relation_future
1856,0.0,Botswana has stopped importing almost\nall mea...,Botswana,importing_almost_\n_all_meat_products_from_Zim...,has_stopped,"BOTSWANA BANS ZIMBABWE MEAT PRODUCTS, AGENCY SAYS",Botswana,Zimbabwe,stopped,stops,stop
5969,0.0,Zambia has stopped sending its copper\nexports...,Zambia,sending_its_copper_\n_exports_through_South_Af...,has_stopped,ZAMBIA STOPS SENDING COPPER THROUGH SOUTH AFRICA,Zambia,South Africa,stopped,stops,stop
6624,11.0,Cairo stopped repaying Moscow for arms purchas...,Cairo,repaying_Moscow_for_arms_purchases_in_1977_\n_...,stopped,"EGYPT, SOVIETS TO RENEGOTIATE ARMS DEBT TERMS",Cairo,Moscow president Anwar Sadat U.S,stopped,stops,stop
6813,11.0,Cairo stopped repaying Moscow for arms purchas...,Cairo,repaying_Moscow_for_arms_purchases_in_1977_\n_...,stopped,"EGYPT, SOVIET UNION TO RENEGOTIATE ARMS DEBT T...",Cairo,Moscow president Anwar Sadat U.S,stopped,stops,stop
8048,1.0,Baldrige will also stop in Hong Kong to meet B...,Baldrige,to_meet_British_\n_officials_and_local_U.S._An...,will_stop,BALDRIGE TO LAUNCH FAR EAST TRADE DRIVE,Baldrige,U.S. Hong Kong,stopped,stops,stop
12379,1.0,"Peru, the world's second biggest silver produc...",Peru,selling_its_refined_silver_and_state_-_markete...,stopped,PERU PRESIDENT WARNS OF RETALIATION ON SILVER,Peru,tuesday,stopped,stops,stop
14104,2.0,The company said deferrals of payment obligati...,Zapata,paying_interest_on_the_debentures_in_April_\n_...,stopped,ZAPATA <ZOS> WON'T PAY INTEREST ON DEBENTURES,Zapata,April,stopped,stops,stop
14489,13.0,But Rafidain bank stopped paying debt due on l...,Rafidain_bank,paying_debt_due_on_letters_of_\n_credit_last_M...,stopped,IRAQ DEFERS PAYMENTS ON 500 MLN DLR EUROLOAN,Rafidain bank,March,stopped,stops,stop
14519,1.0,<Ryoka Light Metal Industries Ltd> will stop s...,Ryoka_Light_Metal_Industries_Ltd_>,smelting_in_\n_April,will_stop,NIPPON LIGHT METAL CONTINUES ALUMINIUM OUTPUT CUT,Ryoka Light Metal Industries Ltd >,April,stopped,stops,stop
15586,12.0,"""\n An immediate goal is to get Shell to st...",Shell,selling_fuel_to_\n_South_Afria_'s_military_and...,to_stop,ROYAL DUTCH <RD> OBJECT OF SHAREHOLDER CAMPAIGN,Shell,South Afria,stopped,stops,stop


In [36]:
wh_lookup = {k[0]:k[1] for k in verb_wh_pairs[["token", "wh"]].values}

In [37]:
wh_lookup['offer']

'what'

In [38]:
import re 

def get_relation_action(text, rel):
    try:
        r_value = rel.split("_")[-1]
        t_list = text.split(" ")
        t_index = t_list.index(r_value)+1
        action = " ".join(t_list[t_index: ])
    except ValueError:
        r_value = rel.split("_")[-1]
        text = re.sub(r"""
                [,.;@#?!&$")(-/]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               " ",          # and replace it with a single space
               text, flags=re.VERBOSE)
        t_list = text.replace("\n", " ").split(" ")
        t_index = t_list.index(r_value)+1
        action = " ".join(t_list[t_index: ])
    return t_index, action. \
        replace("\n", " "). \
            strip()

def build_text(text, rel, ent):
    _, action = get_relation_action(text, rel)
    return f"{' '.join(ent.split('_'))} {' '.join(rel.split('_'))} {action}".strip()

def build_answer(text, rel, obj, ent):
    action = get_relation_action(text.replace("\n", " "), rel)
    action = action\
        .replace(obj, " ")\
        .strip()\
        .replace("\n", " ")\
        .replace("...", "")
    for ent in ent.split("_"):
        action = action.replace(ent, " ")
    print(f"{action}")

def find_numbers(text, obj):
    pass


def build_question(wh, relation, obj, ent):
    ent = " ".join(ent.split("_"))
    obj = " ".join(obj.split("_"))
    q = f"{wh.capitalize()} did {ent} {relation}?"

    return q





In [39]:
text = clean_df.iloc[500]["text"]
relation = clean_df.iloc[500]["relation"]
entity = clean_df.iloc[500]["entity"]

ind, a = get_relation_action(text, relation)

In [40]:
build_text(text, relation, entity)

'IMF team urges Belgium to adopt a firm interest rate policy, with a particular emphasis on long-term rates.'

In [41]:
text

'The IMF team also urges Belgium to adopt a firm interest\nrate policy, with a particular emphasis on long-term rates.\n    '

In [42]:
paragraphs = []
qas = []

TEMPLATE = {"title": "this is a title",
            "paragraphs": [
                {"qas": [],
                 "context": "dfs"}]}
data = []
error_count = 0
error_indices = []
for title in clean_df["title"].unique():
    
    template = TEMPLATE
    template["title"] = title
    qas = []
    for index, row in clean_df[clean_df["title"] == title].iterrows():

        wh = wh_lookup.get(row["relation_future"]) 

        text = row["text"]
        
        clean_relation = row["relation_future"]
        relation = row["relation"]

        obj = row["object"]
        clean_obj = row["object_noun"]
        
        clean_entity = row["clean_entities"]
        entity = row["entity"]


        try:
            q_text = build_text(text, relation, entity)
            assert(obj != clean_entity)
        
            mod = "did"
    
            q = build_question(wh, clean_relation, clean_obj, clean_entity)
        except AssertionError:
            print("Assertion error build text")
            q = "AssertionError"
        except:
            q = "Error"
        
        #print(f"{text}\n{clean_entity}")
        try:
            answer_start_num, answer = get_relation_action(text, relation)
            qa = {"question": q,
                            "id": str(index),
                            "answers": [{"text": q_text, "answer_start": answer_start_num}],
                            "is_impossible": False
                            }
            qas.append(qa)
        except ValueError as e:
            print(f"Value error get relation:\n{e}")
            print(text)
            error_count += 1
            error_indices.append(index)
        qa_dict = {"qas": qas, "context": text}
        paragraphs = [qa_dict]

        final_dict = {"title": title, "paragraphs": paragraphs}
        data.append(final_dict)
    
        

In [43]:
final_dict = {"data": data}

In [46]:
with open("data/reuters/reuters_squad_form.json", "w") as f:
    d = json.dumps(final_dict)
    f.write(d)

In [45]:
data[0]

{'title': 'IRAN REPORTS HEAVY FIGHTING IN IRAQI KURDISTAN',
 'paragraphs': [{'qas': [{'question': 'What did Iran back?',
     'id': '7',
     'answers': [{'text': 'Iran has backed dissident Kurds in the area in attacks on government positions and installations in northern Iraq.',
       'answer_start': 3}],
     'is_impossible': False}],
   'context': 'Iran has backed dissident Kurds in the area in attacks on\ngovernment positions and installations in northern Iraq.\n    '}]}