In [16]:
import pandas as pd
import tokenizers
import requests 
import json
import re
import os
import spacy

In [2]:
final_df = pd.DataFrame()
for relation_file in os.listdir("../data/relations"):
    df = pd.read_parquet(f"../data/relations/{relation_file}")
    final_df = pd.concat([final_df, df])

In [3]:
final_df.head()

Unnamed: 0,id,text,entity,object,relation,title
0,7.0,"Finance Minister Mark\nEyskens, who currently ...",Minister_Mark_Eyskens,issue,has_called,"BELGIAN ECU COIN ISSUE PRICED, SALE DATE SET"
0,1.0,Bank governor Chang Chi-Cheng told reporters t...,governor_Chang_Chi_Cheng,reporters,told,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH
3,2.0,"He said the rise showed signs of slowing, howe...",Taiwan,import_policy,has_liberalised,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH
4,3.0,Chang declined to predict how high the reserve...,Chang,to_predict_how_high_the_reserves_might_rise,declined,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH
5,4.0,"In January, Taiwan reduced import tariffs of u...",Taiwan,import_tariffs,reduced,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH


In [4]:
all_df = pd.read_parquet("../data/reuters/reuters_all.parquet")

In [5]:
all_df.head()

Unnamed: 0,old_id,new_id,has_topics,date,topics,places,people,orgs,exchanges,companies,title,dateline,body,author,cgi_split,lewis_split
0,9914,5001,True,1987-03-13,acq,usa,,,,,PLM <PLMA> UNIT ENDS MERGER TALKS,"SAN FRANCISCO, March 13 -",PLM Cos Inc said its PLM Power Co\nunit broke ...,,TRAINING-SET,TRAIN
1,9915,5002,True,1987-03-13,coffee,colombia,,,,,COLOMBIA OPENS APRIL/MAY COFFEE REGISTRATIONS,"BOGOTA, March 13 -",Colombia opened coffee export\nregistrations f...,,TRAINING-SET,TRAIN
2,9916,5003,True,1987-03-13,grain,usa,,,,,USDA REPORTS 10.572 MLN ACRES IN CONSERVATION,"WASHINGTON, March 13 -",The U.S. Agriculture Department has\naccepted ...,,TRAINING-SET,TRAIN
3,9917,5004,True,1987-03-13,,usa,,,,,BRAZIL DEBT POSES THORNY ISSUE FOR U.S. BANKS,"NEW YORK, March 13 -",CitiCorp <CCI> appears to be digging\nin its h...,"By Cal Mankowski, Reuters",TRAINING-SET,TRAIN
4,9918,5005,True,1987-03-13,,usa,,,,,ROCKWELL <ROK> TO REPURCHASE MORE COMMON SHARES,"PITTSBURGH, March 13 -",Rockwell International Corp said its\nboard ha...,,TRAINING-SET,TRAIN


In [6]:
full_text_map = {}

for title in list(final_df['title'].unique()):
    title_df = all_df[all_df["title"] == title]
    if len(title_df) > 1:
        title_body = \
            title_df["body"].values[0]
    else:
        title_body = title_df["body"].values[0]
    full_text_map[title] = title_body

In [55]:
final_df["context"] = final_df['title'].map(lambda t: full_text_map.get(t))

In [56]:
final_df.index = range(len(final_df))

In [57]:
def clean_text(text):
    text = text.replace("\n", " ")
    text = re.sub(' +', ' ', text)
    return text

In [58]:
clean_text("A messy sentence\n about    something. Here is another sentebnce.")

'A messy sentence about something. Here is another sentebnce.'

In [59]:
final_df["context"] = final_df["context"].apply(clean_text)

In [60]:
final_df.head(50)

Unnamed: 0,id,text,entity,object,relation,title,context
0,7.0,"Finance Minister Mark\nEyskens, who currently ...",Minister_Mark_Eyskens,issue,has_called,"BELGIAN ECU COIN ISSUE PRICED, SALE DATE SET",A limited Belgian issue of silver Ecu coins wi...
1,1.0,Bank governor Chang Chi-Cheng told reporters t...,governor_Chang_Chi_Cheng,reporters,told,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH,Taiwan's foreign exchange reserves hit a new h...
2,2.0,"He said the rise showed signs of slowing, howe...",Taiwan,import_policy,has_liberalised,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH,Taiwan's foreign exchange reserves hit a new h...
3,3.0,Chang declined to predict how high the reserve...,Chang,to_predict_how_high_the_reserves_might_rise,declined,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH,Taiwan's foreign exchange reserves hit a new h...
4,4.0,"In January, Taiwan reduced import tariffs of u...",Taiwan,import_tariffs,reduced,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH,Taiwan's foreign exchange reserves hit a new h...
5,6.0,"Wang Chang-Ming, Vice Chairman of the Council ...",Wang_Chang_Ming,Reuters,told,TAIWAN FOREIGN EXCHANGE RESERVES HIT NEW HIGH,Taiwan's foreign exchange reserves hit a new h...
6,1.0,"Iran launched the new offensive, codenamed Kar...",Iran,offensive,launched,IRAN REPORTS HEAVY FIGHTING IN IRAQI KURDISTAN,Iran said its troops repulsed heavy Iraqi coun...
7,10.0,Iran has backed dissident Kurds in the area in...,Iran,Kurds,has_backed,IRAN REPORTS HEAVY FIGHTING IN IRAQI KURDISTAN,Iran said its troops repulsed heavy Iraqi coun...
8,12.0,The Iraqis have made no comment so far on the ...,Iraqis,comment,have_made,IRAN REPORTS HEAVY FIGHTING IN IRAQI KURDISTAN,Iran said its troops repulsed heavy Iraqi coun...
9,12.0,The Iraqis have made no comment so far on the ...,Tehran,comment,reported,IRAN REPORTS HEAVY FIGHTING IN IRAQI KURDISTAN,Iran said its troops repulsed heavy Iraqi coun...


In [61]:
final_df.iloc[0]["context"]

'A limited Belgian issue of silver Ecu coins with a face value of five Ecus will go on sale from March 23 at a price of 500 Belgian francs each, a Finance Ministry spokesman said. Gold Ecu coins with a face value of 50 Ecus will be sold from the same day. The spokesman told Reuters the price for these would be fixed just before they go on sale but was likely to be between 8,500 and 9,000 francs. At least two mln silver coins and several hundreds of thousands of the gold coins will be minted, he said. They will be sold both in Belgium and abroad. The coins will be the first ever denominated in the Ecu, the "basket" comprised of the 12-nation European Community\'s currencies except the Spanish peseta and the Portuguese escudo. The issue is being made to mark the 30th anniversary of the EC\'s founding Treaty of Rome this month. Finance Minister Mark Eyskens, who currently presides over the EC\'s council of economic and finance ministers, has called the issue a political act of symbolic va

In [20]:
nlp = spacy.load("en_core_web_lg")

In [21]:
noun_indexes = []
new_objs = []
for index, row in final_df.iterrows():
    cleaned = [c for c in row["object"].split("_") if c not in ["Reuter", "Reuters", "REUTER", "REUTERS", "reuter", "reuters"]]
    cleaned = " ".join(cleaned)
    doc = nlp(cleaned)
    nouns = []
    for d in doc:
        if d.tag_ in ["NNP", "NNPS"]:
            if index not in noun_indexes:
                noun_indexes.append(index)
            noun = str(d.text)
            if noun not in nouns:
                nouns.append(noun)
    if nouns:
        new_objs.append(" ".join(nouns))

In [62]:
clean_df = final_df[final_df.index.isin(noun_indexes)]

In [63]:
clean_entities = []

for index, row in clean_df.iterrows():
    cleaned = " ".join([x for x in row["entity"].split("_") if x not in ["Reuter", "Reuters", "REUTER", "REUTERS", "reuter", "reuters"] ])
    clean_entities.append(cleaned)

In [64]:
len(noun_indexes)

4196

In [65]:
clean_df["clean_entities"] = clean_entities

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["clean_entities"] = clean_entities


In [66]:
len(clean_df.drop_duplicates())

4050

In [67]:
clean_df["object_noun"] = new_objs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["object_noun"] = new_objs


In [68]:
clean_df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df.drop_duplicates(inplace=True)


"""def change_verb_tense(pair, new_tense):
    try:
        verb = pair.split("_")[-1]
        assert new_tense in ["past", "present", "future"]
        json_obj = {"verb": verb, "tense": new_tense}

        url = "http://localhost:5600/change" 
    
        res = requests.post(url, json=json_obj)
        new_verb = res.json()
    except AssertionError:
        print(f"Tense is 'past', 'present' or 'future'\nYou entered: {new_tense}")
        new_verb = None 
    except:
        new_verb = "error"
    return new_verb
"""

def change_verb_tense(word, new_tense):

    try:
        conjugated_word = conjugate("walked", new_tense)
    except:
        conjugated_word = word
    return conjugated_word



In [69]:
past_verb_obj = []
missed = []
for index, row in clean_df.iterrows():
    verbs =  row["relation"].split("_")
    new_verb = None
    if len(verbs) > 1:
        for v in verbs:
            lemma = nlp(v)[0].lemma_
            if nlp(lemma)[0].pos_:
                new_verb = lemma
            else:
                continue
    else:
        new_verb = nlp(row["relation"])[0].lemma_
    if new_verb:
        past_verb_obj.append(new_verb)
    else:
        past_verb_obj.append("error")
        missed.append(index)

In [70]:
past_verb_obj[0:10]

['back',
 'ask',
 'appear',
 'have',
 'launch',
 'issue',
 'tell',
 'hold',
 'decide',
 'hail']

In [71]:
#from pattern3.en import conjugate, PAST, PRESENT, SINGULAR, PLURAL, FUTURE
clean_df["relation_past"] = past_verb_obj

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df["relation_past"] = past_verb_obj


In [72]:
clean_df.head()

Unnamed: 0,id,text,entity,object,relation,title,context,clean_entities,object_noun,relation_past
7,10.0,Iran has backed dissident Kurds in the area in...,Iran,Kurds,has_backed,IRAN REPORTS HEAVY FIGHTING IN IRAQI KURDISTAN,Iran said its troops repulsed heavy Iraqi coun...,Iran,Kurds,back
13,1.0,"Japan, the biggest buyer of Indian iron ore wi...",Japan,India,has_asked,INDIA AND JAPAN TO DISCUSS IRON ORE PRICES,The state-owned Minerals and Metals Trading Co...,Japan,India,ask
15,7.0,Banking sources said that although the January...,ALP_growth,to_have_accelerated_in_February,appeared,SPAIN RAISES BANKS' RESERVE REQUIREMENT,The Bank of Spain said it raised the reserve r...,ALP growth,February,appear
16,2.0,"In December, the BLEU had an 11.9 billion fran...",BLEU,franc_trade_surplus_REUTER,had,BELGOLUX TRADE MOVES INTO SURPLUS IN 1986,The Belgo-Luxembourg Economic Union (BLEU) mov...,BLEU,franc,have
18,0.0,Caisse Centrale de Credit Cooperatif is\nlaunc...,Caisse_Centrale_de_Credit_Cooperatif,bond_issue_managers_Credit_Lyonnais,is_launching,CENTRALE DE CREDIT COOPERATIF ISSUE DETAILED,Caisse Centrale de Credit Cooperatif is launch...,Caisse Centrale de Credit Cooperatif,Credit Lyonnais,launch


In [73]:
clean_df = clean_df[clean_df['relation_past'] != "error"]

In [74]:
clean_df['relation_past'] = ["pay" if x == "paid" else x for x in clean_df["relation_past"]]

In [75]:
clean_df['relation_past'] = ["rumor" if x == "rumore" else x for x in clean_df["relation_past"]]

In [76]:
#change_verb_tense("ended", "future")

In [77]:
verb_wh_pairs  = pd.read_csv("../data/future_token_counts.csv", index_col=False)

In [78]:
verb_wh_pairs["token"] = [x.strip() for x in verb_wh_pairs["token"]]

In [79]:
verb_wh_pairs = verb_wh_pairs[verb_wh_pairs["token"] != "remove"]

In [80]:
len(verb_wh_pairs)

510

In [81]:
wh_lookup = {k[0]:k[1] for k in verb_wh_pairs[["token", "wh"]].values}

In [82]:
wh_lookup['appear']

'when'

In [235]:
import re


def get_index_of_punc(text):
    #pat = r'[.?\-,:]+'
    pat = r'[.?;]+'
    res = re.findall(pat, text)
    if res and len(res) != 0:
        punc = res[0]
    elif res:
        punc = res
    else:
        raise ValueError
    stop_index = text.index(punc)
    return stop_index

def get_short_answer_text(text, start_index):
    start_token = text.split(" ")[start_index]
    string_start_index = text.index(start_token)
    chnk1 =  text[string_start_index:]

    stop_index = get_index_of_punc(chnk1)
    short_answer = chnk1[0:stop_index+1]
    return short_answer

def get_relation_action(text, rel):
    try:
        r_value = rel.split("_")[-1]
        t_list = text.split(" ")
        t_index = t_list.index(r_value)+1
        action = get_short_answer_text(text, t_index)
    except ValueError:

        r_value = rel.split("_")[-1]
        relation_stemmed = nlp(r_value)[0].lemma_

        t_list = text.replace("\n", " ").split(" ")
        t_index = t_list.index(relation_stemmed)+1
        action = get_short_answer_text(text, t_index)

    return t_index, action. \
    replace("\n", " "). \
        strip()



def build_text(text, rel, ent):
    _, action = get_relation_action(text, rel)
    return f"{' '.join(ent.split('_'))} {' '.join(rel.split('_'))} {action}".strip()

def build_answer(text, rel, obj, ent):
    action = get_relation_action(text.replace("\n", " "), rel)
    action = action\
        .replace(obj, " ")\
        .strip()\
        .replace("\n", " ")\
        .replace("...", "")
    for ent in ent.split("_"):
        action = action.replace(ent, " ")
    print(f"{action}")

def build_question(wh, relation, obj, ent):
    ent = " ".join(ent.split("_"))
    obj = " ".join(obj.split("_"))

    if wh:
        q = f"{wh.capitalize()} did {ent} {relation}?"
    else:
        relation_stemmed = nlp(relation)[0].lemma_
        try:
            wh = wh_lookup.get(relation_stemmed)
            q = f"{wh.capitalize()} did {ent} {relation_stemmed}?"
        except AttributeError:
            q = f"What did {ent} {relation_stemmed}?"

    return q


In [236]:
text = clean_df.iloc[500]["text"]
relation = clean_df.iloc[500]["relation"]
entity = clean_df.iloc[500]["entity"]

ind, a = get_relation_action(text, relation)

In [237]:
build_text(text, relation, entity)

'IMF team urges Belgium to adopt a firm interest rate policy, with a particular emphasis on long-term rates.'

In [238]:
paragraphs = []
qas = []
questions = []
TEMPLATE = {"title": "this is a title",
            "paragraphs": [
                {"qas": [],
                 "context": "dfs"}]}
data = []
error_count = 0
error_indices = []
for title in clean_df["title"].unique():
    
    template = TEMPLATE
    template["title"] = title
    qas = []
    for index, row in clean_df[clean_df["title"] == title].iterrows():
        wh = wh_lookup.get(row["relation_past"])
        text = row["context"]
        clean_relation = row["relation_past"]
        relation = row["relation"]
        obj = row["object"]
        clean_obj = row["object_noun"]
        clean_entity = row["clean_entities"]
        entity = row["entity"]

        try:
            q_text = build_text(text, relation, entity)
            assert(obj != clean_entity)
            q = build_question(wh, clean_relation, clean_obj, clean_entity)
        except AssertionError:
            print("Assertion error build text")
            q = "AssertionError"
        except AttributeError as e:
            print(f"Error:\n{e}\n{row['relation_future']}\n{index}")
            q = "Error"
        except ValueError as e:
            print(f"Error:\n{e}\n{row['relation']}\n{index}")
            q = "Error"

        if q not in questions and q not in  ["Error", "AssertionError"]:
            questions.append(q)
            answer_start_num, answer = get_relation_action(text, relation)
            qa = {"question": q,
                            "id": str(index),
                            "answers": [{"text": q_text, "answer_start": answer_start_num}],
                            "is_impossible": False
                            }
            qas.append(qa)
        else:
            #print(f"{q}")
            continue
        qa_dict = {"qas": qas, "context": text}
        paragraphs = [qa_dict]

        final_dict = {"title": title, "paragraphs": paragraphs}
        data.append(final_dict)


Error:
'end' is not in list
ended
830
Error:
'manage' is not in list
managed
1025
Error:
'comprise' is not in list
comprising
1151
Error:
'add' is not in list
will_add
1871
Error:
'end' is not in list
ends
2239
Error:
'end' is not in list
ends
2701
Error:
'fire' is not in list
fired
2734
Error:
'include' is not in list
includes
3845
Error:
'end' is not in list
ends
4058
Error:
'say' is not in list
said
5757
Error:
'end' is not in list
ends
6353
Error:

exclude
6434
Error:
'refuse' is not in list
refused
9289
Error:
'end' is not in list
ends
6491
Error:
'schedule' is not in list
is_scheduled
7494
Error:
'bag' is not in list
bagged
7611
Error:
'offer' is not in list
offers
19452
Error:
'resign' is not in list
resigned
8599
Error:
'end' is not in list
ends
10866
Error:
'say' is not in list
said
11874
Error:
'end' is not in list
ends
12599
Error:
'offer' is not in list
offered
13223
Error:
'consider' is not in list
would_consider
13848
Error:
'comment' is not in list
commented
13873
Error:

In [245]:
data[13]

{'title': 'TAIWAN SHIPBUILDER LOOKS FOR JAPANESE VENTURES',
 'paragraphs': [{'qas': [{'question': 'What did China Shipbuilding Corp plan?',
     'id': '65',
     'answers': [{'text': 'China Shipbuilding Corp plans to seek joint production agreements with Japan and further diversify into ship repairing to try to trim its debts, chairman Louis Lo said.',
       'answer_start': 7}],
     'is_impossible': False}],
   'context': 'Taiwan\'s state-owned China Shipbuilding Corp (CSBC) plans to seek joint production agreements with Japan and further diversify into ship repairing to try to trim its debts, chairman Louis Lo said. He told Reuters in an interview that CSBC\'s first joint production venture, to build two hulls for <Onomichi Dockyard Co Ltd>, was a success. Talks on similar projects have been held with other Japanese firms, including Mitsubishi Heavy Industries Co Ltd <MITH.T> and Ishikawajima-Harima Heavy Industries Co Ltd <JIMA.T>, he said. Lo said CSBC delivered the hulls of two 2

In [246]:
with open("../data/reuters/reuters_squad_form.json", "w") as f:
    d = json.dumps(final_dict)
    f.write(d)

In [247]:
qa_df = pd.DataFrame({"question": questions}, index=range(len(questions)))

In [248]:
qa_df.to_json("../data/reuters/questions.json")