# **Package installation and module imports**

In [0]:
pip install inflect     #module used to check if a word is singular or plural

In [0]:
pip install nltk        

In [0]:
import nltk
from nltk import word_tokenize, pos_tag, ne_chunk, RegexpParser, Tree
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import csv
import re
import numpy as np
from google.colab import files
# Download the NLTK data packages needed by the tokenizer, POS tagger and
# named-entity chunker imported above (presumably 'punkt' for word_tokenize,
# the perceptron tagger for pos_tag, and the chunker model plus word list for
# ne_chunk — confirm against the NLTK data index).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

# **Importing and preparing data**

In [0]:
# Upload the training set through the Colab file-upload helper.
# The next cells read the uploaded file back from disk by its file name.
uploaded = files.upload()

Saving training_set.csv to training_set.csv


In [0]:
# Load the uploaded training set; later cells use its 'sent_id' and 'sent' columns.
training_data = pd.read_csv("training_set.csv")

In [0]:
# Lower-case the sentence text in place. Any markup embedded in the text is
# lower-cased as well, which is what the tag patterns in later cells expect.
training_data["sent"] = training_data["sent"].str.lower()

In [0]:
# Upload the disambiguation answers through the Colab file-upload helper.
# Note: this rebinds `uploaded`, replacing the value from the training-set upload.
uploaded = files.upload()

Saving disambiguation_answers_file.csv to disambiguation_answers_file.csv


In [0]:
# Load the uploaded disambiguation answers; later cells use its 'sent_id'
# and ' resolution' columns.
disambiguation = pd.read_csv("disambiguation_answers_file.csv")

In [0]:
# Lower-case the resolution text in place.
# NOTE(review): the column name carries a leading space, presumably because the
# CSV header has a space after the comma — confirm against the file.
disambiguation[" resolution"] = disambiguation[" resolution"].str.lower()

In [0]:
# Stripping the sentences from the anaphor tags.
# One raw-string pattern covers the closing tag and the opening tag with or
# without an id attribute. The id may hold any characters (not only lower-case
# letters), so every tag an anaphor can be read from is also removed here.
REFERENTIAL_TAG = re.compile(r'</referential>|<referential(?:\s+id=[^>]*)?>')

# str() keeps non-string cells (e.g. missing values) from raising in re.sub.
sentences_list = [REFERENTIAL_TAG.sub('', str(statement))
                  for statement in training_data['sent']]

In [0]:
# Tag-free copy of the training set: each sentence id next to its stripped sentence.
stripped_columns = {
    'sent_id': training_data['sent_id'],
    'sent': sentences_list,
}
training_data_df = pd.DataFrame(stripped_columns, columns=['sent_id', 'sent'])

# **Functions**

In [0]:
# Defining a grammar & parser.
# A noun phrase is an optional determiner or possessive pronoun, any number of
# adjectives, and one or more nouns. Plural nouns (NNS) and possessive pronouns
# (PRP$) are included because the number-agreement and definiteness checks
# below can only ever match phrases that the grammar is able to produce.
# A raw string is used so that the escaped "$" reaches the chunk parser intact.
NP = r"NP: {<DT|PRP\$>?<JJ.*>*<NN.*>+}"
chunker = RegexpParser(NP)

# Function to identify the antecedents
def get_continuous_chunks(text, chunk_func=ne_chunk):
    """Return the distinct chunk phrases found in ``text``, in order of first appearance.

    The text is tokenized and POS-tagged, then passed to ``chunk_func``.
    Every run of adjacent chunk subtrees in the result is joined into a single
    space-separated phrase.

    Args:
        text: The sentence to analyse.
        chunk_func: Callable that takes the POS-tagged tokens and returns an
            iterable whose chunk elements are ``Tree`` instances (defaults to
            ``ne_chunk``).

    Returns:
        A list of phrase strings without duplicates.
    """
    chunked = chunk_func(pos_tag(word_tokenize(text)))
    continuous_chunk = []
    current_chunk = []

    def flush_current_chunk():
        # Emit the pending run of chunks (if any) and always start a fresh run,
        # even when the phrase was already recorded; otherwise a repeated
        # phrase would leak into the next one.
        if current_chunk:
            phrase = " ".join(current_chunk)
            if phrase not in continuous_chunk:
                continuous_chunk.append(phrase)
            current_chunk.clear()

    for subtree in chunked:
        if isinstance(subtree, Tree):
            current_chunk.append(" ".join(token for token, pos in subtree.leaves()))
        else:
            flush_current_chunk()

    # A chunk that ends the sentence has no following token to trigger a flush.
    flush_current_chunk()

    return continuous_chunk

In [0]:
# This function checks if the word is singular using 'inflect' module
def is_singular (word):
  """Return True when ``word`` is singular according to the 'inflect' module.

  ``singular_noun`` returns ``False`` for a word that is already singular and
  the singular form otherwise, so the check is an identity test against
  ``False``.

  Args:
    word: The word or phrase to test.

  Returns:
    True if inflect reports the word as singular, False otherwise.
  """
  # Build the inflect engine once and reuse it: this function is called for
  # every anaphor/candidate pair, and the engine does not need to be rebuilt.
  engine = getattr(is_singular, "_engine", None)
  if engine is None:
    import inflect  # imported here because the notebook has no top-level import for it
    engine = is_singular._engine = inflect.engine()
  return engine.singular_noun(word) is False

In [0]:
# This function returns one word from a list if it matches an anaphor in number,
# and returns 0 if there is no matching or 1 if there are multiple matchings
def num_agreement(anaphor, ant_list): #input: anaphor text + antecedents list
  """Pick the single antecedent that agrees with the anaphor in number.

  Args:
    anaphor: The anaphor text.
    ant_list: The candidate antecedents.

  Returns:
    The matching antecedent if exactly one candidate agrees in number,
    0 if none agrees, 1 if several agree.
  """
  candidates = list(ant_list)
  if not candidates:
    return 0
  # The anaphor's number does not change between candidates: compute it once.
  anaphor_is_singular = is_singular(anaphor)
  matches = [item for item in candidates if is_singular(item) == anaphor_is_singular]
  if len(matches) == 1:
    return matches[0]
  return 1 if matches else 0

In [0]:
# List of definite article, demonstrative, and possessive adjectives.
def_words = ['the', 'this','that','these','those', 'my','your','his','her','its','our','their']

# Returns the definite NP of a list if there is only one.
# Returns 0 if there is none, 1 if there is more than one.
def definite(np_list):
  """Pick the single definite noun phrase from ``np_list``.

  A noun phrase counts as definite when its first token is one of
  ``def_words``; the comparison ignores case.

  Args:
    np_list: The candidate noun phrases (strings).

  Returns:
    The definite noun phrase if there is exactly one, 0 if there is none,
    1 if there is more than one.
  """
  definite_nps = []
  for item in np_list:
    tokens = word_tokenize(item)
    # An empty phrase has no first token; skip it instead of raising IndexError.
    if tokens and tokens[0].lower() in def_words:
      definite_nps.append(item)
  if len(definite_nps) == 1:
    return definite_nps[0]
  return 1 if definite_nps else 0

In [0]:
#Returns the antecedent closest to the anaphor
def proximity(sentence, anaphor, np_list):
  """Return the noun phrase of ``np_list`` that lies closest to the anaphor.

  The anaphor is located as a whole word, so that e.g. "it" is not found
  inside "kitten"; a plain substring search is the fallback. Noun phrases that
  end before the anaphor are preferred, using their last occurrence before it.
  Only when no phrase precedes the anaphor is the nearest phrase that starts
  after it returned.

  Args:
    sentence: The sentence text without anaphor tags.
    anaphor: The anaphor text.
    np_list: The candidate noun phrases (strings).

  Returns:
    The nearest noun phrase, or 0 when there is no candidate, the anaphor
    cannot be located, or no candidate occurs outside the anaphor.
  """
  if not anaphor:
    return 0
  located = re.search(r'(?<!\w)' + re.escape(anaphor) + r'(?!\w)', sentence)
  if located is not None:
    anaphor_start, anaphor_end = located.span()
  else:
    anaphor_start = sentence.find(anaphor)
    if anaphor_start == -1:
      return 0
    anaphor_end = anaphor_start + len(anaphor)

  nearest_before, gap_before = 0, None
  nearest_after, gap_after = 0, None
  for item in np_list:
    if not item:
      continue
    # Last occurrence that lies entirely before the anaphor.
    start = sentence.rfind(item, 0, anaphor_start)
    if start != -1:
      gap = anaphor_start - (start + len(item))
      if gap_before is None or gap < gap_before:
        nearest_before, gap_before = item, gap
      continue
    # Otherwise the first occurrence that starts after the anaphor.
    start = sentence.find(item, anaphor_end)
    if start != -1:
      gap = start - anaphor_end
      if gap_after is None or gap < gap_after:
        nearest_after, gap_after = item, gap

  if gap_before is not None:
    return nearest_before
  return nearest_after

# **Identifying Antecedents**

In [0]:
# Noun phrases (candidate antecedents) of every tag-free sentence.
def _sentence_noun_phrases(sent):
    """Chunk one sentence with the NP grammar and return its phrases."""
    return get_continuous_chunks(sent, chunker.parse)

np_list = training_data_df['sent'].apply(_sentence_noun_phrases)

In [0]:
# Identifying the anaphors and their ids.
# The anaphor id is the sentence id if the anaphor's tag carries no id,
# or the sentence id followed by "-" and the tag's id otherwise.
# One pattern captures the id and the text of each tag together, so an
# anaphor is always paired with the id of its own tag and every anaphor gets
# exactly one id (the two lists stay aligned).
ANAPHOR_TAG = re.compile(r'<referential(?:\s+id="?([^">]*)"?)?\s*>(.*?)</referential>')

anaphors = []
sent_anaphor_ids = []

for sent_id, sent in zip(training_data['sent_id'], training_data['sent']):
    # Each match is a (tag id, anaphor text) pair; the id is '' when absent.
    tagged = ANAPHOR_TAG.findall(str(sent))
    anaphors.append([text for _, text in tagged])
    sent_anaphor_ids.append([str(sent_id) + "-" + tag_id if tag_id else sent_id
                             for tag_id, _ in tagged])

In [0]:
# Constructing lists with the anaphors and the output of each antecedent
# identification criteria for each anaphor.
anaphors_list = []
num_agree_list = []
def_agree_list = []
proximity_list = []

# anaphors, np_list and sentences_list are parallel (one entry per sentence).
for sentence_anaphors, noun_phrases, sentence in zip(anaphors, np_list, sentences_list):
  if not sentence_anaphors:
    continue
  # Definiteness depends only on the sentence's noun phrases: compute it once.
  sentence_definite = definite(noun_phrases)
  for anaphor in sentence_anaphors:
    anaphors_list.append(anaphor)
    num_agree_list.append(num_agreement(anaphor, noun_phrases))
    def_agree_list.append(sentence_definite)
    proximity_list.append(proximity(sentence, anaphor, noun_phrases))

# Flatten the per-sentence id lists into one id per anaphor.
anaphor_ids = [anaphor_id for sentence_ids in sent_anaphor_ids for anaphor_id in sentence_ids]

In [0]:
# List of whether each anaphor is ambiguous or unambiguous.
# Note that the disambiguation file (from the dataset) is used to construct this
# list rather than the detection file, as the detection file is listed per
# sentence, not per anaphor.
# The ids are collected into a set once instead of rebuilding a list for
# every anaphor.
ambiguous_ids = set(disambiguation['sent_id'])
decision = ["AMBIGUOUS" if anaphor_id in ambiguous_ids else "UNAMBIGUOUS"
            for anaphor_id in anaphor_ids]

In [0]:
# Detection data frame: one row per anaphor with its features and its label.
detect_columns = ['anaphor_id', 'anaphor', 'num_agreement', 'definiteness', 'proximity', 'decision']
detect_values = [anaphor_ids, anaphors_list, num_agree_list, def_agree_list, proximity_list, decision]
detect = pd.DataFrame(dict(zip(detect_columns, detect_values)), columns=detect_columns)

In [0]:
# Disambiguation dataframe: only the rows labelled AMBIGUOUS.
is_ambiguous = detect['decision'] == 'AMBIGUOUS'
disambig = detect[is_ambiguous]

# **Classification**
Not functional yet: the cells below are commented out and are not executed.

In [0]:
# First for detection
# X = detect[['num_agreement','definiteness','proximity']].to_numpy()

In [0]:
# y = np.asanyarray(decision)

In [0]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)