In [None]:
# Load the pre-extracted subject / verb / object data from a pickle file.

import pickle as p
# SECURITY: pickle.load can execute arbitrary code while deserializing, so only
# point this at a file from a trusted source.
# The payload is unpacked as a 4-tuple. Later cells treat all four objects as
# key -> count mappings; svo_list is indexed by keys whose elements [0], [1]
# and [2] are read as subject, verb and object.
with open('/Datasets/svo_en_large.dat', 'rb') as f:
  (Sub_dict, Verb_dict, Obj_dict, svo_list) = p.load(f)

In [None]:
# Build the (subject, verb) and (verb, object) pair tables, plus the token and
# type totals for svo, sv, vo, subject, verb and object.

# Type totals: number of distinct keys in each loaded table
svo_key = len(svo_list)
sub_key = len(Sub_dict)
obj_key = len(Obj_dict)
verb_key = len(Verb_dict)

# Token totals: sum of the counts stored in each loaded table
svo_total = sum(svo_list.values())
sub_total = sum(Sub_dict.values())
verb_total = sum(Verb_dict.values())
obj_total = sum(Obj_dict.values())

# Pair tables: every distinct svo key adds 1 to its (s, v) and (v, o) pair.
# NOTE(review): svo_total adds the stored svo counts, whereas the pair tables
# add 1 per distinct svo key - confirm the pair counts are not meant to be
# frequency-weighted as well.
sv_dict = {}
vo_dict = {}
for triple in svo_list:
  sv_pair = (triple[0], triple[1])
  vo_pair = (triple[1], triple[2])
  sv_dict[sv_pair] = sv_dict.get(sv_pair, 0) + 1
  vo_dict[vo_pair] = vo_dict.get(vo_pair, 0) + 1

# Type and token totals of the derived pair tables
sv_key = len(sv_dict)
vo_key = len(vo_dict)
sv_total = sum(sv_dict.values())
vo_total = sum(vo_dict.values())

In [None]:
import numpy as np

In [None]:
import nltk
from nltk.corpus import wordnet
# Fetch the WordNet corpus into the local nltk_data directory (the recorded
# output shows it being downloaded and unzipped on first run).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
# Corpus reader over the files under Datasets/BNC_baby/Texts/ whose ids match the pattern.
# NOTE(review): this root is relative to the working directory while the pickle above is
# loaded from the absolute /Datasets/... - confirm both resolve to the intended location.
# NOTE(review): the dot before "xml" in the pattern is unescaped and so matches any
# character; r'.*\.xml' may be what was meant - confirm.
bnc = nltk.corpus.reader.bnc.BNCCorpusReader(root='Datasets/BNC_baby/Texts/', fileids=r'.*.xml')

In [None]:
# Loading NLTK lemmatizer
# A single shared instance: it is used by the corpus-counting loop and passed
# explicitly into every scoring function below.
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [None]:
# Lower-cased C5 tags that route a token into the noun counter.
noun_tag = (
    'nn0', 'nn1', 'nn2', 'np0',
    'pni', 'pnp', 'pnq', 'pnx',
)
# Lower-cased C5 tags that route a token into the verb counter.
verb_tag = (
    'vbb', 'vbd', 'vbg', 'vbi', 'vbn', 'vbz',
    'vdb', 'vdd', 'vdi', 'vdg', 'vdn', 'vdz',
    'vhb', 'vhd', 'vhg', 'vhi', 'vhn', 'vhz',
    'vm0',
    'vvb', 'vvd', 'vvg', 'vvi', 'vvn', 'vvz',
)

# Despite the *_set names these are dicts mapping lemma -> occurrence count.
noun_set = dict()
verb_set = dict()
# Occurrence count of every lemma regardless of tag.
count_obj = dict()

In [None]:
# Count lemma frequencies over the BNC: one counter for noun-tagged tokens,
# one for verb-tagged tokens and one for every token.
# NOTE: the counters live in an earlier cell, so re-running only this cell
# adds to the existing counts instead of starting from zero.
for sent in bnc.tagged_sents(c5=True):
  for wt in sent:
    try:
      word = lemmatizer.lemmatize(wt[0].lower())
      tag = wt[1].lower()
    except (AttributeError, TypeError, IndexError):
      # Token without a usable word or tag: skip it. Anything else (for
      # example a missing WordNet corpus, which raises LookupError) is
      # allowed to surface; a bare except here would silently skip every
      # token and leave all three counters empty.
      continue
    if tag in noun_tag:
      noun_set[word] = noun_set.get(word, 0) + 1
    if tag in verb_tag:
      verb_set[word] = verb_set.get(word, 0) + 1
    count_obj[word] = count_obj.get(word, 0) + 1

In [None]:
# The 5000 most frequent verb lemmas, most frequent first; equal counts are
# ordered by lemma in descending order.
verb_list = [
    lemma
    for lemma, _ in sorted(
        verb_set.items(), key=lambda item: (item[1], item[0]), reverse=True
    )[:5000]
]

In [None]:
# The 2000 most frequent noun lemmas, most frequent first; equal counts are
# ordered by lemma in descending order.
noun_list = [
    lemma
    for lemma, _ in sorted(
        noun_set.items(), key=lambda item: (item[1], item[0]), reverse=True
    )[:2000]
]

In [None]:
# Weights of the three WordNet overlap scores; packed into parameters['R']
# in the order (SYN, HYP, HYPO).
SYN_PARAM = 1.0 / 20
HYP_PARAM = 1.0 / 40
HYPO_PARAM = 1.0 / 40

# Weights of the six component scores; packed into parameters['F'] in the
# order (SVO, SVtO, SV, SVt, VO, VtO). The "t" variants weight the scores
# computed with the alternative verb passed to get_final_score.
SVO_PARAM = 0.6
SVtO_PARAM = 0.7
SV_PARAM = 0.4
VO_PARAM = 0.4
SVt_PARAM = 0.5
VtO_PARAM = 0.5

# Additive smoothing constant used by every get_prob_* function.
SMOOTHING = 2

In [None]:
# Tunable parameters, bundled so they can travel inside score_obj:
#   'R' - weights of the WordNet overlap scores
#   'F' - weights of the six component scores combined in get_final_score
#   'S' - additive smoothing constant
parameters = {
    'R': np.array([SYN_PARAM, HYP_PARAM, HYPO_PARAM]),
    'F': np.array([SVO_PARAM, SVtO_PARAM, SV_PARAM, SVt_PARAM, VO_PARAM, VtO_PARAM]),
    'S': SMOOTHING,
}

In [None]:
# Creating a scoring object for easy access
# Everything the scoring functions need is bundled in one dict so it can be
# passed around as a single argument and pickled at the end of the notebook.

score_obj = {
    # Count tables keyed by tuple (svo, sv, vo) or by single item (sub, verb, obj)
    'svo':svo_list,
    'sub':Sub_dict,
    'verb':Verb_dict,
    'obj':Obj_dict,
    'sv':sv_dict,
    'vo':vo_dict,
    # Token totals: sum of the counts in the matching table
    'token_sv':sv_total,
    'token_vo':vo_total,
    'token_sub':sub_total,
    'token_verb':verb_total,
    'token_obj':obj_total,
    'token_svo':svo_total,
    # Type totals: number of distinct keys in the matching table
    'type_sub':sub_key,
    'type_verb':verb_key,
    'type_obj':obj_key,
    'type_svo':svo_key,
    'type_sv':sv_key,
    'type_vo':vo_key,
    # Placeholder values for the phi terms in the get_prob_* denominators;
    # set_phi_values / set_phi_values_approx overwrite them
    'phi_svo':10,
    'phi_sv':10,
    'phi_vo':10,
    'phi_sub':10,
    'phi_verb':10,
    'phi_obj':10,
    # Most frequent BNC noun / verb lemmas, used as candidate words
    'noun_list': noun_list,
    'verb_list': verb_list,
    # Lemma -> BNC occurrence count for every token
    'count': count_obj,
    'parameters': parameters
}

In [None]:
# Re-bind the parameter bundle: run this after rebuilding `parameters` so that
# score_obj refers to the new object without re-creating the whole dict.
score_obj['parameters']=parameters

In [None]:
# Jaccard similarity of two sets
def jaccard(s1, s2):
  """Return len(s1 & s2) / len(s1 | s2) as a float, or 0.0 when the union is empty."""
  union_size = len(s1.union(s2))
  if union_size == 0:
    # Both sets are empty: define the similarity as 0 instead of dividing by zero.
    return 0.0
  shared_size = len(s1.intersection(s2))
  return float(shared_size) / float(union_size)

# Calculates the R value between two words
def get_R_word2word(w1, w2, parameters):
  """Return the weighted WordNet overlap score between two words.

  Three overlaps are computed. Each one is the largest Jaccard similarity
  found over all pairs of synsets (one synset of w1, one of w2), comparing
  respectively their lemma names, their hypernym sets and their hyponym sets.
  An overlap is 0 when either word has no synsets.

  Args:
    w1, w2: the words to look up in WordNet.
    parameters: three weights ordered (synonym, hypernym, hyponym), i.e. the
      order in which parameters['R'] is built from SYN_PARAM, HYP_PARAM and
      HYPO_PARAM.

  Returns:
    The dot product of the three overlaps with `parameters`.
  """
  # Look the synsets up once per word instead of once per relation.
  synsets_1 = wordnet.synsets(w1)
  synsets_2 = wordnet.synsets(w2)

  def best_overlap(related):
    # Largest Jaccard similarity between any synset of w1 and any of w2,
    # where `related` maps a synset to the collection being compared.
    groups_1 = [set(related(ss)) for ss in synsets_1]
    groups_2 = [set(related(ss)) for ss in synsets_2]
    best = 0
    for group_1 in groups_1:
      for group_2 in groups_2:
        best = max(best, jaccard(group_1, group_2))
    return best

  synonym_r = best_overlap(lambda ss: ss.lemma_names())
  hypernym_r = best_overlap(lambda ss: ss.hypernyms())
  hyponym_r = best_overlap(lambda ss: ss.hyponyms())

  # TODO: to be completed using senses
  # The vector order must match the weight order (synonym, hypernym, hyponym).
  # It was previously (synonym, hyponym, hypernym), which paired each of the
  # last two overlaps with the other one's weight.
  value = np.dot(np.array([synonym_r, hypernym_r, hyponym_r]), parameters)
  return value

# Calculates the R value between a phrase and a word
# Uses get_R_word2word as the base score for each token of the phrase
def get_R_p2w(p, w, parameters, tag, lemmatizer, count_obj):
  """Return the inverse-frequency weighted mean R value between phrase p and word w.

  The phrase is split on whitespace; each token is lower-cased and
  lemmatized. Tokens missing from count_obj are ignored. Every remaining
  token contributes get_R_word2word(token, w, parameters) with weight
  1 / count_obj[token], so rarer tokens weigh more.

  Args:
    p: phrase as a whitespace-separated string.
    w: the word the phrase is compared with.
    parameters: weights forwarded to get_R_word2word.
    tag: currently unused; kept so existing callers keep working.
    lemmatizer: object with a lemmatize(str) method.
    count_obj: mapping lemma -> occurrence count.

  Returns:
    The weighted average, or 0 when no token of the phrase is in count_obj.
  """
  # Lemmatize whole tokens. Indexing each token with [0] would reduce every
  # word to its first character.
  tokens = [lemmatizer.lemmatize(token.lower()) for token in p.split()]
  R_l = []
  weights = []
  for word in tokens:
    if word in count_obj:
      R_l.append(get_R_word2word(word, w, parameters))
      # Weight by this token's own count (not by the count of the literal
      # string 'word').
      weights.append(1.0/count_obj[word])
  if not R_l:
    # Nothing to average; np.average would raise on empty input.
    return 0
  return np.average(R_l, weights=weights)


# w is an svo, sv or vo tuple
# target is the index of the position in w that is being varied
# word_list holds the candidate words compared against the item at w[target]
# w_dict is the count table of the matching tuple type
def get_R_values_w(word_list, w_dict, w, target, tag, parameters, lemmatizer, count_obj):
  """Sum get_R_p2w(w[target], word, ...) over every word in word_list.

  Args:
    word_list: iterable of candidate words.
    w_dict: currently unused; kept so existing callers keep working.
      NOTE(review): the previous body built the tuple with w[target]
      replaced by each candidate but never looked it up in w_dict - confirm
      whether a count-weighted sum was intended.
    w: tuple (svo, sv or vo) holding the phrase at position `target`.
    target: index into w.
    tag, parameters, lemmatizer, count_obj: forwarded to get_R_p2w.

  Returns:
    The accumulated score; 0 when word_list is empty.
  """
  R_w = 0
  for word in word_list:
    R_w += get_R_p2w(w[target], word, parameters, tag, lemmatizer, count_obj)
  return R_w

In [None]:
# Probability functions for svo, sv and vo tuples
def get_prob_svo(svo, score_obj, lemmatizer):
  """Return the smoothed probability of an (s, v, o) tuple.

  numerator   = count(svo) + S + R, where R adds get_R_values_w for the
                subject (nouns), verb (verbs) and object (nouns) positions
  denominator = S * type_svo + token_svo + phi_svo
  """
  params = score_obj['parameters']
  table = score_obj['svo']
  slots = (
      (score_obj['noun_list'], 0, 'N'),
      (score_obj['verb_list'], 1, 'V'),
      (score_obj['noun_list'], 2, 'N'),
  )

  R_svo = 0
  for candidates, position, tag in slots:
    R_svo += get_R_values_w(candidates, table, svo, position, tag, params['R'], lemmatizer, score_obj['count'])

  # Unseen tuples count as zero occurrences.
  seen = table[svo] if svo in table else 0

  numerator = float(seen + params['S'] + R_svo)
  denominator = params['S']*score_obj['type_svo'] + score_obj['token_svo'] + score_obj['phi_svo']
  return numerator/denominator

def get_prob_sv(sv, score_obj, lemmatizer):
  """Return the smoothed probability of an (s, v) pair.

  numerator   = count(sv) + S + R, where R adds get_R_values_w for the
                subject (nouns) and verb (verbs) positions
  denominator = S * type_sv + token_sv + phi_sv
  """
  r_weights = score_obj['parameters']['R']
  R_sv = get_R_values_w(score_obj['noun_list'], score_obj['sv'], sv, 0, 'N', r_weights, lemmatizer, score_obj['count'])
  # Accumulate with "+=" as get_prob_svo and get_prob_vo do. A plain "=" here
  # discarded the subject term computed on the line above.
  R_sv += get_R_values_w(score_obj['verb_list'], score_obj['sv'], sv, 1, 'V', r_weights, lemmatizer, score_obj['count'])

  # Unseen pairs count as zero occurrences.
  count = 0

  if sv in score_obj['sv']:
    count = score_obj['sv'][sv]

  prob = float(count + score_obj['parameters']['S'] + R_sv)/(
      score_obj['parameters']['S']*score_obj['type_sv'] + score_obj['token_sv'] + score_obj['phi_sv']
  )

  return prob

def get_prob_vo(vo, score_obj, lemmatizer):
  """Return the smoothed probability of a (v, o) pair.

  numerator   = count(vo) + S + R, where R adds get_R_values_w for the
                verb (verbs) and object (nouns) positions
  denominator = S * type_vo + token_vo + phi_vo
  """
  params = score_obj['parameters']
  table = score_obj['vo']
  slots = (
      (score_obj['verb_list'], 0, 'V'),
      (score_obj['noun_list'], 1, 'N'),
  )

  R_vo = 0
  for candidates, position, tag in slots:
    R_vo += get_R_values_w(candidates, table, vo, position, tag, params['R'], lemmatizer, score_obj['count'])

  # Unseen pairs count as zero occurrences.
  seen = table[vo] if vo in table else 0

  numerator = float(seen + params['S'] + R_vo)
  denominator = params['S']*score_obj['type_vo'] + score_obj['token_vo'] + score_obj['phi_vo']
  return numerator/denominator

def get_prob_sub(sub, score_obj, lemmatizer):
  """Return the smoothed probability of a subject phrase.

  numerator   = count(sub) + S + sum of get_R_p2w(sub, noun) over noun_list
  denominator = S * type_sub + token_sub + phi_sub
  """
  params = score_obj['parameters']
  R_s = sum(
      get_R_p2w(sub, noun, params['R'], 'N', lemmatizer, score_obj['count'])
      for noun in score_obj['noun_list']
  )

  # Unseen subjects count as zero occurrences.
  seen = score_obj['sub'][sub] if sub in score_obj['sub'] else 0

  numerator = float(seen + params['S'] + R_s)
  denominator = params['S']*score_obj['type_sub'] + score_obj['token_sub'] + score_obj['phi_sub']
  return numerator/denominator

def get_prob_verb(verb, score_obj, lemmatizer):
  """Return the smoothed probability of a verb phrase.

  numerator   = count(verb) + S + sum of get_R_p2w(verb, v) over verb_list
  denominator = S * type_verb + token_verb + phi_verb
  """
  params = score_obj['parameters']
  R_s = sum(
      get_R_p2w(verb, candidate, params['R'], 'V', lemmatizer, score_obj['count'])
      for candidate in score_obj['verb_list']
  )

  # Unseen verbs count as zero occurrences.
  seen = score_obj['verb'][verb] if verb in score_obj['verb'] else 0

  numerator = float(seen + params['S'] + R_s)
  denominator = params['S']*score_obj['type_verb'] + score_obj['token_verb'] + score_obj['phi_verb']
  return numerator/denominator

def get_prob_obj(obj, score_obj, lemmatizer):
  """Return the smoothed probability of an object phrase.

  numerator   = count(obj) + S + sum of get_R_p2w(obj, noun) over noun_list
  denominator = S * type_obj + token_obj + phi_obj
  """
  params = score_obj['parameters']
  R_s = sum(
      get_R_p2w(obj, noun, params['R'], 'N', lemmatizer, score_obj['count'])
      for noun in score_obj['noun_list']
  )

  # Unseen objects count as zero occurrences.
  seen = score_obj['obj'][obj] if obj in score_obj['obj'] else 0

  numerator = float(seen + params['S'] + R_s)
  denominator = params['S']*score_obj['type_obj'] + score_obj['token_obj'] + score_obj['phi_obj']
  return numerator/denominator


In [None]:
# Computes the phi values for svo, sv, vo, sub, verb and obj.
# One-time calculation; updates for on-the-fly learning are to be done separately.
# Must be re-run whenever the parameters used for the R calculation change.
def set_phi_values(score_obj, lemmatizer):
  """Compute the exact phi_* terms and store them in score_obj (in place).

  For every key of a count table, the same R term that the matching
  get_prob_* function adds to its numerator is computed; phi is the sum of
  those terms over the whole table.

  Verb candidates come from score_obj['verb_list'], as in the get_prob_*
  numerators and in the phi_verb loop. The svo / sv / vo loops previously
  iterated over the keys of score_obj['verb'], a different candidate set.

  Args:
    score_obj: scoring bundle; its 'phi_svo', 'phi_sv', 'phi_vo', 'phi_sub',
      'phi_verb' and 'phi_obj' entries are overwritten.
    lemmatizer: forwarded to the R computations.

  Returns:
    None.
  """
  param = score_obj['parameters']['R']
  counts = score_obj['count']
  nouns = score_obj['noun_list']
  verbs = score_obj['verb_list']

  def tuple_phi(table_name, slots):
    # Sum, over every tuple of the table, the R terms of the given
    # (target index, candidate words, tag) slots.
    table = score_obj[table_name]
    total = 0
    for item in table:
      for target, words, tag in slots:
        total += get_R_values_w(words, table, item, target, tag, param, lemmatizer, counts)
    return total

  def phrase_phi(table_name, words, tag):
    # Sum get_R_p2w over every (table key, candidate word) pair.
    total = 0
    for phrase in score_obj[table_name]:
      for word in words:
        total += get_R_p2w(phrase, word, param, tag, lemmatizer, counts)
    return total

  # Calculating for svo: subject and object against nouns, verb against verbs
  phi_svo = tuple_phi('svo', ((0, nouns, 'N'), (2, nouns, 'N'), (1, verbs, 'V')))
  print('svo')  # progress marker: the svo pass is the long one
  # Calculating for sv
  phi_sv = tuple_phi('sv', ((0, nouns, 'N'), (1, verbs, 'V')))
  # Calculating for vo: index 1 is the object (nouns), index 0 the verb
  phi_vo = tuple_phi('vo', ((1, nouns, 'N'), (0, verbs, 'V')))

  # Calculating for sub
  phi_sub = phrase_phi('sub', nouns, 'N')
  # Calculating for verb
  phi_verb = phrase_phi('verb', verbs, 'V')
  # Calculating for obj
  phi_obj = phrase_phi('obj', nouns, 'N')

  # Updating the parameters
  score_obj['phi_svo'] = phi_svo
  score_obj['phi_sv'] = phi_sv
  score_obj['phi_vo'] = phi_vo
  score_obj['phi_sub'] = phi_sub
  score_obj['phi_verb'] = phi_verb
  score_obj['phi_obj'] = phi_obj

In [None]:
# Estimates the phi values for svo, sv, vo, sub, verb and obj by sampling.
# One-time calculation; updates for on-the-fly learning are to be done separately.
# Must be re-run whenever the parameters used for the R calculation change.
def set_phi_values_approx(score_obj, lemmatizer, n_samples=30):
  """Estimate the phi_* terms from random samples and store them in score_obj.

  For each count table, n_samples keys are drawn with replacement using
  np.random.choice (NumPy's global random state, so seed it beforehand for
  reproducible values). The R terms of the sampled keys are summed and
  scaled by type_count / n_samples.

  Verb candidates come from score_obj['verb_list'], as in the get_prob_*
  numerators and in the phi_verb loop. The svo / sv / vo loops previously
  iterated over the keys of score_obj['verb'], a different candidate set.

  Args:
    score_obj: scoring bundle; its 'phi_svo', 'phi_sv', 'phi_vo', 'phi_sub',
      'phi_verb' and 'phi_obj' entries are overwritten.
    lemmatizer: forwarded to the R computations.
    n_samples: number of keys drawn per table (default 30).

  Returns:
    None.

  Raises:
    ValueError: if n_samples is not positive.
  """
  if n_samples <= 0:
    raise ValueError('n_samples must be positive')

  param = score_obj['parameters']['R']
  counts = score_obj['count']
  nouns = score_obj['noun_list']
  verbs = score_obj['verb_list']

  def sample_keys(table_name, type_name):
    # Draw n_samples keys (with replacement) from the table. An empty table
    # yields no samples instead of making np.random.choice raise.
    n_types = score_obj[type_name]
    if n_types == 0:
      return []
    keys = list(score_obj[table_name].keys())  # built once, not once per sample
    ind = np.random.choice(np.arange(n_types), n_samples)
    return [keys[j] for j in ind]

  def tuple_phi(table_name, type_name, slots):
    # Scaled sum of the R terms of the sampled tuples for the given
    # (target index, candidate words, tag) slots.
    table = score_obj[table_name]
    total = 0
    for item in sample_keys(table_name, type_name):
      for target, words, tag in slots:
        total += get_R_values_w(words, table, item, target, tag, param, lemmatizer, counts)
    return total*score_obj[type_name]/n_samples

  def phrase_phi(table_name, type_name, words, tag):
    # Scaled sum of get_R_p2w over (sampled key, candidate word) pairs.
    total = 0
    for phrase in sample_keys(table_name, type_name):
      for word in words:
        total += get_R_p2w(phrase, word, param, tag, lemmatizer, counts)
    return total*score_obj[type_name]/n_samples

  # The tables are sampled in the order svo, sv, vo, sub, verb, obj.
  # Calculating for svo: subject and object against nouns, verb against verbs
  phi_svo = tuple_phi('svo', 'type_svo', ((0, nouns, 'N'), (2, nouns, 'N'), (1, verbs, 'V')))
  print('svo')  # progress marker
  # Calculating for sv
  phi_sv = tuple_phi('sv', 'type_sv', ((0, nouns, 'N'), (1, verbs, 'V')))
  # Calculating for vo: index 1 is the object (nouns), index 0 the verb
  phi_vo = tuple_phi('vo', 'type_vo', ((1, nouns, 'N'), (0, verbs, 'V')))

  # Calculating for sub
  phi_sub = phrase_phi('sub', 'type_sub', nouns, 'N')
  # Calculating for verb
  phi_verb = phrase_phi('verb', 'type_verb', verbs, 'V')
  # Calculating for obj
  phi_obj = phrase_phi('obj', 'type_obj', nouns, 'N')

  # Updating the parameters
  score_obj['phi_svo'] = phi_svo
  score_obj['phi_sv'] = phi_sv
  score_obj['phi_vo'] = phi_vo
  score_obj['phi_sub'] = phi_sub
  score_obj['phi_verb'] = phi_verb
  score_obj['phi_obj'] = phi_obj

In [None]:
# Squashed mutual-information style scores for svo, sv and vo
# Open questions kept from the original notes:
#   - what about P(Sub), P(Verb), P(Obj)?
#   - calculate PMI, or just use the probability directly as the score?
def cal_mi(sub,verb,obj, score_obj, lemmatizer):
  """Return three scores in the form ratio / (1 + ratio) for svo, sv and vo.

  Each ratio is the joint probability divided by the product of the
  probabilities of its parts:
    svo: P(s, v, o) / (P(s) * P(v) * P(o))
    sv:  P(s, v) / (P(s) * P(v))
    vo:  P(v, o) / (P(v) * P(o))
  """
  p_svo = get_prob_svo((sub,verb,obj), score_obj, lemmatizer)
  p_sub = get_prob_sub(sub, score_obj, lemmatizer)
  p_obj = get_prob_obj(obj, score_obj, lemmatizer)
  p_verb = get_prob_verb(verb, score_obj, lemmatizer)
  p_sv = get_prob_sv((sub,verb), score_obj, lemmatizer)
  p_vo = get_prob_vo((verb,obj), score_obj, lemmatizer)

  ratios = (
      p_svo/(p_sub*p_verb*p_obj),
      p_sv/(p_sub*p_verb),
      p_vo/(p_verb*p_obj),
  )

  # Map each non-negative ratio r to r / (1 + r).
  return tuple(ratio/(1+ratio) for ratio in ratios)

In [None]:
def xsigmoid(x):
  """Return x / (1 + exp(-x)), i.e. x multiplied by its logistic sigmoid."""
  denominator = 1 + np.exp(-x)
  return x / denominator

def get_final_score(sub, verb, obj, verb_t, score_obj, lemmatizer):
  """Combine the cal_mi scores of `verb` and of the alternative verb `verb_t`.

  The six component scores are ordered (svo, sv_t_o, sv, sv_t, vo, v_t_o),
  matching the order of the weights in score_obj['parameters']['F'].

  Returns:
    (xsigmoid_score, linear_score, fxsigmoid_score) where
      xsigmoid_score  = xsigmoid(scores) . scores
      linear_score    = F . scores
      fxsigmoid_score = xsigmoid(scores) . (scores * F)
  """
  s_svo, s_sv, s_vo = cal_mi(sub,verb,obj, score_obj, lemmatizer)
  s_svto, s_svt, s_vto = cal_mi(sub,verb_t,obj, score_obj, lemmatizer)

  score_vec = np.array([s_svo, s_svto, s_sv, s_svt, s_vo, s_vto])
  f_weights = score_obj['parameters']['F']
  # Element-wise xsigmoid of the scores, shared by both gated sums.
  gated = np.vectorize(xsigmoid)(score_vec)

  xsigmoid_score = np.dot(gated, score_vec)
  fxsigmoid_score = np.dot(gated, np.multiply(score_vec , f_weights))
  linear_score = np.dot(f_weights, score_vec)

  return xsigmoid_score, linear_score, fxsigmoid_score

In [None]:
# Downloads the averaged perceptron tagger data into nltk_data.
# NOTE(review): no cell visible here calls a tagger, so this download may be
# unnecessary - confirm before removing.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
#score_obj['phi_svo'] = score_obj['phi_svo']*0.001
#score_obj['phi_sv'] = score_obj['phi_sv']*0.001
#score_obj['phi_vo'] = score_obj['phi_vo']*0.001
#score_obj['phi_sub'] = score_obj['phi_sub']*0.001
#score_obj['phi_verb'] = score_obj['phi_verb']*0.001
#score_obj['phi_obj'] = score_obj['phi_obj']*0.001

In [None]:
# Replace the placeholder phi_* entries of score_obj with sampled estimates.
# NOTE: the sampling uses NumPy's global random state and no seed is set in the
# cells visible here, so the phi values (and every score below) vary between runs.
set_phi_values_approx(score_obj, lemmatizer)

svo


In [None]:
# Subject 'you', verb 'mix', object 'it', alternative verb 'combine'.
# The result tuple is (xsigmoid_score, linear_score, fxsigmoid_score).
get_final_score('you','mix','it','combine', score_obj, lemmatizer)

(2.799633407929544, 2.5106065949875678, 1.501712713342899)

In [None]:
# Same subject / verb / object with the alternative verb 'die'.
# NOTE(review): in the recorded outputs all three scores are higher here than
# for 'combine' in the previous cell - worth checking against expectations.
get_final_score('you','mix','it','die', score_obj, lemmatizer)

(3.3991884090499136, 2.795268817617087, 1.811958060882159)

In [None]:
# Subject 'government', verb 'provide', object 'cash', alternative verb 'supply'.
get_final_score('government','provide','cash','supply', score_obj, lemmatizer)

(4.187166218432598, 3.048077326793292, 2.183955945715394)

In [None]:
# Same subject / verb / object with the alternative verb 'leave'.
get_final_score('government','provide','cash','leave', score_obj, lemmatizer)

(3.9449043224495606, 2.9680466002144255, 2.062812300383214)

In [None]:
# Subject 'priest', verb 'say', object 'prayer', alternative verb 'state'.
get_final_score('priest','say','prayer','state', score_obj, lemmatizer)

(4.302222563427463, 3.0791867809467925, 2.2322911136904753)

In [None]:
# Same subject / verb / object with the alternative verb 'allege'.
get_final_score('priest','say','prayer','allege', score_obj, lemmatizer)

(4.305112085002827, 3.0800584763128827, 2.233735913495684)

In [None]:
# Read the GS2011 ratings and group them by (subject, object).
# Whitespace-separated columns used: 1 = verb, 2 = subject, 3 = object,
# 4 = alternative verb, 5 = integer rating (column 0 is not used).
# groups[(s, o)][(s, v, o, vt)] holds the sum of the ratings of every row that
# has that subject, verb, object and alternative verb.
groups = {}

# The with-block closes the file even if a malformed row raises.
with open('/Datasets/TestingDatasets/GS2011data.txt', 'r') as fp:
  fp.readline()  # the first line is skipped (presumably a header row)
  for line in fp:
    a = line.split()
    if not a:
      # Blank line: nothing to parse.
      continue
    v = a[1]
    s = a[2]
    o = a[3]
    vt = a[4]
    ratings = groups.setdefault((s,o), {})
    ratings[(s,v,o, vt)] = ratings.get((s,v,o, vt), 0) + int(a[5])

In [None]:
from scipy.stats import spearmanr

In [None]:
# Mean per-group Spearman correlation between each of the three model scores
# and the summed human ratings, one group per (subject, object) pair.
rho_sums = [0.0, 0.0, 0.0]
rho_counts = [0, 0, 0]  # one counter per score: each mean uses its own valid groups
for so in groups:
  # One list per score returned by get_final_score:
  # (xsigmoid_score, linear_score, fxsigmoid_score)
  model_scores = ([], [], [])
  data_score = []
  for svovt in groups[so]:
    sub = lemmatizer.lemmatize(svovt[0])
    vv = lemmatizer.lemmatize(svovt[1])
    obj = lemmatizer.lemmatize(svovt[2])
    vt = lemmatizer.lemmatize(svovt[3])
    s = get_final_score(sub,vv,obj,vt, score_obj,lemmatizer)
    for scores, value in zip(model_scores, s):
      scores.append(value)
    data_score.append(groups[so][svovt])
  if len(data_score) < 2:
    # A rank correlation needs at least two items.
    continue
  for k, scores in enumerate(model_scores):
    # spearmanr ranks its inputs itself and gives tied values their average
    # rank; pre-ranking with argsort(argsort(...)) would give tied scores
    # arbitrary distinct ranks.
    rho = spearmanr(scores, data_score)[0]
    # The correlation is NaN when either input is constant; skip those groups.
    if not np.isnan(rho):
      rho_sums[k] += rho
      rho_counts[k] += 1

# NaN (instead of a ZeroDivisionError) when no group produced a valid value.
spearman1, spearman2, spearman3 = [
    total/n if n else float('nan') for total, n in zip(rho_sums, rho_counts)
]
print(spearman1, spearman2, spearman3)

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0.07216494845360823 0.07216494845360823 0.07216494845360823


In [None]:
import pickle as p
# Persist the whole scoring bundle (count tables, totals, phi values, word
# lists and parameters) so it can be reloaded without recomputing.
# The path is relative to the working directory.
# NOTE(review): presumably the Good_english/ directory must already exist - confirm.
with open('Good_english/score_obj_extractor','wb') as f:
  p.dump(score_obj, f)