In [1]:
from stanfordcorenlp import StanfordCoreNLP
import re
from bs4 import BeautifulSoup
import pandas as pd
delimiter = '\t'
E1 = 'e1'
E2 = 'e2'

In [9]:
text = [
'7	"The current view is that the chronic  <e1>inflammation</e1> in the distal part of the stomach caused by Helicobacter pylori  <e2>infection</e2> results in an increased acid production from the non-infected upper corpus region of the stomach."',
'8	" <e1>People</e1> have been moving back into  <e2>downtown</e2>."',
'9	"The  <e1>lawsonite</e1> was contained in a  <e2>platinum crucible</e2> and the counter-weight was a plastic crucible with metal pieces."',
'10	"The solute was placed inside a beaker and 5 mL of the  <e1>solvent</e1> was pipetted into a 25 mL glass  <e2>flask</e2> for each trial."'
]

In [25]:

def extract_features(train_sents):
	'''
	:param train_sents:list
	:return:features:list
	'''
	features = []

	nlp = StanfordCoreNLP(r'C:\stanford-corenlp-full-2018-02-27')

	for line in train_sents:
		feature = {}
		soup = BeautifulSoup(line, 'html.parser')
		e1 = soup.find(E1).text
		e2 = soup.find(E2).text
		# print(e1, e2)

		sent = re.split(r'[."\s]+', line)

		for w in sent:
			if w.startswith("<" + E1 + ">"):
				feature['index1'] = sent.index(w)
				break
		for w in sent:
			if w.startswith("<" + E2 + ">"):
				feature['index2'] = sent.index(w)
				break

		sent = re.split(r'[.\s]+', soup.text)
		# print(sent)

		feature['preffix1'], feature['preffix2'] = e1[:2], e2[:2]
		feature['suffix1'], feature['suffix2'] = e1[-2:], e2[-2:]
		feature['istitle1'], feature['istitle2'] = str(e1.istitle()), str(e2.istitle())
		feature['pos_tag1'], feature['pos_tag2'] = nlp.pos_tag(e1)[0][1], nlp.pos_tag(e2)[0][1]

		if feature['index1'] < 2:
			feature['pre_pos_tag1'] = 'None'
			feature['pre_istitle1'] = 'None'
		else:
			word1 = sent[feature['index1'] - 1]
			feature['pre_istitle1'] = str(word1.istitle())
			feature['pre_pos_tag1'] = nlp.pos_tag(word1)[0][1]

		if feature['index2'] > len(sent) - 2:
			feature['post_pos_tag2'] = 'None'
			feature['post_istitle2'] = 'None'
		else:
			word2 = sent[feature['index2'] + 1]
			feature['post_istitle2'] = str(word2.istitle())
			feature['post_pos_tag2'] = nlp.pos_tag(word2)[0][1]

		# print(feature)
		features.append(feature)
	nlp.close()
	return features

In [26]:
features = extract_features(text)

In [27]:
features

[{'index1': 8,
  'index2': 20,
  'istitle1': 'False',
  'istitle2': 'False',
  'pos_tag1': 'NN',
  'pos_tag2': 'NN',
  'post_istitle2': 'False',
  'post_pos_tag2': 'NNS',
  'pre_istitle1': 'False',
  'pre_pos_tag1': 'JJ',
  'preffix1': 'in',
  'preffix2': 'in',
  'suffix1': 'on',
  'suffix2': 'on'},
 {'index1': 1,
  'index2': 7,
  'istitle1': 'True',
  'istitle2': 'False',
  'pos_tag1': 'NNS',
  'pos_tag2': 'NN',
  'post_istitle2': 'False',
  'post_pos_tag2': 'NN',
  'pre_istitle1': 'None',
  'pre_pos_tag1': 'None',
  'preffix1': 'Pe',
  'preffix2': 'do',
  'suffix1': 'le',
  'suffix2': 'wn'},
 {'index1': 2,
  'index2': 7,
  'istitle1': 'False',
  'istitle2': 'False',
  'pos_tag1': 'NN',
  'pos_tag2': 'NN',
  'post_istitle2': 'False',
  'post_pos_tag2': 'NN',
  'pre_istitle1': 'True',
  'pre_pos_tag1': '``',
  'preffix1': 'la',
  'preffix2': 'pl',
  'suffix1': 'te',
  'suffix2': 'le'},
 {'index1': 13,
  'index2': 21,
  'istitle1': 'False',
  'istitle2': 'False',
  'pos_tag1': 'JJ',
  '

In [28]:
def get_features_dataframe(features):
	'''
	:param features:list
	:return:features_dataframe:DataFrame
	'''
	features_data = []  # data of dataframe
	features_keys = list(features[0].keys())  # key of features

	for item in features:
		feature = []
		for key in features_keys:
			feature.append(item[key])
		features_data.append(feature)
	# print(features_data)

	features_dataframe = pd.DataFrame(features_data, columns=features_keys)
	return features_dataframe

In [29]:
features_dataframe = get_features_dataframe(features)

In [30]:
def get_props_table(features_dataframe):
	'''
	:param features_dataframe:DataFrame
	:return:props_table:dict
	'''
	props_table = {}  # table of props,which aim to trans prop to num

	for prop_key in features_dataframe.columns:
		# print(set(features_dataframe[prop_key]))
		props = sorted(set(features_dataframe[prop_key]))
		print(props)
		props_table[prop_key] = props
	return props_table

In [31]:
features_dataframe

Unnamed: 0,index1,index2,preffix1,preffix2,suffix1,suffix2,istitle1,istitle2,pos_tag1,pos_tag2,pre_istitle1,pre_pos_tag1,post_istitle2,post_pos_tag2
0,8,20,in,in,on,on,False,False,NN,NN,False,JJ,False,NNS
1,1,7,Pe,do,le,wn,True,False,NNS,NN,,,False,NN
2,2,7,la,pl,te,le,False,False,NN,NN,True,``,False,NN
3,13,21,so,fl,nt,sk,False,False,JJ,NN,False,DT,False,IN


In [32]:
props_table = get_props_table(features_dataframe)

[1, 2, 8, 13]
[7, 20, 21]
['Pe', 'in', 'la', 'so']
['do', 'fl', 'in', 'pl']
['le', 'nt', 'on', 'te']
['le', 'on', 'sk', 'wn']
['False', 'True']
['False']
['JJ', 'NN', 'NNS']
['NN']
['False', 'None', 'True']
['DT', 'JJ', 'None', '``']
['False']
['IN', 'NN', 'NNS']


In [37]:
def props2nums(props_table, values_of_props, prop_key):
	'''
	:param props_table:dict
	:param values_of_props:list or Series
	:param prop_key:str
	:return:nums_of_props:list
	'''
	nums_of_props = []
	for value in values_of_props:
		nums_of_props.append(props_table[prop_key].index(value)+1)
	return nums_of_props

def features2nums(features_dataframe, props_table):
	'''
	:param features_dataframe:DataFrame
	:param props_table:dict
	:return:nums_of_features:DataFrame
	'''
	nums_of_features = pd.DataFrame(columns=features_dataframe.columns)
	for prop_key in features_dataframe.columns:
		values_of_props = features_dataframe[prop_key]
		nums_of_props = props2nums(props_table, values_of_props, prop_key)
		nums_of_features[prop_key] = nums_of_props
	return nums_of_features

In [38]:
nums_of_features = features2nums(features_dataframe, props_table)

In [39]:
nums_of_features

Unnamed: 0,index1,index2,preffix1,preffix2,suffix1,suffix2,istitle1,istitle2,pos_tag1,pos_tag2,pre_istitle1,pre_pos_tag1,post_istitle2,post_pos_tag2
0,3,2,2,3,3,2,1,1,2,1,1,2,1,3
1,1,1,1,1,1,4,2,1,3,1,2,3,1,2
2,2,1,3,4,4,1,1,1,2,1,3,4,1,2
3,4,3,4,2,2,3,1,1,1,1,1,1,1,1


In [41]:
def normalize(nums_of_features):
	normalized_features = pd.DataFrame(columns=nums_of_features.columns)
	for key in nums_of_features.columns:
		a = nums_of_features[key] - nums_of_features[key].mean()
		b = nums_of_features[key].std()
		if b == 0:
			normalized_features[key] = 0
			continue
		normalized_features[key] = a / b
	return normalized_features

In [42]:
normalized_features=normalize(nums_of_features)
text = []
for row in normalized_features.index:
    row_str = [str(x) for x in list(normalized_features.loc[row])]
    text.append(delimiter.join(row_str))

In [43]:
text

['0.3872983346207417\t0.26111648393354675\t-0.3872983346207417\t0.3872983346207417\t0.3872983346207417\t-0.3872983346207417\t-0.5\t0.0\t0.0\t0.0\t-0.7833494518006403\t-0.3872983346207417\t0.0\t1.224744871391589',
 '-1.161895003862225\t-0.7833494518006403\t-1.161895003862225\t-1.161895003862225\t-1.161895003862225\t1.161895003862225\t1.5\t0.0\t1.224744871391589\t0.0\t0.26111648393354675\t0.3872983346207417\t0.0\t0.0',
 '-0.3872983346207417\t-0.7833494518006403\t0.3872983346207417\t1.161895003862225\t1.161895003862225\t-1.161895003862225\t-0.5\t0.0\t0.0\t0.0\t1.3055824196677337\t1.161895003862225\t0.0\t0.0',
 '1.161895003862225\t1.3055824196677337\t1.161895003862225\t-0.3872983346207417\t-0.3872983346207417\t0.3872983346207417\t-0.5\t0.0\t-1.224744871391589\t0.0\t-0.7833494518006403\t-1.161895003862225\t0.0\t-1.224744871391589']

In [1]:
pwd

'E:\\workspace\\github\\DJH-SemEval2010_task8'

In [3]:
train_text=['1.295553514642969\t1.0080122384353334\t-0.8439955322160048\t-0.8047064057572209\t0.5939622222364747\t1.3413298606407469\t-0.21001974711693366\t-0.022364874106437876\t-0.4356138843362755\t1.055384844334442\t-0.7162597422505553\t-0.7868193289548033\t-0.10530481009138977\t-1.0381538789748221', '-0.7394078291960542\t-0.10748257297437272\t-0.8886093828750715\t-1.0974622056067467\t-0.15569991803156233\t-0.1530853393383667\t-0.21001974711693366\t-0.022364874106437876\t-0.4356138843362755\t-0.4365887825741174\t1.472463687086512\t1.3952856696631892\t-0.10530481009138977\t-0.24008920613187676', '-0.7394078291960542\t-0.47931417677760807\t-1.1116786361704054\t-1.0161411500929896\t0.6475095179699059\t-0.98541785578243\t-0.21001974711693366\t-0.022364874106437876\t-0.4356138843362755\t-0.4365887825741174\t1.472463687086512\t1.3952856696631892\t-0.10530481009138977\t1.3560401395540143', '-0.5544113433925066\t-0.6652299786792257\t0.9405584941466664\t1.1957915598812061\t-0.7982674668327369\t-0.7962513747724156\t-0.21001974711693366\t-0.022364874106437876\t-0.4356138843362755\t-0.4365887825741174\t-0.7162597422505553\t-0.6958982873457202\t-0.10530481009138977\t-1.0381538789748221', '-0.7394078291960542\t-1.4088931862856964\t1.1784990309950225\t-1.4227464276617756\t0.344074842147129\t0.5846639366006894\t-0.21001974711693366\t-0.022364874106437876\t-0.4356138843362755\t-0.4365887825741174\t1.472463687086512\t1.3952856696631892\t-0.10530481009138977\t2.0400955734193964']
key_text=['4', '17', '12', '17', '13']