-
Notifications
You must be signed in to change notification settings - Fork 0
/
glove_predict.py
114 lines (89 loc) · 4 KB
/
glove_predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import numpy as np
import pandas as pd
from scipy import linalg
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import argparse
from utils import tokenize
def predict_answers(data, word2vec, N):
    """Predict the answer letter for every question in *data*.

    For each row, picks the answer (A/B/C/D) whose averaged GloVe vector
    has the highest cosine similarity with the question vector.

    Parameters:
        data: DataFrame with 'question' and 'answerA'..'answerD' columns.
        word2vec: dict mapping lowercase word -> length-N embedding vector.
        N: embedding dimensionality.

    Returns:
        list of predicted answer letters, one per row of *data*.
    """
    letters = ["A", "B", "C", "D"]
    return [letters[row_scores.argmax()]
            for row_scores in get_glove_features(data, word2vec, N)]
def get_glove_features(data, word2vec, N):
    """Compute cosine-similarity features between each question and its answers.

    For every row of *data*, builds a bag-of-words GloVe vector for the
    question and for each of the four answers (sum of embeddings of
    non-stopword tokens, L2-normalized), and returns the dot product of
    the question vector with each answer vector.

    Parameters:
        data: DataFrame with 'question' and 'answerA'..'answerD' columns.
        word2vec: dict mapping lowercase word -> length-N numpy vector.
        N: embedding dimensionality.

    Returns:
        list of length-4 numpy arrays [score_A, score_B, score_C, score_D],
        one per row of *data*.
    """
    # stopwords.words() returns a list; use a set for O(1) membership tests
    # inside the per-token loop.
    stop = set(stopwords.words('english'))

    def _text_vec(text):
        # Sum embeddings of known, non-stopword tokens, then L2-normalize.
        vec = np.zeros(N)
        for w in tokenize(text):
            lw = w.lower()  # lowercase once per token
            if lw in word2vec and lw not in stop:
                vec += word2vec[lw]
        norm = linalg.norm(vec)
        # BUG FIX: the original divided unconditionally, producing NaNs when
        # no token matched (all stopwords / out-of-vocabulary); those NaNs
        # silently corrupted the downstream feature file. Keep a zero vector
        # instead, which yields a neutral score of 0.
        return vec / norm if norm > 0 else vec

    scores = []
    for i in range(data.shape[0]):
        q_vec = _text_vec(data['question'][i])
        # One normalized vector per answer, stacked into a (4, N) matrix.
        ans_mat = np.vstack([_text_vec(data['answer' + c][i]) for c in 'ABCD'])
        scores.append(ans_mat.dot(q_vec))
    return scores
if __name__ == '__main__':
    # Parse command-line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument('--fname', type=str, default='training_set.tsv', help='file name with data')
    parser.add_argument('--N', type=int, default=300, help='embeding size (50, 100, 200, 300 only)')
    args = parser.parse_args()

    # Read the question/answer data (tab-separated).
    data = pd.read_csv('data/' + args.fname, sep='\t')

    # Load the pre-trained GloVe vectors: one "word v1 v2 ... vN" line each.
    print("reading glove...")
    word2vec = {}
    with open("data/glove/glove.6B." + str(args.N) + "d.txt") as f:
        for line in f:
            l = line.split()
            # BUG FIX: the original stored map(float, ...), which under
            # Python 3 is a lazy single-use iterator — vector arithmetic in
            # get_glove_features would fail (or silently yield garbage after
            # the first use). Materialize each embedding as a float ndarray.
            word2vec[l[0]] = np.asarray(l[1:], dtype=float)

    # Compute and save the per-answer cosine-similarity features.
    print('saving features...')
    features = np.array(get_glove_features(data, word2vec, args.N))
    pd.DataFrame({'id': list(data['id']),
                  'fA': features[:, 0],
                  'fB': features[:, 1],
                  'fC': features[:, 2],
                  'fD': features[:, 3]})[['id', 'fA', 'fB', 'fC', 'fD']].to_csv(
        'features_glove.csv', index=False)
    print("done...")