In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

import numpy as np
import pandas as pd
from scipy import stats
from gensim.models import Word2Vec
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.utils import resample

In [3]:
df = pd.DataFrame()
df = pd.read_csv('data/amr-bank-struct-v1.6-training.csv', encoding = 'utf-8')
df.head()

Unnamed: 0,isfocus,focus,word,normalized_words,index,POS,-1POS,-2POS,+1POS,+2POS
0,1,chapter,Chapter,chapter,0,NN,XXXX,XXXX,CD,.
1,0,chapter,7,7,1,CD,NN,XXXX,.,XXXX
2,0,chapter,.,.,2,.,CD,NN,XXXX,XXXX
3,0,reveal,On,on,0,IN,XXXX,XXXX,DT,JJ
4,0,reveal,the,the,1,DT,IN,XXXX,JJ,NN


In [4]:
x = df.loc[:,['word','normalized_words','isfocus','index','POS','-1POS','-2POS','+1POS','+2POS']]
x.head()

Unnamed: 0,word,normalized_words,isfocus,index,POS,-1POS,-2POS,+1POS,+2POS
0,Chapter,chapter,1,0,NN,XXXX,XXXX,CD,.
1,7,7,0,1,CD,NN,XXXX,.,XXXX
2,.,.,0,2,.,CD,NN,XXXX,XXXX
3,On,on,0,0,IN,XXXX,XXXX,DT,JJ
4,the,the,0,1,DT,IN,XXXX,JJ,NN


In [706]:
# x['normalized_words'] = x['normalized_words']+' '+x['index'].astype(str)+' '+x['POS']+' '+x['-1POS']+' '+x['-1POS']+' '+x['-2POS']+' '+x['+1POS']+' '+x['+2POS']
x['normalized_words'] = x['normalized_words']+' '+x['index'].astype(str)+' '+x['POS']+' '+x['-1POS']+' '+x['-1POS']+' '+x['-2POS']+' '+x['+1POS']+' '+x['+2POS']
x = x.iloc[0:10000,0:2]



In [707]:
df_0 = x[df.isfocus == 0]
df_1 = x[df.isfocus == 1]

df_majority_downsampled = resample(df_0, 
                                 replace=False,
                                 n_samples=df_1.size,     
                                 random_state=123) 

df_downsampled = pd.concat([df_majority_downsampled, df_1])

X = df_downsampled['normalized_words'].values
y = df_downsampled['isfocus'].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5,
                                                    random_state = 1234,stratify= y)

In [708]:
from tensorflow.python.keras.preprocessing.text import Tokenizer
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
tokenizer_obj = Tokenizer()
total_words = X_train +X_test
tokenizer_obj.fit_on_texts(total_words)

max_length = max([len(s.split()) for s in total_words])

vocab_size =len(tokenizer_obj.word_index)+1

X_train_tokens = tokenizer_obj.texts_to_sequences(X_train)
X_test_tokens = tokenizer_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens, maxlen = max_length, padding = 'post')
X_test_pad = pad_sequences(X_test_tokens, maxlen = max_length, padding ='post')

In [709]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,GRU
from keras.models import Sequential
from keras.layers.embeddings import Embedding

EMBEDDING_DIM =100

model=Sequential()
model.add(Embedding(vocab_size, EMBEDDING_DIM, input_length = max_length))
model.add(GRU(units=32, dropout =0.5, recurrent_dropout=0.5))
model.add(Dense(1,activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_67 (Embedding)     (None, 15, 100)           83900     
_________________________________________________________________
gru_64 (GRU)                 (None, 32)                12768     
_________________________________________________________________
dense_64 (Dense)             (None, 1)                 33        
Total params: 96,701
Trainable params: 96,701
Non-trainable params: 0
_________________________________________________________________
None


In [710]:
model.fit(X_train_pad, y_train, batch_size=128, epochs =50, validation_data= (X_test_pad, 
                                                                            y_test),verbose=2)


Train on 813 samples, validate on 813 samples
Epoch 1/50
 - 8s - loss: 0.6803 - acc: 0.6125 - val_loss: 0.6577 - val_acc: 0.6667
Epoch 2/50
 - 0s - loss: 0.6487 - acc: 0.6667 - val_loss: 0.6386 - val_acc: 0.6667
Epoch 3/50
 - 0s - loss: 0.6379 - acc: 0.6667 - val_loss: 0.6371 - val_acc: 0.6667
Epoch 4/50
 - 0s - loss: 0.6383 - acc: 0.6667 - val_loss: 0.6372 - val_acc: 0.6667
Epoch 5/50
 - 0s - loss: 0.6406 - acc: 0.6667 - val_loss: 0.6364 - val_acc: 0.6667
Epoch 6/50
 - 0s - loss: 0.6375 - acc: 0.6667 - val_loss: 0.6363 - val_acc: 0.6667
Epoch 7/50
 - 0s - loss: 0.6390 - acc: 0.6667 - val_loss: 0.6363 - val_acc: 0.6667
Epoch 8/50
 - 0s - loss: 0.6389 - acc: 0.6667 - val_loss: 0.6362 - val_acc: 0.6667
Epoch 9/50
 - 0s - loss: 0.6386 - acc: 0.6667 - val_loss: 0.6362 - val_acc: 0.6667
Epoch 10/50
 - 0s - loss: 0.6349 - acc: 0.6667 - val_loss: 0.6362 - val_acc: 0.6667
Epoch 11/50
 - 0s - loss: 0.6382 - acc: 0.6667 - val_loss: 0.6361 - val_acc: 0.6667
Epoch 12/50
 - 0s - loss: 0.6371 - acc:

<keras.callbacks.History at 0x1a864a7588>

In [711]:
y_pred = model.predict(x = X_test_pad)
for i in range(len(y_pred)):
    if y_pred[i][0] >= 0.9:
        y_pred[i][0] = 1
    else:
        y_pred[i][0] = 0

In [712]:
from sklearn.metrics import f1_score
print ('f1_score: ',f1_score(y_true=y_test, y_pred=y_pred))
cm_svm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print (cm_svm)
def accuracy(cm):
    return np.trace(cm)/np.sum(cm)
print (accuracy(cm_svm))


f1_score:  0.7226890756302521
[[509  33]
 [ 99 172]]
0.8376383763837638


In [730]:
from keras.utils.vis_utils import plot_model
# test_sample_1 = 'say 9 VBD NN NN DT , ``'
# test_sample_2 = 'prince 9 NN JJ JJ DT , CC'
# test_sample_3 = 'reveal 26 VBN VBD VBD NN TO PRP'
# test_sample_4 = 'chapter 0 NN XXXX XXXX XXXX CD .'
# test_sample_5 = 'little 21 JJ DT DT IN NN POS'
# test_sample_6 = 'i 1 PRP `` XXXX VBP RB'

test_sample_1 = 'see 6 VBN VBN VBN RB IN DT'
test_sample_2 = 'present 8 VBN VBD VBD NN PRP TO'
test_sample_3 = 'account 24 NN IN IN VBZ IN DT'
test_sample_4 = 'he 1 PRP CC CC XXXX VBD IN'
test_sample_5 = 'in 3 IN VBD VBD PRP JJ NN'
test_sample_6 = 'of 23 IN NN NN IN NN ,'
test_samples=[test_sample_1,test_sample_2,test_sample_3,test_sample_4,test_sample_5,test_sample_6]
test_samples_tokens = tokenizer_obj.texts_to_sequences (test_samples)
test_samples_tokens_pad = pad_sequences(test_samples_tokens, maxlen=max_length)

model.predict(x=test_samples_tokens_pad)


    



array([[0.00641237],
       [0.0059375 ],
       [0.00600799],
       [0.0052671 ],
       [0.00553304],
       [0.00549879]], dtype=float32)

SyntaxError: invalid syntax (<ipython-input-299-16b4a43ecc59>, line 5)