In [57]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model, Sequential
from keras.layers import Convolution1D
from keras import initializers, regularizers, constraints, optimizers, layers
import pandas as pd
import numpy as np

In [8]:
# load train dataset and test dataset
file = 'no_head_tain.csv'
test = 'no_head_test.csv'
df = pd.read_csv(file, names=['reviews', 'sentiment'])
df_test = pd.read_csv(test, header = 0, names=['reviews', 'sentiment'])

In [41]:
# train model
max_features = 6000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(df['reviews'])
list_tokenized_train = tokenizer.texts_to_sequences(df['reviews'])

maxlen = 130
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
y = df['sentiment']

# define the structure
embed_size = 128
model = Sequential()
model.add(Embedding(max_features, embed_size))
model.add(Bidirectional(LSTM(32, return_sequences = True)))
model.add(GlobalMaxPool1D())
model.add(Dense(20, activation="relu"))
model.add(Dropout(0.05))
model.add(Dense(1, activation="sigmoid"))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

batch_size = 100
epochs = 10
history = model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)



Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# test model
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import roc_auc_score

y_test = df_test["sentiment"]
list_sentences_test = df_test["reviews"]
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)
prediction = model.predict(X_te)
y_pred = (prediction > 0.5)

pct_auc = roc_auc_score(y_pred, y_test) * 100
print('{:0.2f}'.format(pct_auc))
print('F1-score: {0}'.format(f1_score(y_pred, y_test)))
print('Confusion matrix:')
confusion_matrix(y_pred, y_test)

77.23
F1-score: 0.7829734005371399
Confusion matrix:


array([[ 8459,  1858],
       [ 4041, 10641]], dtype=int64)

In [55]:
import sys
import matplotlib as plt
plt.use('Agg')
import matplotlib.pyplot as plt
from matplotlib import *
from sklearn.metrics import roc_curve, auc
# score = history.decision_function(X_te)
                                 
# fpr,tpr,threshold = roc_curve(y_test, score) ###计算真正率和假正率
# roc_auc = auc(fpr,tpr) ###计算auc的值)

# plt.figure()
# lw = 2
# plt.figure(figsize=(10,10))
# plt.plot(fpr, tpr, color='darkorange',
#          lw=lw, label='ROC curve (area = %0.2f)' % roc_auc) ###假正率为横坐标，真正率为纵坐标做曲线
# plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver operating characteristic example')
# plt.legend(loc="lower right")
# plt.show()

# 二分类　ＲＯＣ曲线
# roc_curve:真正率（True Positive Rate , TPR）或灵敏度（sensitivity）
# 横坐标：假正率（False Positive Rate , FPR）
fpr, tpr, thresholds_keras = roc_curve(y_test, y_pred)
auc = auc(fpr, tpr)
print("AUC : ", auc)
# plt.figure()
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Keras (area = {:.3f})'.format(auc))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
# plt.savefig("../images/ROC/ROC_Binary.png")
plt.show()

AUC :  0.764034053924314




In [40]:
import tkinter
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as pyplot

pyplot.plot(history.history[ 'loss' ])
pyplot.plot(history.history[ 'val_loss' ])
pyplot.title( 'model train vs validation loss' )
pyplot.ylabel( 'loss' )
pyplot.xlabel( 'epoch' )
pyplot.legend([ 'train' , 'validation' ], loc= 'upper right' )
pyplot.show()


In [58]:
plt.imshow(confusion_matrix(y_pred, y_test),interpolation='nearest',cmap=plt.cm.Paired)
plt.title('Confusion Matrix')
plt.colorbar()
tick_marks=np.arange(4)
plt.xticks(tick_marks,tick_marks)
plt.yticks(tick_marks,tick_marks)
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()


  if __name__ == '__main__':
