In [1]:
# 用Conv1D对IMDB电影评论数据集做文本分类

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Load the labelled reviews from a tab-separated file into a DataFrame.
filepath = '../jupyter_files/labeledTrainData.tsv'
df = pd.read_csv(filepath, delimiter='\t')

In [4]:
# Peek at the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# Split the data by row position: the first 20,000 rows are used for
# training, rows 20,000 through 24,999 are held out for validation.
train = df.iloc[:20000]
validation = df.iloc[20000:25000]

In [6]:
# Build the word -> integer-token vocabulary from the training reviews only,
# so nothing from the validation split leaks into the vocabulary.
# Per the Keras docs, `num_words` caps the vocabulary applied when texts are
# converted: only the `num_words - 1` most frequent words are kept.
max_words = 10_000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train['review'])

In [7]:
# Convert every review in both splits into a list of integer tokens using
# the vocabulary fitted on the training reviews.
train_sequences, validation_sequences = (
    tokenizer.texts_to_sequences(frame['review'])
    for frame in (train, validation)
)

In [8]:
# Bring every review to the same length so the batches are rectangular:
# shorter sequences are padded at the end, longer ones are cut at the end.
maxlen = 300
train_features, validation_features = (
    pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post')
    for sequences in (train_sequences, validation_sequences)
)

In [9]:
# 训练一个1维卷积网络
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers

In [10]:
# Training hyperparameters.
batch_size = 128      # samples per gradient update
num_epochs = 10       # full passes over the training data
embedding_dim = 128   # size of each learned word vector

In [11]:
# 1D convolutional sentiment classifier:
#   token ids -> embeddings -> Conv1D -> MaxPool -> Conv1D -> global max -> P(positive)
model = Sequential()
# Map each token id (0 .. max_words - 1) to a trainable embedding_dim-vector.
model.add(layers.Embedding(input_dim=max_words,
                    output_dim=embedding_dim,
                    input_length=maxlen))
# 32 filters, each looking at a window of 7 consecutive tokens.
model.add(layers.Conv1D(32, 7, activation='relu'))
# Downsample the sequence axis by a factor of 5.
model.add(layers.MaxPooling1D(5))
model.add(layers.Conv1D(32, 7, activation='relu'))
# Collapse the sequence axis: keep the strongest response of every filter.
model.add(layers.GlobalMaxPooling1D())
# The output must be a probability in [0, 1]: the string loss
# 'binary_crossentropy' uses from_logits=False, and the accuracy metric
# thresholds predictions at 0.5. Without the sigmoid the layer would emit
# unbounded logits, so both the loss and the reported accuracy would be wrong.
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['acc'])

In [12]:
# Train the network; the held-out validation split is passed as
# validation_data. The object returned by fit() is kept in `history`.
history = model.fit(
    x=train_features,
    y=train['sentiment'],
    batch_size=batch_size,
    epochs=num_epochs,
    validation_data=(validation_features, validation['sentiment']),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
