In [1]:
import matplotlib.pyplot as plt
import os
import re
import shutil
import string
import tensorflow as tf

## 이번에는 영화 리뷰를 기반으로 감성 분석을 진행해볼 것이다
 사용할 데이터는 IMDB 데이터셋으로 train 2.5만 개, test 2.5만 개로 구성되어 있으며, 긍정적인 리뷰와 부정적인 리뷰의 수가 동일하도록 구성되어 있다.

In [2]:
# Download the IMDB review archive into the current directory and point
# dataset_dir at the 'aclImdb' folder next to the returned path.
# NOTE(review): the cache name 'aclIndb_v1' looks like a typo for 'aclImdb_v1'
# (compare with the URL); it is kept as-is here -- confirm before renaming.
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

dataset = tf.keras.utils.get_file(
    fname='aclIndb_v1',
    origin=url,
    untar=True,
    cache_dir='.',
    cache_subdir='',
)
dataset_dir = os.path.join(os.path.dirname(dataset), 'aclImdb')

Downloading data from https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [3]:
# Show the top-level entries of the extracted dataset folder.
os.listdir(dataset_dir)

['imdb.vocab', 'train', 'test', 'README', 'imdbEr.txt']

In [4]:
# Build the paths of the train/test splits, then display what each one contains.
train_dir, test_dir = (
    os.path.join(dataset_dir, split_name) for split_name in ('train', 'test')
)

(os.listdir(train_dir), os.listdir(test_dir))

(['neg',
  'unsupBow.feat',
  'unsup',
  'urls_neg.txt',
  'urls_pos.txt',
  'labeledBow.feat',
  'urls_unsup.txt',
  'pos'],
 ['neg', 'urls_neg.txt', 'urls_pos.txt', 'labeledBow.feat', 'pos'])

In [5]:
# Print one positive training review, then count the positive training reviews.
# Path components are joined separately so no separator is hard-coded.
sample_file = os.path.join(train_dir, 'pos', '1181_9.txt')
# Explicit encoding: without it open() falls back to the platform's default
# locale encoding, which is not guaranteed to be UTF-8 on every system.
with open(sample_file, encoding='utf-8') as f:
  print(f.read())


print(len(os.listdir(os.path.join(train_dir, 'pos'))))

Rachel Griffiths writes and directs this award winning short film. A heartwarming story about coping with grief and cherishing the memory of those we've loved and lost. Although, only 15 minutes long, Griffiths manages to capture so much emotion and truth onto film in the short space of time. Bud Tingwell gives a touching performance as Will, a widower struggling to cope with his wife's death. Will is confronted by the harsh reality of loneliness and helplessness as he proceeds to take care of Ruth's pet cow, Tulip. The film displays the grief and responsibility one feels for those they have loved and lost. Good cinematography, great direction, and superbly acted. It will bring tears to all those who have lost a loved one, and survived.
12500


In [6]:
# Delete the 'unsup' folder from the training split so that only the
# 'neg' and 'pos' sub-folders are left as class directories.
remove_dir = os.path.join(train_dir, 'unsup')
# os.rmdir(remove_dir) cannot be used here: it fails when the folder still
# contains files, so the whole tree is removed with shutil.rmtree instead.
# The isdir guard keeps the cell re-runnable: once the folder is gone,
# an unguarded rmtree would raise FileNotFoundError.
if os.path.isdir(remove_dir):
  shutil.rmtree(remove_dir)

In [7]:
# Shared loading settings; the same batch size and seed are reused by later cells.
batch_size = 32
seed = 42

# Training subset of a 0.2 validation split over the labeled training folder.
raw_train_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='training',
    seed=seed,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [8]:
# Print the first three (review, label) pairs of a single batch.
for text_batch, label_batch in raw_train_ds.take(1):
  # Convert each tensor once instead of on every loop iteration.
  reviews = text_batch.numpy()
  labels = label_batch.numpy()
  for idx in range(3):
    print('review', reviews[idx])
    print('label', labels[idx])

review b'"Pandemonium" is a horror movie spoof that comes off more stupid than funny. Believe me when I tell you, I love comedies. Especially comedy spoofs. "Airplane", "The Naked Gun" trilogy, "Blazing Saddles", "High Anxiety", and "Spaceballs" are some of my favorite comedies that spoof a particular genre. "Pandemonium" is not up there with those films. Most of the scenes in this movie had me sitting there in stunned silence because the movie wasn\'t all that funny. There are a few laughs in the film, but when you watch a comedy, you expect to laugh a lot more than a few times and that\'s all this film has going for it. Geez, "Scream" had more laughs than this film and that was more of a horror film. How bizarre is that?<br /><br />*1/2 (out of four)'
label 0
review b"David Mamet is a very interesting and a very un-equal director. His first movie 'House of Games' was the one I liked best, and it set a series of films with characters whose perspective of life changes as they get into 

In [9]:
# Show which class name each integer label stands for.
for label_id in (0, 1):
  print(f'label {label_id} means -', raw_train_ds.class_names[label_id])

label 0 means - neg
label 1 means - pos


In [10]:
# In fact, passing subset='both' to this call returns the train and validation
# datasets in a single call.
# Validation subset: same directory, validation_split and seed as the
# training dataset above.
raw_val_ds = tf.keras.utils.text_dataset_from_directory(
    train_dir,
    batch_size=batch_size,
    validation_split=0.2,
    subset='validation',
    seed=seed,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [11]:
# The test folder is loaded whole: no validation split is applied to it.
raw_test_ds = tf.keras.utils.text_dataset_from_directory(
    test_dir,
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.


In [21]:
# Author's note: unless more customization is added here, this is the same as
# passing standardize='lower_and_strip_punctuation' to TextVectorization.
# NOTE(review): this function also replaces '<br />' with a space before
# stripping punctuation -- confirm whether the built-in mode really matches.
def custom_standardization(input_data):
  """Normalize raw review text before tokenization.

  Steps, in order: lowercase the input, replace every '<br />' with a single
  space, then delete every character listed in string.punctuation.

  Args:
    input_data: string tensor holding the raw review text.

  Returns:
    The result of the final tf.strings.regex_replace call.
  """
  # Character class matching any single ASCII punctuation character.
  punctuation_pattern = f'[{re.escape(string.punctuation)}]'
  text = tf.strings.lower(input_data)
  text = tf.strings.regex_replace(text, '<br />', ' ')
  return tf.strings.regex_replace(text, punctuation_pattern, '')

In [22]:
# Plain-Python demo of the punctuation-stripping pattern on a sample string.
s = "Alice (2252), Responses (3rd), 2018 Cens - 2.5% Sample"
punctuation_class = '[' + re.escape(string.punctuation) + ']'
out = re.compile(punctuation_class).sub('', s)
print(out)

Alice 2252 Responses 3rd 2018 Cens  25 Sample


In [24]:
# Vocabulary size cap and fixed output length used by the vectorization layer
# (max_features is reused later when sizing the embedding).
max_features = 10000
sequence_length = 250

# Text -> integer-sequence layer using the custom standardization function.
vectorize_layer = tf.keras.layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length,
)

In [25]:
# Keep only the text of each (text, label) pair and adapt the layer on it.
train_text = raw_train_ds.map(lambda review, _label: review)
vectorize_layer.adapt(train_text)

In [27]:
import numpy as np

In [48]:
# k =np.array([[1,2,3],[4,5,6]])
# print(k.shape)
# tf.expand_dims(k,-1)

In [49]:
def vectorize_text(text, label):
  """Run `text` through vectorize_layer and pass `label` through unchanged.

  A trailing axis is appended to `text` with tf.expand_dims before the layer
  is applied.

  Args:
    text: string tensor of review text.
    label: label(s) paired with `text`; returned as-is.

  Returns:
    A (vectorized_text, label) tuple.
  """
  text_column = tf.expand_dims(text, axis=-1)
  return vectorize_layer(text_column), label

In [50]:
# Take the first review of one batch and show it raw, with its class name,
# and after vectorization.
text_batch, label_batch = next(iter(raw_train_ds))
first_review = text_batch[0]
first_label = label_batch[0]
print('review', first_review)
print('label', raw_train_ds.class_names[first_label])
print('vectorized review', vectorize_text(first_review, first_label))

review tf.Tensor(b'Great movie - especially the music - Etta James - "At Last". This speaks volumes when you have finally found that special someone.', shape=(), dtype=string)
label neg
vectorized review (<tf.Tensor: shape=(1, 250), dtype=int64, numpy=
array([[  86,   17,  260,    2,  222,    1,  571,   31,  229,   11, 2418,
           1,   51,   22,   25,  404,  251,   12,  306,  282,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
       

In [51]:
# Show the vocabulary entry stored at index 1300 of the adapted layer.
print(vectorize_layer.get_vocabulary()[1300])

presence


In [52]:
# Apply vectorize_text to every split.
train_ds, val_ds, test_ds = (
    raw_split.map(vectorize_text)
    for raw_split in (raw_train_ds, raw_val_ds, raw_test_ds)
)

In [54]:

autotune = tf.data.AUTOTUNE

# Cache each split and prefetch with an automatically tuned buffer size.
train_ds, val_ds, test_ds = (
    split.cache().prefetch(buffer_size=autotune)
    for split in (train_ds, val_ds, test_ds)
)

In [56]:
embedding_dim = 16

# Embedding -> dropout -> average over the sequence axis -> dropout -> one
# output unit with no activation.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features+1, embedding_dim))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(1))
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 16)          160016    
                                                                 
 dropout_1 (Dropout)         (None, None, 16)          0         
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 16)                0         
                                                                 
 dense (Dense)               (None, 1)                 17        
                                                                 
Total params: 160,033
Trainable params: 160,033
Non-trainable params: 0
__________________________________________________

In [57]:
# The final Dense(1) layer has no activation, so the model outputs logits:
# the loss is told so via from_logits=True, and the accuracy threshold is 0.0
# on the logit scale (sigmoid(0) = 0.5).
# `metrics` is passed as a list, which is the documented form; a bare metric
# object is reportedly rejected by newer Keras releases -- confirm against
# the installed version.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=[tf.metrics.BinaryAccuracy(threshold=0.0)])

In [59]:
# Train for a fixed number of epochs, evaluating on the validation split each
# epoch; the returned object is kept in `history`.
epochs = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=epochs,
)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
