In [1]:
import os
import tarfile
import urllib.request
import tensorflow as tf
import numpy as np
import re
import string
from random import randint



In [2]:
url = 'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
filepath = "data/aclImdb_v1.tar.gz"

In [3]:
if not os.path.exists("data"):
    os.makedirs("data")
if not os.path.isfile(filepath):
    print("downloading file")
    resutl = urllib.request.urlretrieve(url,filepath)
    print("Downloaded:", resutl)
else:
    print(filepath,"exists!")

data/aclImdb_v1.tar.gz exists!


In [4]:
if not os.path.exists("data/aclImdb"):
    tfile = tarfile.open(filepath,"r:gz")
    print("Extracting data...")
    result = tfile.extractall("data/")
    print("Extraction completed successfully")
else:
    print("data/aclImdb exists!")

data/aclImdb exists!


In [5]:
def remove_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('',text)


In [6]:
def read_files(filetype):
    path='data/aclImdb/'
    file_list=[]
    # read positive reviews
    positive_path=path+filetype+'/pos/'
    for f in os.listdir(positive_path):
        file_list+=[positive_path+f]
    pos_files_num=len(file_list)
    # read negative reviews
    negative_path=path+filetype+'/neg/'
    for  f in os.listdir(negative_path):
        file_list+=[negative_path+f]
    neg_files_num=len(file_list)-pos_files_num
    
    print('read',filetype,'files:',len(file_list))
    print(pos_files_num,'pos files in',filetype,'files')
    print(neg_files_num,'neg files in',filetype,'files')
    # one encoding per file
    all_labels=([[1,0]]*pos_files_num+[[0,1]]*neg_files_num)
    
    # all text
    all_texts=[]
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts+=[remove_tags(''.join(file_input.readlines()))]
    return all_labels,all_texts
    

In [7]:
train_labels, train_texts = read_files("train")
test_labels, test_texts = read_files('test')


read train files: 25000
12500 pos files in train files
12500 neg files in train files
read test files: 25000
12500 pos files in test files
12500 neg files in test files


In [8]:
print("train set, positive, text:", train_texts[0])
print("train set, positive, label:", train_labels[0])

train set, positive, text: Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!
train set, positive, label: [1, 0]


# Create the dictionary token

In [9]:
token = tf.keras.preprocessing.text.Tokenizer(num_words=4000)


In [10]:
token.fit_on_texts(train_texts)


In [11]:
token.document_count

25000

In [12]:
# The dictionary returned with a frequency-decreasing order
#print(token.word_index)

In [13]:
# The number of documents where the word exists
token.word_docs

defaultdict(int,
            {'financially': 28,
             'ran': 225,
             'pettiness': 2,
             'your': 4266,
             'comedy': 2337,
             'when': 9027,
             'time': 8719,
             'welcome': 204,
             'a': 24173,
             'me': 7329,
             'as': 16116,
             'insightful': 63,
             'the': 24792,
             'some': 9644,
             'age': 966,
             'profession': 61,
             'my': 8106,
             'survive': 239,
             'one': 14096,
             'students': 276,
             'schools': 56,
             'satire': 209,
             'at': 12936,
             'other': 6679,
             "i'm": 3655,
             'adults': 330,
             'high': 1837,
             'believe': 2209,
             'programs': 62,
             'knew': 822,
             'life': 4576,
             'which': 7572,
             'whole': 2641,
             'in': 22036,
             'saw': 2756,
             'immed

In [14]:
# The word frequencies
# print(token.word_counts)

In [15]:
# Transfer text to sequences

train_sequences = token.texts_to_sequences(train_texts)
test_sequences = token.texts_to_sequences(test_texts)


In [16]:
print(train_texts[0])

Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!


In [17]:
print(train_sequences[0])


[308, 6, 3, 1068, 208, 8, 2160, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 2000, 6, 72, 2422, 5, 631, 70, 6, 1, 5, 2001, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 3021, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 3710, 798, 5, 3509, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 2970, 142, 129, 5, 27, 4, 125, 1470, 2372, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 2231, 11, 8, 214]


In [18]:
x_train = tf.keras.preprocessing.sequence.pad_sequences(train_sequences,padding='post',truncating='post',maxlen=400)
x_test = tf.keras.preprocessing.sequence.pad_sequences(test_sequences,padding='post',truncating='post',maxlen=400)
y_train = np.array(train_labels)
y_test = np.array(test_labels)


In [19]:
x_train.shape

(25000, 400)

In [20]:
print(x_train[0])

[ 308    6    3 1068  208    8 2160   29    1  168   54   13   45   81
   40  391  109  137   13   57  149    7    1  481   68    5  260   11
 2000    6   72 2422    5  631   70    6    1    5 2001    1 1530   33
   66   63  204  139   64 1229    1    4    1  222  899   28 3021   68
    4    1    9  693    2   64 1530   50    9  215    1  386    7   59
    3 1470 3710  798    5 3509  176    1  391    9 1235   29  308    3
  352  343 2970  142  129    5   27    4  125 1470 2372    5  308    9
  532   11  107 1466    4   57  554  100   11  308    6  226   47    3
 2231   11    8  214    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [21]:
from tensorflow import keras

In [22]:
model = keras.models.Sequential()
model.add(keras.layers.Embedding(output_dim = 32, input_dim = 4000, input_length = 400))
# No platten layer for RNN or LSTM
#  model.add(keras.layers.SimpleRNN(units =16)) 
# or
model.add(keras.layers.Bidirectional(keras.layers.LSTM(units = 8)))

# Bidirectional LSTM

model.add(keras.layers.Dense(units = 32, activation = "relu"))
model.add(keras.layers.Dropout(0.3))
model.add(keras.layers.Dense(units = 2, activation = "Softmax"))

In [23]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 400, 32)           128000    
                                                                 
 bidirectional (Bidirectiona  (None, 16)               2624      
 l)                                                              
                                                                 
 dense (Dense)               (None, 32)                544       
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 2)                 66        
                                                                 
Total params: 131,234
Trainable params: 131,234
Non-trainable params: 0
__________________________________________________

In [24]:
model.compile(optimizer='adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

In [25]:
y_train = np.array(train_labels)
y_test = np.array(test_labels)

In [26]:
history=model.fit(x_train,y_train,
                 validation_split=0.2,
                 epochs=10,
                 batch_size=128,
                 verbose=1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
test_loss, test_acc = model.evaluate(x_test,y_test)




In [28]:
print("Test accuracy:", test_acc)

Test accuracy: 0.8387600183486938


In [29]:
predictions = model.predict(x_test)
predictions[0]



array([0.97746336, 0.02253672], dtype=float32)

In [30]:
sentiment_dict = {0:'pos',1:'neg'}

def display_test_sentiment(i):
    print(test_texts[i])
    print("label values:", sentiment_dict[np.argmax(y_test[i])], "predict value:", sentiment_dict[np.argmax(predictions[i])])

In [31]:
display_test_sentiment(0)


I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge.
label values: pos predict value: pos


In [34]:
any_review = "Phase 4's MCU comes to a close with Black Panther: Wakanda Forever. It's been a truly inconsistent run of films. There was the good (Shang-Chi, Spider-Man, some of Dr. Strange 2), the eh (other parts of Dr. Strange 2, Black Widow and Eternals), and the ugly (Thor 4, which is easily the worst MCU movie so far). If this movie had been bad, it might have been enough to make me bail on keeping up with the MCU's movies, and truth be told, I've already bailed on the Disney+ series', because WandaVision, Falcon & Winter Soldier, and Loki really didn't do it for me. If you're expecting Black Panther 2 to wrap up Phase 4, you might be disappointed. But then again, there hasn't been a consistent narrative to the last two years of Marvel releases, so there's really nothing to conclude, overall. Importantly, it's just a very good movie, and that's enough to ensure it ends Phase 4 on a high. It's not a total slam-dunk, but there's a great deal about Black Panther 2 that's extremely effective, and enough great stuff that I feel the overall movie is a strong one. I think the pacing is its greatest strength. I really didn't feel the 2 hour 40 minute runtime. The opening half-hour doesn't flow the best, but there was a need to adjust after Chadwick Boseman's passing meant T'Challa died, too. Once the movie establishes this and sets up its main plot, it's pretty smooth-sailing, and unlike many Marvel movies, it saves its most emotional moments and best action scenes for its excellent final hour. It's a very emotional movie, and the way it pays respect to Boseman and T'Challa is essentially perfect. With good action, a solid story, a very good new antagonist (Namor and the whole underworld were great), and fantastic music (maybe the best Marvel score so far?), this was a very good MCU movie, and gives me a little more hope for the franchise going forward."
def display_text_sentiment(any_review):
    input_seq = token.texts_to_sequences([any_review])
    pad_input_seq = tf.keras.preprocessing.sequence.pad_sequences(input_seq, padding = 'post', truncating='post', maxlen=400)

    pred = model.predict(pad_input_seq)
    print(any_review)
    print("predict value:", sentiment_dict[np.argmax(pred)])

In [35]:
display_text_sentiment(any_review)

Phase 4's MCU comes to a close with Black Panther: Wakanda Forever. It's been a truly inconsistent run of films. There was the good (Shang-Chi, Spider-Man, some of Dr. Strange 2), the eh (other parts of Dr. Strange 2, Black Widow and Eternals), and the ugly (Thor 4, which is easily the worst MCU movie so far). If this movie had been bad, it might have been enough to make me bail on keeping up with the MCU's movies, and truth be told, I've already bailed on the Disney+ series', because WandaVision, Falcon & Winter Soldier, and Loki really didn't do it for me. If you're expecting Black Panther 2 to wrap up Phase 4, you might be disappointed. But then again, there hasn't been a consistent narrative to the last two years of Marvel releases, so there's really nothing to conclude, overall. Importantly, it's just a very good movie, and that's enough to ensure it ends Phase 4 on a high. It's not a total slam-dunk, but there's a great deal about Black Panther 2 that's extremely effective, and e