In [1]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import bz2
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences


In [2]:
def splitReviewsLabels(lines):
    """Turn raw labelled lines into parallel lists of texts and labels.

    Each line is passed through ``reviewToX`` (text clean-up) and
    ``reviewToY`` (label encoding). The cleaned text is cut to at most
    512 characters before being stored.

    Args:
        lines: iterable of already-decoded ``str`` lines.

    Returns:
        A ``(reviews, labels)`` tuple of two lists with one entry per line.
    """
    reviews, labels = [], []
    for line in tqdm(lines):
        # Keep only the first 512 characters of the cleaned review text.
        reviews.append(reviewToX(line)[:512])
        labels.append(reviewToY(line))
    return reviews, labels

In [3]:
def reviewToY(review):
    """Encode the label prefix of a line as a two-element one-hot list.

    The text before the first space is the label token. ``'__label__1'``
    maps to ``[1, 0]`` (negative); any other token maps to ``[0, 1]``
    (positive). The pair reads as [neg probability, pos probability].
    """
    label_token, _, _ = review.partition(' ')
    if label_token == '__label__1':
        return [1, 0]
    return [0, 1]

In [4]:
def reviewToX(review):
    """Extract and normalise the review text from one ``<label> <text>`` line.

    Steps:
      1. Drop the label token (everything up to the first space).
      2. Remove a single trailing newline, if there is one.
      3. Lower-case the text.
      4. Replace every digit with ``0``.
      5. If the text looks like it contains a link, replace each
         space-delimited run ending in ``.xxx`` (dot plus three lowercase
         letters) with the placeholder ``<url>``.

    Args:
        review: one decoded line, e.g. ``'__label__2 Title: body\\n'``.

    Returns:
        The cleaned review text as ``str``.

    Raises:
        IndexError: if ``review`` contains no space (no text after the label).
    """
    review = review.split(' ', 1)[1]
    # Only strip a real line terminator; an unconditional [:-1] would eat the
    # last character of a line that has no trailing newline (e.g. end of file).
    if review.endswith('\n'):
        review = review[:-1]
    review = review.lower()
    # Raw-string patterns avoid the invalid-escape warning that '\d' triggers.
    review = re.sub(r'\d', '0', review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [5]:
# Open the bz2-compressed train/test splits. The handles are read in the next
# cell and released later with `del`; they are never explicitly closed.
train_file = bz2.BZ2File('input/train.ft.txt.bz2')
test_file = bz2.BZ2File('input/test.ft.txt.bz2')

In [6]:
# Read both files fully into memory as lists of lines (decoded to str in a
# later cell).
train_lines = train_file.readlines()
test_lines = test_file.readlines()

In [7]:
# Sanity check: number of lines read from the train and test files.
len(train_lines), len(test_lines)

(3600000, 400000)

In [8]:
# Convert every raw line to str so the text helpers can work on it.
# Not idempotent: re-running this cell on already-decoded lines fails.
train_lines = [raw.decode(encoding='utf-8') for raw in train_lines]
test_lines = [raw.decode(encoding='utf-8') for raw in test_lines]

In [9]:
# Load from the file
reviews_train, y_train = splitReviewsLabels(train_lines)
reviews_test, y_test = splitReviewsLabels(test_lines)

100%|██████████| 3600000/3600000 [00:30<00:00, 116474.33it/s]
100%|██████████| 400000/400000 [00:03<00:00, 125885.13it/s]


In [10]:
# Shuffle texts and labels together so each review stays aligned with its
# label. A fixed random_state makes the ordering (and therefore any
# order-dependent split or preview further down) reproducible across runs.
reviews_train, y_train = shuffle(reviews_train, y_train, random_state=42)
reviews_test, y_test = shuffle(reviews_test, y_test, random_state=42)

In [11]:
# Convert the label lists (one two-element one-hot list per review) into
# NumPy arrays for Keras.
y_train, y_test = np.array(y_train), np.array(y_test)

In [12]:
# Model / preprocessing hyper-parameters.
max_features = 8192  # passed as Tokenizer num_words and as the Embedding input size
maxlen = 128  # length every token sequence is padded/truncated to; also the model input length
embed_size = 64  # Embedding output dimension

In [13]:
# Tokenizer whose vocabulary is limited via num_words=max_features.
# NOTE(review): the Tokenizer's default filters presumably strip '<' and '>',
# which would turn the '<url>' placeholder into 'url' — confirm if that matters.
tokenizer = Tokenizer(num_words=max_features)

In [14]:
# Build the vocabulary from the training reviews only (test text is not used).
tokenizer.fit_on_texts(reviews_train)

In [15]:
# Map each review to a sequence of integer token ids using the fitted tokenizer.
token_train = tokenizer.texts_to_sequences(reviews_train)
token_test = tokenizer.texts_to_sequences(reviews_test)

In [16]:
# Bring every sequence to exactly `maxlen` ids; short sequences are padded at
# the end (padding='post').
# NOTE(review): `truncating` is left at its library default — presumably 'pre',
# i.e. long reviews keep their last `maxlen` tokens; confirm this is intended.
x_train = pad_sequences(token_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(token_test, maxlen=maxlen, padding='post')

In [17]:
# Peek at the first ten cleaned (and shuffled) test reviews.
reviews_test[:10]

["an eighties classic: i was raised on conan and red sonja, so i can't help but highly recommend these movies. no, the acting isn't great, and there are homosexual connotations that i just didn't catch when i was six, but these movies have stood the test of time-- at least in my estimation. wonderful adventures with unforgetable characters, and the swords are just darn cool.",
 'no more drips: i love this travel mug. it never drips or spills. it keeps nmy coffee hot and it looks good too !',
 "it ain't no joke! turnin' fitty: wow! very informative reading that mentally prepares us for the next half cent of living. it is a refresher of life looking back and reiterates in so many words that the reason the windshield is bigger than the rearview mirror is because it is more important where we are going than we we are coming from.definitely a fun read with contributions from many recognized resident experts. get this book before you turn fifty so you can set your gps on the future and make 

In [18]:
import gc

# Everything below has been folded into x_train / x_test / y_train / y_test;
# drop the intermediates and ask the collector to reclaim the memory.
del (train_file, test_file, train_lines, test_lines,
     reviews_train, reviews_test,
     token_train, token_test)
gc.collect()

0

In [19]:
# 1-D convolutional sentiment classifier over padded token-id sequences.
# `inputs` (rather than `input`) avoids shadowing Python's builtin input().
inputs = Input(shape=(maxlen,))
net = Embedding(max_features, embed_size)(inputs)
net = Dropout(0.2)(net)
net = BatchNormalization()(net)

# Stack of same-padded convolutions: one wide kernel (7) followed by three
# narrow ones (3), with batch normalisation between them.
net = Conv1D(32, 7, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
# NOTE(review): a further BatchNormalization used to be created here but was
# assigned to an unused name, so it never became part of the model graph.
# It is dropped to keep the effective architecture unchanged; reinstate it as
# `net = BatchNormalization()(net)` only if it was meant to be applied (that
# would change the architecture relative to what this cell built before).

# A kernel-size-1 convolution yields two scores per position; averaging over
# positions and applying softmax gives the two class probabilities.
net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)
model = Model(inputs=inputs, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Metal device set to: Apple M1

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128)]             0         
                                                                 
 embedding (Embedding)       (None, 128, 64)           524288    
                                                                 
 dropout (Dropout)           (None, 128, 64)           0         
                                                                 
 batch_normalization (BatchN  (None, 128, 64)          256       
 ormalization)                                                   
                                                                 
 conv1d (Conv1D)             (None, 128, 32)           14368     
                                                                 
 batch_normalization_1 (Batc  (None, 128, 32)     

2022-05-25 23:46:04.129691: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-05-25 23:46:04.129813: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [20]:
# Training is disabled here: the following cell loads a previously saved model
# from 'my_model.h5'. Uncomment the fit/save lines below to retrain and
# overwrite that file.
# #model.fit(x_train, y_train, batch_size=2048, epochs=5, validation_split=0.1)
# model.fit(x_train, y_train, batch_size=2048, epochs=1, validation_split=0.1)

# model.save('my_model.h5')

In [21]:
import keras
# Replace the model built above (untrained while the fit call is commented out)
# with the one previously saved to 'my_model.h5'.
model = keras.models.load_model('my_model.h5')

In [22]:
# Loss and accuracy of the loaded model on the held-out test split.
model.evaluate(x=x_test, y=y_test)

    1/12500 [..............................] - ETA: 36:12 - loss: 0.1233 - acc: 0.9688

2022-05-25 23:46:04.454978: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-05-25 23:46:04.524276: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.




[0.18090391159057617, 0.9315975904464722]

In [46]:
# Per-review class scores from the loaded model.
y_predict = model.predict(x_test)
# There should be exactly one prediction row per test label.
print(len(y_predict), len(y_test), sep='\n')

400000
400000


In [47]:
def categorical_accuracy(np_predict, np_test):
    """Print and return the fraction of rows whose arg-max classes agree.

    A row counts as correct when the index of its largest value in
    ``np_predict`` equals the index of the largest value in the matching
    row of ``np_test``.

    Args:
        np_predict: array-like of per-sample score rows.
        np_test: array-like of per-sample reference rows (e.g. one-hot),
            with the same number of rows as ``np_predict``.

    Returns:
        The accuracy as a ``float`` in ``[0, 1]`` (also printed), or ``None``
        after printing ``"Error"`` when the inputs differ in length or are
        empty.
    """
    total = len(np_predict)
    # Mismatched or empty inputs cannot be scored; the empty case would
    # otherwise end in a division by zero.
    if total != len(np_test) or total == 0:
        print("Error")
        return None

    # Flatten each row and take its arg-max in one vectorised pass instead of
    # calling np.argmax once per row in a Python loop.
    predicted_classes = np.asarray(np_predict).reshape(total, -1).argmax(axis=1)
    expected_classes = np.asarray(np_test).reshape(total, -1).argmax(axis=1)

    correct = int(np.count_nonzero(predicted_classes == expected_classes))
    accuracy = correct / total
    print(accuracy)
    return accuracy


# Cross-check the accuracy reported by model.evaluate with a manual arg-max
# comparison. The trailing semicolon stops the notebook from echoing a value.
categorical_accuracy(np_predict=y_predict, np_test=y_test);


0.9315975
