#### Joongyeon Steven Cho
#### Text Categorization using Keras with Tensorflow

In [1]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [2]:
train_file_name = input("Name of labeled list of training file: ")

train_list = open(train_file_name).read().splitlines()


In [3]:
stopwords = ["ourselves", "hers", "between", "yourself", "but", "again", "there", "about", "once", "during", "out", "very", "having", "with", "they", "own", "an", "be", "some", "for", "do", "its", "yours", "such", "into", "of", "most", "itself", "other", "off", "is", "s", "am", "or", "who", "as", "from", "him", "each", "the", "themselves", "until", "below", "are", "we", "these", "your", "his", "through", "don", "nor", "me", "were", "her", "more", "himself", "this", "down", "should", "our", "their", "while",
             "above", "both", "up", "to", "ours", "had", "she", "all", "no", "when", "at", "any", "before", "them", "same", "and", "been", "have", "in", "will", "on", "does", "yourselves", "then", "that", "because", "what", "over", "why", "so", "can", "did", "not", "now", "under", "he", "you", "herself", "has", "just", "where", "too", "only", "myself", "which", "those", "i", "after", "few", "whom", "t", "being", "if", "theirs", "my", "against", "a", "by", "doing", "it", "how", "further", "was", "here", "than"]


In [4]:
categories = []
train_articles = []

REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^a-z #+_]')

for line in train_list:
    path = line.split(' ')[0]
    category = line.split(' ')[1]
    article = open(path).read()
    
    article = article.lower()
    article = REPLACE_BY_SPACE_RE.sub(' ', article)

    article = BAD_SYMBOLS_RE.sub('', article)
    # print(article)

    for word in stopwords:
        token = " " + word + " "
        article = article.replace(token, ' ')

    categories.append(category)
    train_articles.append(article)




In [5]:
# Parameters

numlen = 256
dict_size = 5000
train_ratio = 0.90
embedding_dim = 64


In [6]:
#training
train_size = int(len(train_articles)*train_ratio)
training_articles = train_articles[0:train_size]
train_categories = categories[0:train_size]

tuning_articles = train_articles[train_size:]
tuning_categories = categories[train_size:]

tokenizer = Tokenizer(num_words=dict_size, oov_token="OOV")
tokenizer.fit_on_texts(train_articles)
word_index = tokenizer.word_index
print(len(word_index))

24010


In [7]:
train_sequences = tokenizer.texts_to_sequences(training_articles)
train_padded = pad_sequences(
    train_sequences, maxlen=numlen, padding="post",truncating="post")

tuning_sequences = tokenizer.texts_to_sequences(tuning_articles)
tuning_padded = pad_sequences(
    tuning_sequences, maxlen=numlen,padding="post", truncating="post")  
    
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(categories)

print(label_tokenizer.word_index)   

{'str': 1, 'pol': 2, 'dis': 3, 'cri': 4, 'oth': 5}


In [8]:

model = tf.keras.Sequential([tf.keras.layers.Embedding(dict_size, embedding_dim),
                             tf.keras.layers.Bidirectional(
    tf.keras.layers.LSTM(embedding_dim)),
    tf.keras.layers.Dense(embedding_dim, activation = 'relu'),
    tf.keras.layers.Dropout(0.6),
    tf.keras.layers.Dense(6, activation='softmax')])

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam', metrics=['accuracy'])


num_epochs = 30
training_labels_seq = np.array(
    label_tokenizer.texts_to_sequences(train_categories))
tuning_labels_seq = np.array(
    label_tokenizer.texts_to_sequences(tuning_categories))

model.fit(train_padded, training_labels_seq, epochs=num_epochs,
          validation_data=(tuning_padded, tuning_labels_seq), verbose=2)



Epoch 1/30
25/25 - 6s - loss: 1.7000 - accuracy: 0.3053 - val_loss: 1.4600 - val_accuracy: 0.3483 - 6s/epoch - 233ms/step
Epoch 2/30
25/25 - 3s - loss: 1.5590 - accuracy: 0.3103 - val_loss: 1.3637 - val_accuracy: 0.3483 - 3s/epoch - 118ms/step
Epoch 3/30
25/25 - 3s - loss: 1.3056 - accuracy: 0.4070 - val_loss: 1.0912 - val_accuracy: 0.5393 - 3s/epoch - 110ms/step
Epoch 4/30
25/25 - 3s - loss: 1.0835 - accuracy: 0.5126 - val_loss: 0.9339 - val_accuracy: 0.6292 - 3s/epoch - 114ms/step
Epoch 5/30
25/25 - 3s - loss: 0.9205 - accuracy: 0.5766 - val_loss: 0.7535 - val_accuracy: 0.7303 - 3s/epoch - 120ms/step
Epoch 6/30
25/25 - 3s - loss: 0.7280 - accuracy: 0.6947 - val_loss: 0.5733 - val_accuracy: 0.7865 - 3s/epoch - 114ms/step
Epoch 7/30
25/25 - 3s - loss: 0.4894 - accuracy: 0.8116 - val_loss: 0.4845 - val_accuracy: 0.8989 - 3s/epoch - 116ms/step
Epoch 8/30
25/25 - 3s - loss: 0.3675 - accuracy: 0.8907 - val_loss: 0.3938 - val_accuracy: 0.8876 - 3s/epoch - 122ms/step
Epoch 9/30
25/25 - 3s - 

<keras.callbacks.History at 0x1fb6a4e8fa0>

In [9]:
test_file_name = input("Name of unlabeled list of testing file: ")

test_list = open(test_file_name).read().splitlines()

test_articles = []

for line in test_list:
    article = open(line).read()
    article = article.lower()
    article = REPLACE_BY_SPACE_RE.sub(' ', article)
    article = BAD_SYMBOLS_RE.sub('', article)
    for word in stopwords:
        token = " " + word + " "
        article = article.replace(token, ' ')
    test_articles.append(article)



In [10]:


test_sequences = tokenizer.texts_to_sequences(test_articles)
test_padded = pad_sequences(
    test_sequences, maxlen=numlen, padding="post", truncating="post")
choice = model.predict(test_padded)

predictions = []
inv_map = {value: key for key, value in label_tokenizer.word_index.items()}

for idx in range (0, len(choice)):
    index = np.argmax(choice[idx])
    predictions.append(inv_map[index])


In [12]:
output_name = input(
    "Enter name of output file: ")

output_file = open(output_name, 'w')


test_file = open(test_file_name).read().splitlines()

i = 0
for line in test_file:
        line = line.strip('\n')
        category = predictions[i]
        output_line = line + ' ' + category.capitalize() + '\n'
        output_file.write(output_line)

        i += 1

output_file.close()
