# LSTM

In [1]:
import pandas as pd
import numpy as np
import csv, datetime, time, json
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import string

import re
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from tensorflow import keras
from zipfile import ZipFile
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.models import Model
from keras import Sequential
from keras.layers import Input, TimeDistributed, Dense, Lambda, concatenate, Dropout, BatchNormalization, LSTM, Concatenate, Embedding, Bidirectional
from keras.utils import to_categorical
# from keras.utils.vis_utils import plot_model
from keras.regularizers import l2
from keras import backend as K
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [2]:
SEED = 315
NUM_EPOCHS = 10 
BATCH_SIZE = 32
DROPOUT = 0.1
TEST_PORTION = 0.1
VALIDATION = 0.1
LSTM_DIM = 64
LSTM_REGULARIZATION = 0.0001
LSTM_DROPOUT = 0.1

MAX_N_WORDS = 30000
EMBEDDING_DIM = 300
MAX_SEQUENCE_LEN = 1000

OPTIMIZER = 'adam'


In [3]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

nltk_stopwords = set(stopwords.words('english'))
nltk_stopwords.remove('no')
nltk_stopwords.remove('not')

[nltk_data] Downloading package punkt to /Users/vincent/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/vincent/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vincent/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/vincent/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/vincent/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [4]:
# Pre-Processing
porter_stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()
punctuation_translator = str.maketrans('', '', string.punctuation)
def preprocess_text(s, lowercase=True, remove_stopwords=True, remove_punctuation=True, stemmer=None, lemmatizer=None):
    tokens = word_tokenize(s)

    if lemmatizer is not None:
        tokens = lemmatize_tokens(lemmatizer, tokens)
    elif stemmer is not None:
        tokens = stem_tokens(stemmer, tokens)

    if lowercase:
        tokens = [token.lower() for token in tokens]
    
    if remove_stopwords:
        tokens = [token for token in tokens if not token in nltk_stopwords]
    
    # Remove all punctuation marks if needed (note: also converts, e.g, "Mr." to "Mr")
    if remove_punctuation:
        tokens = [ ''.join(c for c in s if c not in string.punctuation) for s in tokens ]
        tokens = [ token for token in tokens if len(token) > 0 ] # Remove "empty" tokens

    # if (len(tokens) == 0):
    #     print('len = 0: '+ s)

    return ' '.join(tokens)

def remove_punctuation(s):
    return s.translate(punctuation_translator)

def lemmatize_tokens(lemmatizer, tokens):
    pos_tag_list = pos_tag(tokens)
    for idx, (token, tag) in enumerate(pos_tag_list):
        tag_simple = tag[0].lower() # Converts, e.g., "VBD" to "c"
        if tag_simple in ['n', 'v', 'j']:
            word_type = tag_simple.replace('j', 'a') 
        else:
            word_type = 'n'
        lemmatized_token = lemmatizer.lemmatize(token, pos=word_type)
        tokens[idx] = lemmatized_token
    return tokens

def stem_tokens(stemmer, tokens):
    for idx, token in enumerate(tokens):
        tokens[idx] = stemmer.stem(token)
    return tokens

In [8]:
train_file = 'Data/twitter_MBTI.csv'
labels = {
          'infp':0,
          'infj':1,
          'intp':2,
          'intj':3,
          'istp':4,
          'istj':5,
          'isfj':6,
          'isfp':7,
          'enfp':8,
          'entp':9,
          'enfj':10,
          'entj':11,
          'estp':12,
          'estj':13,
          'esfj':14,
          'esfp':15,
          }

with open(train_file, "r") as f:
    data = f.readlines()
data = pd.read_csv(train_file)
content = data.iloc[:,1].dropna("")

# X = [ preprocess_text(x, remove_stopwords=True, remove_punctuation=True, lemmatizer=None) for x in content ]
# Y = data.iloc[:,2]



  content = data.iloc[:,1].dropna("")


In [9]:
train_data, test_and_valid_data = train_test_split(data, test_size=0.2, random_state=42)
valid_data, test_data = train_test_split(test_and_valid_data, test_size=0.5, random_state=42)

content = train_data.iloc[:,1].dropna()
X_train = [ preprocess_text(x, remove_stopwords=True, remove_punctuation=True, lemmatizer=None) for x in content ]
Y_train = train_data.iloc[:,2]

test_content = test_data.iloc[:,1].dropna()
X_test = [ preprocess_text(x, remove_stopwords=True, remove_punctuation=True, lemmatizer=None) for x in test_content ]
Y_test = train_data.iloc[:,2]


In [10]:
X_df = pd.DataFrame(X_train)
# print(X_df.tail())
Y_df = pd.DataFrame(Y_train)

X_df.reset_index(drop=True, inplace=True)
# print(X_df.head())
Y_df.reset_index(drop=True, inplace=True)
data_df = pd.concat([X_df, Y_df], axis = 1)
data_df.to_csv("Data/preprocessed_train.csv", index = False)
data_saved = pd.read_csv("Data/preprocessed_train.csv")
X_df = data_saved.iloc[:,0].fillna("").tolist()
Y_df = data_saved.iloc[:,1].fillna("").tolist()
X_train = X_df
y_train = [ labels[y] for y in Y_df]

In [11]:
X_df = pd.DataFrame(X_test)
# print(X_df.tail())
Y_df = pd.DataFrame(Y_test)

X_df.reset_index(drop=True, inplace=True)
# print(X_df.head())
Y_df.reset_index(drop=True, inplace=True)
data_df = pd.concat([X_df, Y_df], axis = 1)
data_df.to_csv("Data/preprocessed_test.csv", index = False)
data_saved = pd.read_csv("Data/preprocessed_test.csv")
X_df = data_saved.iloc[:,0].fillna("").tolist()
Y_df = data_saved.iloc[:,1].fillna("").tolist()
X_test = X_df
y_test = [ labels[y] for y in Y_df]
y_train = to_categorical(y_train, 16)

In [12]:
tokenizer = Tokenizer(num_words=MAX_N_WORDS)

tokenizer.fit_on_texts(X_train)

#Training Set
train_idx_seq = tokenizer.texts_to_sequences(X_train)

#Testing Set
test_idx_seq = tokenizer.texts_to_sequences(X_test)

word_idx = tokenizer.word_index
print("Total words from training corpus: %d" % len(word_idx))

Total words from training corpus: 902634


In [13]:
# Saving word to idx dictionary
word2idx = pd.DataFrame([word_idx])

#save dataframe to csv file
word2idx.to_csv("Data/word2idx.csv", index=False)

#validate the csv file by importing it
word_idx_saved = pd.read_csv("Data/word2idx.csv").to_dict('r')[0]

  word_idx_saved = pd.read_csv("Data/word2idx.csv").to_dict('r')[0]


In [14]:
# Getting vectors for each word, since in the glove file each line is word followed by values for each dimension
# of the embedding vector, separated by space
embedding_dict = {}
with open('./glove.6B.300d.txt','r') as f:
  for line in f:
    values = line.split(' ')
    word = values[0]
    embedding = np.asarray(values[1:],'float32')
    embedding_dict[word] = embedding
f.close()

In [15]:
print('Number of glove word embeddings: %d' % len(embedding_dict))
vocab_size = min(MAX_N_WORDS, len(word_idx)) # Only keep up to MAX_N_WORDS unique words
word_embedding_matrix = np.zeros((vocab_size + 1, EMBEDDING_DIM)) # +1 reserved for padding
for word, index in word_idx.items():
  if index > MAX_N_WORDS: # word is not top MAX_N_WORDS frequent
    continue
  embedding = embedding_dict.get(word)
  if embedding is not None:
    word_embedding_matrix[index] = embedding
print('Total word embeddings: %d' % (len(word_embedding_matrix) - 1))
print('Null word embeddings: %d' % np.sum(np.sum(word_embedding_matrix, axis=1) == 0))
np.savetxt("Data/glove_word_embedding.csv", word_embedding_matrix, delimiter=",")

Number of glove word embeddings: 400000
Total word embeddings: 30000
Null word embeddings: 12343


In [16]:
# Pad the input so that it is valid input to the model

train = pad_sequences(train_idx_seq, maxlen=MAX_SEQUENCE_LEN)

test =  pad_sequences(test_idx_seq, maxlen=MAX_SEQUENCE_LEN)

print('Dimensions of train:', train.shape)

print('Dimensions of test:', test.shape)

print('Number of training labels', y_train.shape)

Dimensions of train: (6248, 1000)
Dimensions of test: (6248, 1000)
Number of training labels (6248, 16)


In [17]:
# Initializing Keras tensor
train_tensor = Input(shape=(MAX_SEQUENCE_LEN,))

test_tensor = Input(shape=(MAX_SEQUENCE_LEN,)) 

common_embed = Embedding(vocab_size + 1, # 0 is reserved for padding
               EMBEDDING_DIM,
               weights=[word_embedding_matrix],
               input_length=MAX_SEQUENCE_LEN, # Maximum 50 words (index) in a input sequence
               trainable=False)
common_lstm = LSTM(LSTM_DIM)

train_layer = common_embed(train_tensor)
train_layer = common_lstm(train_layer)

inputs = Dense(128, activation='relu')(train_layer)     #First
inputs = Dropout(DROPOUT)(inputs)
inputs = BatchNormalization()(inputs)
inputs = Dense(64, activation='relu')(inputs)     #Second
inputs = Dropout(DROPOUT)(inputs)
inputs = BatchNormalization()(inputs)
inputs = Dense(32, activation='relu')(inputs)     #Fourth
inputs = Dropout(DROPOUT)(inputs)
inputs = BatchNormalization()(inputs)

final_output = Dense(16, activation='softmax')(inputs)

model = Model(inputs=train_tensor, outputs=final_output)
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER, metrics=['accuracy'])
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1000)]            0         
                                                                 
 embedding (Embedding)       (None, 1000, 300)         9000300   
                                                                 
 lstm (LSTM)                 (None, 64)                93440     
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                             

In [18]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [ModelCheckpoint("baseline.h5", monitor='val_accuracy', save_best_only=True),
             EarlyStopping(monitor='val_accuracy', patience=5, verbose=1, mode='auto')]
history = model.fit(train, 
                    y_train, 
                    epochs=10,
                    validation_split=VALIDATION,
                    verbose=1,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)

t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2024-04-18 20:26:46.833097
Epoch 1/10
Epoch 2/10


  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 7: early stopping
Training ended at 2024-04-18 20:33:45.228498
Minutes elapsed: 6.973254


In [19]:
model.load_weights("baseline.h5")

predicted = model.predict(test, verbose=0)
predicted = np.argmax(predicted, axis=1)


In [20]:
id2labels = {
          0:'infp',
          1:'infj',
          2:'intp',
          3:'intj',
          4: 'istp',
          5: 'istj',
          6:'isfj',
          7:'isfp',
          8:'enfp',
          9:'entp',
          10:'enfj',
          11:'entj',
          12:'estp',
          13:'estj',
          14:'esfj',
          15:'esfp'
          }
revised_predicted = [id2labels[int(i)] for i in predicted]
acc = sum(revised_predicted == Y_test) / len(Y_test)
print("Acc = ", acc)

Acc =  0.15925096030729832
