In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras import Sequential

In [3]:
# NOTE: absolute, machine-specific location; adjust to wherever train.csv lives locally.
train_csv_path = 'C:/Users/kok19/Downloads/train.csv'
train_df = pd.read_csv(train_csv_path)

In [4]:
# Preview the first rows of the training frame to check that the columns loaded as expected.
train_df.head()

Unnamed: 0,headline,is_sarcastic
0,olympic torch used to ignite tibetan protesters,1
1,this 594-foot-high basketball shot 'for mankin...,0
2,"dr. oz, mel gibson, & congress called out usin...",0
3,excited juror feels like murder trial being pu...,1
4,man has mixed feelings about $39 flight,1


In [5]:
# (rows, columns) of the training frame.
train_df.shape

(24038, 2)

In [7]:
# Separate the model inputs (headline text) from the targets (is_sarcastic label).
# Copies are taken so later changes cannot write back into train_df.
x_train = train_df.loc[:, 'headline'].copy()
y_train = train_df.loc[:, 'is_sarcastic'].copy()

In [8]:
# Convert every headline into a sequence of integer word ids, then bring all
# sequences to a common length of 80 (padding is appended at the end).
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(x_train.tolist())
tokenized_x_train = tokenizer.texts_to_sequences(x_train)
padded = pad_sequences(
    tokenized_x_train,
    maxlen=80,
    padding='post',
)

In [10]:
# Load the pre-trained GloVe to embedding matrix

# Step 1 - Extract information in GloVe to a python dict
# Each non-empty line is parsed as "<word> <v1> <v2> ... <vN>" (whitespace separated).
GLOVE_PATH = './machine_learning/glove.6B.50d.txt'
GLOVE_DIM = 50  # number of vector components expected on every line
dict_glove = {}

with open(GLOVE_PATH, "r", encoding="utf8") as glove_file:
    for line in glove_file:
        tokens = line.split()
        if not tokens:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue

        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)

        if vector.shape[0] == GLOVE_DIM:
            dict_glove[word] = vector
        else:
            # Report, but do not store, entries whose vector has the wrong length.
            print("There is a problem with " + word)

In [11]:
# Report how many word vectors were read from the GloVe file.
glove_entry_count = len(dict_glove)
print("Dictionary size: ", glove_entry_count)

Dictionary size:  400000


In [12]:
# Step 2 - Embedding matrix configurations
# One row per entry of the tokenizer's word_index plus one extra row; rows start
# as zeros and are filled from GloVe afterwards.
# NOTE(review): the tokenizer was created with num_words=max_features, so rows at
# or beyond max_features are presumably never looked up - confirm before shrinking.
embedding_dim = 50
vocab_size = 1 + len(tokenizer.word_index)
embedding_matrix = np.zeros(shape=(vocab_size, embedding_dim))

In [13]:
# Sanity check: should be (vocab_size, embedding_dim).
embedding_matrix.shape

(28131, 50)

In [14]:
# Step 3 - Transfer information from python dict to embedding matrix
# Words with a GloVe vector get that vector written into their row; words without
# one keep their all-zero row and are collected in unk_set.
unk_set = set()

for word, token_id in tokenizer.word_index.items():
    glove_vector = dict_glove.get(word)
    if glove_vector is None:
        unk_set.add(word)
        continue
    embedding_matrix[token_id] = glove_vector

# word_index keys are unique, so the set size equals the number of misses.
unk_count = len(unk_set)

In [15]:
# Number of tokenizer words that had no entry in dict_glove.
print("Total unknown words: ", unk_count)

Total unknown words:  5449


In [17]:
# Hyperparameters: units per LSTM layer and the mini-batch size used for training.
rnn_units, BATCH_SIZE = 64, 8

In [24]:
def build_model_lstm(vocab_size, embedding_dim, rnn_units, batch_size, train_emb=False,
                     pretrained_embeddings=None):
    """Build a two-layer stacked LSTM binary classifier over pre-trained embeddings.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Length of each embedding vector.
        rnn_units: Number of units in each of the two LSTM layers.
        batch_size: Unused by the model; kept so existing callers keep working.
        train_emb: If True, the embedding weights are updated during training;
            otherwise they stay frozen at their initial values.
        pretrained_embeddings: Optional array of shape (vocab_size, embedding_dim)
            used to initialise the embedding layer. When omitted, the notebook-level
            ``embedding_matrix`` is used, which matches the previous behaviour.

    Returns:
        An uncompiled ``Sequential`` model ending in a single sigmoid unit.

    Raises:
        ValueError: If the embedding array does not have shape
            (vocab_size, embedding_dim).
    """
    if pretrained_embeddings is None:
        # Backward-compatible fallback to the notebook-global matrix.
        pretrained_embeddings = embedding_matrix

    expected_shape = (vocab_size, embedding_dim)
    actual_shape = tuple(np.shape(pretrained_embeddings))
    if actual_shape != expected_shape:
        raise ValueError(
            "pretrained embeddings have shape {} but {} was expected".format(
                actual_shape, expected_shape))

    model = Sequential([
        # A constant initializer is used instead of the legacy `weights=[...]`
        # constructor argument, which newer Keras releases no longer accept.
        Embedding(vocab_size, embedding_dim, mask_zero=True,
                  embeddings_initializer=tf.keras.initializers.Constant(pretrained_embeddings),
                  trainable=train_emb),
        LSTM(rnn_units, return_sequences=True, dropout=0.5),
        Dropout(0.1),
        LSTM(rnn_units, dropout=0.25),
        Dropout(0.1),
        Dense(1, activation="sigmoid")
    ])
    return model

In [25]:
# Instantiate the classifier with the notebook's vocabulary and hyperparameters.
model_lstm = build_model_lstm(
    vocab_size,
    embedding_dim,
    rnn_units,
    batch_size=BATCH_SIZE,
)

In [26]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          1406550   
_________________________________________________________________
lstm_2 (LSTM)                (None, None, 64)          29440     
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 64)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,469,079
Trainable params: 62,529
Non-trainable params: 1,406,550
_______________________________________

In [27]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model_lstm.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [28]:
# Train for 3 epochs, holding out 10% of the training data for validation.
model_lstm.fit(
    x=padded,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=3,
    validation_split=0.1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x15071381a90>

In [29]:
# NOTE: absolute, machine-specific location; adjust to wherever test.csv lives locally.
test_csv_path = 'C:/Users/kok19/Downloads/test.csv'
test_df = pd.read_csv(test_csv_path)

In [30]:
# Separate the held-out inputs (headline text) from their labels (is_sarcastic).
x_test = test_df.loc[:, 'headline'].copy()
y_test = test_df.loc[:, 'is_sarcastic'].copy()

In [31]:
# Reuse the tokenizer fitted on the training headlines (it is not refitted here)
# and pad to the same length of 80 that was used for the training sequences.
tokenized_x_test = tokenizer.texts_to_sequences(x_test)
padded_test = pad_sequences(
    tokenized_x_test,
    maxlen=80,
    padding='post',
)

In [32]:
# Score the trained model on the held-out test set; the result lists the loss
# followed by the compiled metrics.
model_lstm.evaluate(x=padded_test, y=y_test)



[0.38416826725006104, 0.8300262093544006]