# PD Data

In [8]:
from typing import List
import numpy as np
import pandas as pd

def MakeDF(colNames: List, df: pd.DataFrame):
    """Return a new DataFrame holding only the requested columns of ``df``.

    Args:
        colNames: Column labels to extract, in the order they should appear.
        df: Source frame to select the columns from.

    Returns:
        A copy of ``df`` restricted to ``colNames``. The row index of ``df``
        is kept, including when ``colNames`` is empty.

    Raises:
        KeyError: If a label in ``colNames`` is not a column of ``df``.
    """
    # A single list-based selection replaces the concat-per-column loop, which
    # re-copied the accumulated frame on every iteration. ``.copy()`` makes
    # sure later edits to the result never write through to ``df``.
    return df[list(colNames)].copy()

# Column names for the LIAR .tsv files; the same layout is used for both splits.
LIAR_COLUMNS = ['ID', 'label', 'statement', 'subject', 'speaker', 'spkrJobTitle', 'state', 'party', 'barelyTrueCounts', 'falseCounts', 'halfTrueCounts', 'mostlyTrueCounts', 'pantsOnFireCounts', 'context']

# header=None + names=...: the column names are supplied here instead of being
# read from the file. Reading with the default header and then overwriting
# `.columns` threw away the first line of each file, which is a data record
# when the file has no header row.
# NOTE(review): LIAR's .tsv files are distributed without a header row — confirm
# against the local copies in liar_dataset/.
train = pd.read_csv('liar_dataset/train.tsv', sep='\t', header=None, names=LIAR_COLUMNS)
test = pd.read_csv('liar_dataset/test.tsv', sep='\t', header=None, names=LIAR_COLUMNS)

# Per-speaker count columns, pulled out into their own frames for each split.
speakerLieCountHeaders = ['barelyTrueCounts', 'falseCounts', 'halfTrueCounts', 'mostlyTrueCounts', 'pantsOnFireCounts']
trainSpeakerLieCounts = MakeDF(speakerLieCountHeaders, train)
testSpeakerLieCounts = MakeDF(speakerLieCountHeaders, test)

# Word Embedding


In [9]:
import keras
import tensorflow
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical # This is now fully integrated into tensorflow
from keras.initializers import Constant
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM
from keras.models import Model, Sequential
from keras import layers

# Target length handed to pad_sequences (maxlen) and to the pretrained
# Embedding layer (input_length).
MAX_SEQUENCE_LENGTH = 1000
# Vocabulary cap handed to the Tokenizer (num_words); also used as the input
# size of the Embedding layers in this notebook.
MAX_NUM_WORDS = 20000
# Number of columns in the embedding matrix; matches the 100-d GloVe file
# ('glove.6B.100d.txt') read in this notebook.
EMBEDDING_DIM = 100

def prepare_targets(y_train, y_test):
    """Integer-encode both label sets with one encoder fitted on ``y_train``.

    Args:
        y_train: Training labels; the encoder's classes are learned from these.
        y_test: Test labels, transformed with the same fitted encoder.

    Returns:
        Tuple ``(y_train_enc, y_test_enc)`` of the transformed labels.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

# Build the vocabulary from the training statements only, then map both splits
# through that same vocabulary.
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train['statement'])
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(frame['statement']) for frame in (train, test)
)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Make Data
# Convert the token sequences into fixed-size arrays for the neural network:
# each one is zero-padded up to MAX_SEQUENCE_LENGTH (1000, as set earlier).
train_data, test_data = (
    pad_sequences(seqs, maxlen=MAX_SEQUENCE_LENGTH)
    for seqs in (train_sequences, test_sequences)
)

# Make Keras Labels
NUM_CLASSES = 6
y_train_enc, y_test_enc = prepare_targets(train['label'], test['label'])
y_train, y_test = (
    to_categorical(encoded, NUM_CLASSES) for encoded in (y_train_enc, y_test_enc)
)

# Parse the GloVe text file into {word: vector}. Each line is split on
# whitespace: the first field is the word, the rest are its coefficients.
embeddings_index = {}
# Explicit UTF-8 so reading does not depend on the platform's default encoding
# (the GloVe vocabulary is not limited to ASCII).
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Allocate the matrix once, after the file has been read. These two lines used
# to sit inside the loop above, re-creating the zero matrix for every GloVe
# line. The +1 leaves room for row 0, which no word index maps to (Keras
# Tokenizer indices start at 1).
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    # Skip indices that fall outside the matrix (only possible when the
    # vocabulary is larger than MAX_NUM_WORDS).
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from GloVe keep their all-zero row.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# input_dim must equal the number of rows in embedding_matrix; passing
# MAX_NUM_WORDS here made the Constant initializer's shape disagree with the
# layer's weight shape whenever the vocabulary is smaller than the cap.
embedding_layer = Embedding(num_words, EMBEDDING_DIM, embeddings_initializer=Constant(embedding_matrix), input_length=MAX_SEQUENCE_LENGTH, trainable=False)
print("Preparing of embedding matrix is done")


Found 12423 unique tokens.
Preparing of embedding matrix is done


In [None]:
from keras.models import Model
from keras.layers import Input, Dense, concatenate
from keras.utils.vis_utils import plot_model

# Two-input functional model: a dense branch and an embedding branch whose
# outputs are concatenated and fed to a shared output layer.
left_branch_input = Input(shape=(2,), name='Left_input')
left_branch_output = Dense(5, activation='relu')(left_branch_input)

right_branch_input = Input(shape=(2,), name='Right_input')
# Keep the Input tensor under its own name: rebinding `right_branch_input` to
# the Embedding output meant Model(inputs=...) below was handed an
# intermediate tensor instead of an Input.
right_branch_embedded = Embedding(MAX_NUM_WORDS, 128)(right_branch_input)
# The Embedding output has one more axis than the left branch; flatten it so
# both branches are rank 2 and can be concatenated.
right_branch_flat = layers.Flatten()(right_branch_embedded)
right_branch_output = Dense(5, activation='relu')(right_branch_flat)

concat = concatenate([left_branch_output, right_branch_output], name='Concatenate')
# NOTE(review): 3 sigmoid units with binary_crossentropy does not match the
# 6-column one-hot labels (NUM_CLASSES) built earlier, and the (2,) input
# shapes look like placeholders — confirm before fitting on the LIAR data.
final_model_output = Dense(3, activation='sigmoid')(concat)
final_model = Model(inputs=[left_branch_input, right_branch_input], outputs=final_model_output,
                    name='Final_output')
final_model.compile(optimizer='adam', loss='binary_crossentropy')

In [13]:
# Scratch check: constructs a stand-alone Embedding layer so the cell echoes
# its repr; the layer is not assigned to a name or attached to any model.
Embedding(MAX_NUM_WORDS, 128)


<keras.layers.embeddings.Embedding at 0x184891e82e0>

In [None]:
print("Defining and training an LSTM model, training embedding layer on the fly")
# Embedding -> LSTM -> Dense stack. The Embedding layer gets no pretrained
# initializer here, so its weights are learned together with the rest.
# NOTE(review): the output layer has 4 units while the labels built earlier
# are one-hot over NUM_CLASSES = 6 — confirm the intended output size.
rnnmodel = Sequential([
    Embedding(MAX_NUM_WORDS, 128),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(4, activation='sigmoid'),
])


Defining and training an LSTM model, training embedding layer on the fly
