In [1]:
# libraries
import torch
import numpy as np
from keras import optimizers
import matplotlib.pyplot as plt
from keras import initializers
from keras.utils import np_utils
from keras import regularizers
from keras.models import Sequential, Model
from keras.callbacks import ModelCheckpoint
from keras.layers.merge import concatenate
from keras.utils import to_categorical
from keras.layers import Dense, Flatten, Dropout, Input, BatchNormalization, PReLU

# pin this process to a single GPU: only physical device 2 (in PCI bus order) is made visible
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
import threading
os.environ["CUDA_VISIBLE_DEVICES"]="2"

Using TensorFlow backend.


### text embeddings

In [2]:
import pickle

# Load the pre-computed text embeddings, one pickle file per split.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted files.
with open('politifact/finalTrainEmbeddings.pkl', 'rb') as train_pkl:
    trainEmbeddings = pickle.load(train_pkl)

with open('politifact/finalTestEmbeddings.pkl', 'rb') as test_pkl:
    testEmbeddings = pickle.load(test_pkl)

In [3]:
import json

# Article metadata, one JSON file per split.
# Per the original note each entry maps: article text -> (label, articleURL, ImageId).
with open('imageDataset/politi/trainJson.json', 'r') as train_json:
    trainData = json.load(train_json)

with open('imageDataset/politi/testJson.json', 'r') as test_json:
    testData = json.load(test_json)

In [4]:
# Replace every stored item by the mean over axis 1 of its first element.
# presumably this averages token vectors into one sentence vector — TODO confirm tensor layout
for article, sentence_outputs in trainEmbeddings.items():
    trainEmbeddings[article] = [torch.mean(output[0], dim=1) for output in sentence_outputs]

In [5]:
# Same pooling as for the train split: mean over axis 1 of each item's first element.
for article, sentence_outputs in testEmbeddings.items():
    testEmbeddings[article] = [torch.mean(output[0], dim=1) for output in sentence_outputs]

In [6]:
# Peek at one entry: bind `temp` to the value stored under the first key of
# testEmbeddings and stop iterating. Nothing else in this cell uses it.
for i in testEmbeddings:
    temp = testEmbeddings[i]
    break

In [7]:
# padding
# Force every article to exactly 50 sentence vectors of width 768:
# articles with 50 or more sentences are cropped, shorter ones are zero-padded.

for article in trainEmbeddings:
    sentence_vectors = trainEmbeddings[article]
    if len(sentence_vectors) < 50:
        # Extend the stored list in place with zero vectors up to 50 entries.
        missing = 50 - len(sentence_vectors)
        sentence_vectors.extend(
            torch.zeros((1,768), dtype=torch.float32, device='cuda:0')
            for _ in range(missing)
        )
    else:
        sentence_vectors = sentence_vectors[0:50]

    # Copy the vectors row by row into one dense (50, 768) tensor.
    padded = torch.empty(50,768, dtype=torch.float32, device='cuda:0')
    for row, vector in enumerate(sentence_vectors):
        padded[row][:] = vector
    trainEmbeddings[article] = padded

In [8]:
# Same crop / zero-pad step for the test split: exactly 50 vectors of width 768 per article.
for article in testEmbeddings:
    sentence_vectors = testEmbeddings[article]
    if len(sentence_vectors) < 50:
        # Extend the stored list in place with zero vectors up to 50 entries.
        missing = 50 - len(sentence_vectors)
        sentence_vectors.extend(
            torch.zeros((1,768), dtype=torch.float32, device='cuda:0')
            for _ in range(missing)
        )
    else:
        sentence_vectors = sentence_vectors[0:50]

    # Copy the vectors row by row into one dense (50, 768) tensor.
    padded = torch.empty(50,768, dtype=torch.float32, device='cuda:0')
    for row, vector in enumerate(sentence_vectors):
        padded[row][:] = vector
    testEmbeddings[article] = padded

### image embeddings

In [9]:
# Load the pre-computed image embeddings, one pickle file per split.
# NOTE(review): pickle.load can execute arbitrary code from the file; only load trusted files.
with open('politifact/train_vgg_poli.pickle', 'rb') as train_pkl:
    train_vgg_poli = pickle.load(train_pkl)

with open('politifact/test_vgg_poli.pickle', 'rb') as test_pkl:
    test_vgg_poli = pickle.load(test_pkl)

In [10]:
# Accumulators for the matched samples. Each name is bound to its own fresh list.
train_text, test_text = [], []      # text embeddings
train_label, test_label = [], []    # labels
train_image, test_image = [], []    # image embeddings

# Bookkeeping of which article / image each sample came from.
trainTextNames, testTextNames = [], []    # train / test articles
trainImageNames, testImageNames = [], []  # image file names, i.e. name.jpg

In [11]:
# match code
# Pair every image embedding with the article(s) whose 'Top_img' id equals the
# image file name up to '.jpg', keeping only articles that also have text
# embeddings, and append the pair to the train lists.
#
# The articles are indexed by image id once, so each image costs one dict
# lookup instead of a full scan over trainData. Append order is unchanged:
# images in train_vgg_poli order, then articles in trainData order.
train_articles_by_image = {}
for article, record in trainData.items():
    train_articles_by_image.setdefault(record[-1]['Top_img'], []).append(article)

for image_name in train_vgg_poli:
    image_id = image_name.split('.jpg')[0]
    for article in train_articles_by_image.get(image_id, []):
        if article in trainEmbeddings:
            trainImageNames.append(image_name)
            trainTextNames.append(article)
            train_text.append(trainEmbeddings[article])
            train_image.append(train_vgg_poli[image_id + '.jpg'])
            train_label.append(trainData[article][0])

In [12]:
# Pair every test image embedding with the article(s) whose 'Top_img' id equals
# the image file name up to '.jpg', keeping only articles that also have text
# embeddings, and append the pair to the test lists.
#
# The articles are indexed by image id once, so each image costs one dict
# lookup instead of a full scan over testData. Append order is unchanged:
# images in test_vgg_poli order, then articles in testData order.
test_articles_by_image = {}
for article, record in testData.items():
    test_articles_by_image.setdefault(record[-1]['Top_img'], []).append(article)

for image_name in test_vgg_poli:
    image_id = image_name.split('.jpg')[0]
    for article in test_articles_by_image.get(image_id, []):
        if article in testEmbeddings:
            testImageNames.append(image_name)
            testTextNames.append(article)
            test_text.append(testEmbeddings[article])
            test_image.append(test_vgg_poli[image_id + '.jpg'])
            test_label.append(testData[article][0])

In [13]:
# Sanity check: number of matched articles in the train and test splits.
len(trainTextNames), len(testTextNames)

(381, 104)

In [14]:
# Image names are appended together with article names, so these counts should equal the article counts.
len(trainImageNames), len(testImageNames)

(381, 104)

In [15]:
# Class balance over both splits together: label 1 is counted as real, label 0 as fake.
all_labels = list(train_label) + list(test_label)
realCount = sum(1 for label in all_labels if label == 1)
fakeCount = sum(1 for label in all_labels if label == 0)

print(realCount, fakeCount)

321 164


In [16]:
import pandas as pd

# Export table for the test split: one row per matched (article, image, label).
# For the train split, build it from trainTextNames / trainImageNames / train_label instead.
df = pd.DataFrame({
    'article': testTextNames,
    'image': testImageNames,
    'label': test_label,
})

In [None]:
# (rows, columns) of the export frame.
df.shape

In [None]:
# Write the frame as a tab-separated file.
# NOTE(review): hard-coded absolute, machine-specific path — this cell fails wherever that directory does not exist.
df.to_csv('/media/data_dump_2/Shivangi/baseline_models/politifact_test.csv', sep='\t')

In [None]:
# saving updated final dataset in the csv files for future reference
import pandas as pd

# Only the train labels are written by this cell. The other lists (train/test
# articles, test labels, train/test image names) can be exported the same way
# by swapping the column and file name, e.g. 'test_label' ->
# 'politifact_test_label.csv'.
df = pd.DataFrame({'train_label': train_label})
df.to_csv('politifact_train_label.csv', index=False)

In [17]:
# One-hot encode the 0/1 labels for the 2-way softmax output.
# num_classes is fixed at 2 so both splits get the same width even if one of
# them happens to contain a single class (otherwise it is inferred per call
# from the largest label present).
# The ndim guard makes the cell safe to re-run: labels that are already
# one-hot (2-D) are not encoded a second time.
if np.ndim(train_label) == 1:
    train_label = to_categorical(train_label, num_classes=2)
if np.ndim(test_label) == 1:
    test_label = to_categorical(test_label, num_classes=2)

In [18]:
# Move every text-embedding tensor to host memory and convert it to a NumPy array.
train_text = [embedding.cpu().numpy() for embedding in train_text]
test_text = [embedding.cpu().numpy() for embedding in test_text]

In [19]:
# Uninitialised (n_train, 50, 768) buffer; it is filled in a following cell.
train_text_matrix = np.empty((len(train_text), 50, 768))

In [20]:
# Expect (number of train samples, 50, 768).
train_text_matrix.shape

(381, 50, 768)

In [21]:
# Copy each sample's text array into its slot of the preallocated matrix.
for row, text_array in enumerate(train_text):
    train_text_matrix[row, :, :] = text_array

In [22]:
# Uninitialised (n_test, 50, 768) buffer; it is filled in a following cell.
test_text_matrix = np.empty((len(test_text), 50, 768))

In [23]:
# Expect (number of test samples, 50, 768).
test_text_matrix.shape

(104, 50, 768)

In [24]:
# Copy each sample's text array into its slot of the preallocated matrix.
for row, text_array in enumerate(test_text):
    test_text_matrix[row, :, :] = text_array

In [25]:
# Shape of a single image embedding (first matched train sample).
train_image[0].shape

(1, 4096)

In [26]:
# Uninitialised (n_train, 4096, 1) buffer; it is filled in a following cell.
train_image_matrix = np.empty((len(train_image), 4096, 1))

In [27]:
# Expect (number of train samples, 4096, 1).
train_image_matrix .shape

(381, 4096, 1)

In [28]:
# Store each image embedding, reshaped to a (4096, 1) column, in its slot of the matrix.
for row, image_embedding in enumerate(train_image):
    train_image_matrix[row, :, :] = image_embedding.reshape(4096,1)

In [29]:
# Uninitialised (n_test, 4096, 1) buffer; it is filled in a following cell.
test_image_matrix = np.empty((len(test_image), 4096, 1))

In [30]:
# Expect (number of test samples, 4096, 1).
test_image_matrix.shape

(104, 4096, 1)

In [31]:
# Store each image embedding, reshaped to a (4096, 1) column, in its slot of the matrix.
for row, image_embedding in enumerate(test_image):
    test_image_matrix[row, :, :] = image_embedding.reshape(4096,1)

In [32]:
# Drop the trailing singleton axis: (n_samples, 4096, 1) -> (n_samples, 4096),
# the layout expected by Input(shape=(4096,)).
# The sample count is read from the matrix itself instead of being hard-coded
# (381 / 104), so the cell keeps working when the number of matched samples changes.
train_image_matrix = train_image_matrix.reshape(train_image_matrix.shape[0], 4096)
test_image_matrix = test_image_matrix.reshape(test_image_matrix.shape[0], 4096)

### multimodal: XLNET + dense layer + VGG

In [33]:
# Two-branch multimodal classifier.
#   text branch : (50, 768) input -> Flatten -> Dense 1000/500/100 -> BatchNorm -> Dropout(0.4)
#   image branch: (4096,) input   -> Dense 2000/1000/100          -> BatchNorm -> Dropout(0.4)
# The two 100-d branch outputs are concatenated and passed through
# Dense 200/100/50 -> Dropout(0.4) -> 2-way softmax.
# Every hidden Dense layer uses ReLU, L2(0.01) kernel regularisation and
# He-normal initialisation with seed 0.

# --- text branch ---
input_text = Input(shape=(50,768))
text_flat = Flatten()(input_text)
dense_text = Dense(1000,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(text_flat)
dense_text = Dense(500,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(dense_text)
dense_text = Dense(100,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(dense_text)
dense_text = BatchNormalization()(dense_text)
dense_text_drop = Dropout(0.4)(dense_text)

# --- image branch ---
input_image = Input(shape=(4096,))
dense_image = Dense(2000,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(input_image)
dense_image = Dense(1000, activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(dense_image)
dense_image = Dense(100,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(dense_image)
dense_image = BatchNormalization()(dense_image)
dense_image_drop = Dropout(0.4)(dense_image)

# --- fusion head ---
concat = concatenate([dense_text_drop,dense_image_drop])

inter1_dense = Dense(200,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(concat)
inter1_dense = Dense(100,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(inter1_dense)
final_dense = Dense(50,activation='relu',kernel_regularizer=regularizers.l2(0.01), kernel_initializer=initializers.he_normal(seed=0))(inter1_dense)
final_dropout = Dropout(0.4)(final_dense)
output = Dense(2, activation='softmax')(final_dropout)

model = Model(inputs=[input_text,input_image], outputs=output)
adam = optimizers.Adam(lr=1e-4)

# Keep only the weights of the epoch with the best validation accuracy.
# The target directory must already exist.
checkpoint = ModelCheckpoint(filepath='../checkpoints_polity/dense_MM_model.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

# The metric is registered as 'acc' (not 'accuracy'): Keras >= 2.3 logs a
# metric under exactly the name given here, while older releases log both
# spellings as 'acc'. Using 'acc' therefore yields the 'acc' / 'val_acc'
# log keys on every version, which is what monitor='val_acc' relies on.
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['acc'])
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 50, 768)      0                                            
__________________________________________________________________________________________________
flatten_1 (Flatten)             (None, 38400)        0           input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
dense_1 (Dense)                 (None, 1000)         38401000    flatten_1[0][0]                  
____________________________________________________________________________________________

In [34]:
# Train the multimodal model for 100 epochs with mini-batches of 32.
# NOTE(review): the test split is passed as validation data and the checkpoint
# callback selects the best epoch on it, so the reported validation accuracy is
# not an unbiased test estimate — confirm this is intended.
history = model.fit(
    x=[train_text_matrix, train_image_matrix],
    y=train_label,
    validation_data=([test_text_matrix, test_image_matrix], test_label),
    batch_size=32,
    epochs=100,
    callbacks=callbacks_list,
)

Train on 381 samples, validate on 104 samples
Epoch 1/100
Epoch 2/100



Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 

In [35]:
# Persist the per-epoch training metrics as a real JSON object of the form
# {metric name: [value for epoch 1, value for epoch 2, ...]}.
# Dumping str(history.history) would store one quoted Python repr string that
# JSON readers cannot use as a mapping. Values are cast to plain floats because
# they may be NumPy scalars, which the json module cannot serialise.
history_for_json = {
    metric: [float(value) for value in values]
    for metric, values in history.history.items()
}
with open('XL_poli_history.json', 'w') as f:
    json.dump(history_for_json, f)

In [None]:
# Accuracy per epoch for the training data and the validation data.
# Depending on the Keras version and the metric name given to compile(), the
# history key is either 'acc' or 'accuracy'; use whichever one is present
# instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch for the training data and the validation data.
# The title was left empty; it now mirrors the 'Model accuracy' figure so the
# plot can be read on its own.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

### XLNET + dense layer

In [None]:
def baseline_model():
    """Build and compile the text-only dense baseline.

    Architecture: Flatten over the (50, 768) input, three ReLU Dense layers
    (1000 -> 500 -> 100) with L2(0.01) kernel regularisation, Dropout(0.4)
    and a 2-way softmax output. Compiled with SGD (lr=0.001, clipnorm=1.)
    and categorical cross-entropy.

    The accuracy metric is registered as 'acc' rather than 'accuracy'.
    Keras >= 2.3 logs a metric under exactly the name given to compile(),
    while older releases log both spellings as 'acc'; 'acc' therefore gives
    the 'acc' / 'val_acc' log keys on every version, which is what a
    ModelCheckpoint with monitor='val_acc' needs.

    Returns:
        The compiled keras Sequential model.
    """
    model = Sequential()
    model.add(Flatten(input_shape=(50,768)))
    model.add(Dense(1000, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dense(500, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dense(100, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
    model.add(Dropout(0.4))
    model.add(Dense(2, activation='softmax'))
    sgd = optimizers.SGD(lr=0.001, clipnorm=1.)
    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['acc'])
    return model

In [None]:
# Fresh text-only dense model plus a checkpoint that keeps only the epoch with
# the highest monitored 'val_acc'.
# NOTE(review): 'val_acc' must match the metric name Keras actually logs — verify for the installed version.
model = baseline_model()
checkpoint = ModelCheckpoint(
    filepath='../checkpoints_polity/dense_Text_model.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callbacks_list = [checkpoint]
model.summary()

In [None]:
# Train the text-only dense model for 100 epochs with mini-batches of 32;
# the test split is used as validation data.
tempHist = model.fit(
    x=train_text_matrix,
    y=train_label,
    validation_data=(test_text_matrix, test_label),
    batch_size=32,
    epochs=100,
    callbacks=callbacks_list,
)

### XLNET + LSTM 

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D,MaxPooling1D,Flatten
from keras.layers import LSTM, Bidirectional
from keras.optimizers import RMSprop

In [None]:
def lstm():
    """Build and compile the text-only LSTM classifier.

    Architecture: LSTM(128) over the (50, 768) input sequence, Dropout(0.5),
    a 595-unit ReLU Dense layer and a 2-way softmax output. Compiled with the
    default 'sgd' optimizer and categorical cross-entropy.

    The accuracy metric is registered as 'acc' rather than 'accuracy'.
    Keras >= 2.3 logs a metric under exactly the name given to compile(),
    while older releases log both spellings as 'acc'; 'acc' therefore gives
    the 'acc' / 'val_acc' log keys on every version, which is what a
    ModelCheckpoint with monitor='val_acc' needs.

    Returns:
        The compiled keras Sequential model.
    """
    model = Sequential()
    model.add(LSTM(128,input_shape=(50,768)))
    model.add(Dropout(0.5))
    model.add(Dense(595, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy',optimizer='sgd',metrics=['acc'])
    return model

In [None]:
# Fresh LSTM model plus a checkpoint that keeps only the epoch with the highest
# monitored 'val_acc'.
# NOTE(review): 'val_acc' must match the metric name Keras actually logs — verify for the installed version.
model = lstm()
checkpoint = ModelCheckpoint(
    filepath='../checkpoints_polity/lstm_text_model.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callbacks_list = [checkpoint]

In [None]:
# Print the layer-by-layer summary of the current model.
model.summary()

In [None]:
# Train the current model for 100 epochs with mini-batches of 32;
# the test split is used as validation data.
model.fit(
    x=train_text_matrix,
    y=train_label,
    validation_data=(test_text_matrix, test_label),
    batch_size=32,
    epochs=100,
    callbacks=callbacks_list,
)

### XLNET + CNN

In [None]:
def cnn_model():
    """Build and compile the text-only 1-D CNN classifier.

    Architecture: two blocks of Conv1D(3 filters, kernel 5, ReLU,
    channels_first) -> MaxPooling1D(2) -> Dropout(0.5), then Flatten,
    Dense 500 and Dense 100 (ReLU, each followed by Dropout(0.5)) and a
    2-way softmax output. Compiled with the default 'sgd' optimizer and
    categorical cross-entropy.

    The accuracy metric is registered as 'acc' rather than 'accuracy'.
    Keras >= 2.3 logs a metric under exactly the name given to compile(),
    while older releases log both spellings as 'acc'; 'acc' therefore gives
    the 'acc' / 'val_acc' log keys on every version, which is what a
    ModelCheckpoint with monitor='val_acc' needs.

    Returns:
        The compiled keras Sequential model.
    """
    model = Sequential()
    # NOTE(review): the Conv1D layers are given data_format='channels_first'
    # while the MaxPooling1D layers are left at their default data format —
    # confirm the pooling runs over the intended axis.
    model.add(Conv1D(filters=3, kernel_size=5, activation='relu',data_format='channels_first' , input_shape=(50,768)))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Conv1D(filters=3, kernel_size=5, activation='relu',data_format='channels_first' ))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.5))
    model.add(Flatten())
    model.add(Dense(500, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['acc'])
    return model

In [None]:
# Checkpoint that keeps only the epoch with the highest monitored 'val_acc',
# plus a fresh CNN model.
# NOTE(review): 'val_acc' must match the metric name Keras actually logs — verify for the installed version.
checkpoint = ModelCheckpoint(
    filepath='../checkpoints_polity/cnn_text_model.hdf5',
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
callbacks_list = [checkpoint]
model = cnn_model()

In [None]:
# Print the layer-by-layer summary of the current model.
model.summary()

In [None]:
# Train the current model for 100 epochs with mini-batches of 32;
# the test split is used as validation data.
model.fit(
    x=train_text_matrix,
    y=train_label,
    validation_data=(test_text_matrix, test_label),
    batch_size=32,
    epochs=100,
    callbacks=callbacks_list,
)