# A Deep Learning Approach to Identifying Covert Disinformation Networks

### Boilerplate

In [None]:
# Suppress warning caused by h5py version Conda is using
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from multiplicative_lstm import MultiplicativeLSTM
import numpy as np
import preProcess

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Activation, Bidirectional, Conv1D, Dense, Dropout
from keras.layers import Embedding, GlobalMaxPooling1D, LSTM, MaxPooling1D

### Preprocess Data

In [None]:
# Calls preprocessing routines to turn positive and negative datasets into train and test
#   input and output vectors
# Here, pos.csv comes from tweets sent by the Internet Research Agency
#   neg.csv is a random set of tweets geolocated in the U.S.,
#   neg3.csv is a set of tweets chosen to represent a similar user and content makeup to pos.csv.
# Details on datasets in the project paper
def loadData():
    """Load the tweet datasets and return vectorized train/test splits.

    The positive file and the two negative files are read with
    ``preProcess.readData``, indexed with ``preProcess.getIndexes``,
    vectorized with ``preProcess.vectorize`` and finally split with
    ``preProcess.splitData``.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` exactly as produced by
        ``preProcess.splitData``.
    """
    # Local import keeps this cell self-contained.
    import os

    # Build the paths with os.path.join so they resolve on POSIX as well as
    # on Windows; a hard-coded "datasets\\pos.csv" only works on Windows.
    DATADIR = "datasets"
    POSFILE = os.path.join(DATADIR, "pos.csv")
    NEGFILE = os.path.join(DATADIR, "neg.csv")
    NEGFILE2 = os.path.join(DATADIR, "neg3.csv")

    # NOTE(review): presumably row caps per input file -- confirm in preProcess.
    MAXPOS = 200000
    MAXNEG = 200000
    MAXNEG2 = 0
    # NOTE(review): presumably the number of tweets merged into one sample --
    # confirm in preProcess.readData.
    GROUP = 20
    TRAINPCT = .8

    # Split point: TRAINPCT of the expected number of grouped samples.
    splitSize = int(TRAINPCT*((MAXPOS//GROUP + (MAXNEG + MAXNEG2)//GROUP)))
    (x, y) = preProcess.readData(POSFILE, NEGFILE, NEGFILE2, MAXPOS, MAXNEG, MAXNEG2, GROUP)
    indexes = preProcess.getIndexes(x)
    (X, Y) = preProcess.vectorize(x, y, indexes)
    (x_train, y_train), (x_test, y_test) = preProcess.splitData(X, Y, splitSize)

    return (x_train, y_train), (x_test, y_test)

In [None]:
# Build the train/test splits once; the hyperparameter, training and
# evaluation cells below all read these four module-level names.
(x_train, y_train), (x_test, y_test) = loadData()

### Run Model

The first model tried was a modified version of the imdb_cnn_lstm example on the Keras team's GitHub:
https://github.com/keras-team/keras/blob/master/examples/imdb_cnn_lstm.py

The multiplicative LSTM tested came from https://github.com/titu1994/Keras-Multiplicative-LSTM.git

In [None]:
# Hyperparameters shared by the model-building and training cells.

# Embedding layer: the input dimension must exceed the largest integer
# found in either split, hence the +1.
max_features = max(np.amax(split) for split in (x_train, x_test)) + 1
embedding_size = 4
# Length of the first training sample; passed to Embedding as input_length.
maxlen = len(x_train[0])

# Convolution / pooling stack (same values for every Conv1D/MaxPooling1D pair).
kernel_size, filters, pool_size = 16, 64, 4

# Only referenced by the commented-out recurrent layers in the model cell.
lstm_output_size = 70

# Training settings used by model.fit / model.evaluate.
batch_size, epochs = 32, 1

In [None]:
# Build the classifier: embedding -> three conv/pool stages -> global max
# pool -> single sigmoid unit for the binary decision.
model = Sequential([
    Embedding(max_features, embedding_size, input_length=maxlen),
    Dropout(0.20),

    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=pool_size),
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=pool_size),
    Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1),
    MaxPooling1D(pool_size=pool_size),
    GlobalMaxPooling1D(),

    # These were the models listed in the project paper that didn't work well.
    # NOTE(review): recurrent layers expect 3D input, so enabling one of these
    # presumably also means removing GlobalMaxPooling1D above -- confirm.
    # LSTM(lstm_output_size),
    # Bidirectional(LSTM(lstm_output_size)),
    # MultiplicativeLSTM(lstm_output_size, dropout=0.2, recurrent_dropout=0.2),
    # Bidirectional(MultiplicativeLSTM(lstm_output_size, dropout=0.2, recurrent_dropout=0.2)),

    Dense(1),
    Activation('sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train the model; the test split doubles as the per-epoch validation set.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(x_test, y_test),
)

In [None]:
# Final loss and accuracy on the test split.
score, acc = model.evaluate(
    x=x_test,
    y=y_test,
    batch_size=batch_size,
)

In [None]:
# Report the per-epoch training history followed by the test metrics.
print('%s %s' % ('History: ', history.history))
print('%s %s' % ('Test score:', score))
print('%s %s' % ('Test accuracy:', acc))