# LSTM model for identifying Silk Road authors

This notebook contains the code used to build and train an RNN model on sequence representations of Silk Road posts. The configuration values defined below must match those used when building the training dataframe.

In [None]:
import pandas as pd
import numpy as np
import pickle
import tensorflow

In [None]:
# Configuration for this run. Every value except VOCAB_SIZE is also embedded
# in the names of the pickle files loaded below.
GLOVE_TYPE = "twitter"  # tag identifying which GloVe embedding set the files were built from
EMBEDDING_VECTOR_SIZE = 50 # should match glove file
INPUT_LENGTH = 200  # passed to the Embedding layer as input_length
N_AUTHORS = 25  # passed to build_rnn_model as output_size
VOCAB_SIZE = 5000  # passed to build_rnn_model as vocabulary_size
# Training dataframe; later cells read its "author" and "input" columns.
df = pd.read_pickle(f"files/data/{GLOVE_TYPE}_{N_AUTHORS}_{EMBEDDING_VECTOR_SIZE}_{INPUT_LENGTH}_df.pickle")

# Embedding weights that are handed to the Embedding layer.
# NOTE: unpickling executes arbitrary code, so only load files produced by this project.
with open(f"files/data/{GLOVE_TYPE}_{N_AUTHORS}_{EMBEDDING_VECTOR_SIZE}_{INPUT_LENGTH}_embedding.pickle",'rb') as f:
    embedding_matrix = pickle.load(f)

## Set up function for building model
Here we use a very basic, somewhat arbitrarily defined RNN model. This should be expanded to accept hidden layer size, dropout, activation, and optimizer as arguments. In the context of this work as a final project for EECE 5644, however, the cross validation will only be performed on my CPU, which is inherently limiting.

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dropout, Activation, Dense, Embedding
from tensorflow.keras import metrics

def build_rnn_model(vocabulary_size, embedding_dim, input_length, embedding_matrix, output_size):
    """Build and compile a small RNN classifier on top of frozen embeddings.

    Args:
        vocabulary_size: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        input_length: Length of every input token sequence.
        embedding_matrix: Pre-trained weights loaded into the Embedding
            layer; Keras requires shape ``(vocabulary_size, embedding_dim)``.
        output_size: Number of target classes; sets the width of the
            output layer so it lines up with the one-hot labels.

    Returns:
        The compiled ``Sequential`` model (its summary is printed as a
        side effect).
    """
    model = Sequential()
    model.add(
        Embedding(
            vocabulary_size,
            embedding_dim,
            input_length=input_length,
            weights=[embedding_matrix],
            trainable=False,  # keep the pre-trained vectors fixed
        )
    )

    # NOTE(review): softmax is an unusual choice for a hidden layer; it is
    # kept from the original architecture -- consider "relu" or "tanh".
    model.add(Dense(100, activation="softmax"))

    model.add(Dropout(0.1))

    # Output layer: one unit per class. The width was previously hard-coded
    # to 10, which cannot match one-hot targets for any other class count.
    model.add(SimpleRNN(output_size, activation="softmax"))

    model.compile(
        loss="categorical_crossentropy",
        optimizer="adam",
        # The scikit-learn wrapper's score() looks for a metric literally
        # named "accuracy"/"acc". With a categorical cross-entropy loss Keras
        # resolves "accuracy" to categorical accuracy, so the quantity
        # measured is unchanged.
        metrics=["accuracy"],
    )

    # Previously referenced an undefined name (model_glove) -> NameError.
    model.summary()
    return model

## Convert author labels into categorical labels

In [None]:
from tensorflow.keras.utils import to_categorical

# Give every distinct author an integer id, numbered in order of first
# appearance in the dataframe.
string_labels = df["author"].unique()
label_dict = {author: idx for idx, author in enumerate(string_labels)}
labels = df["author"].map(label_dict)
# One-hot encode the ids; with num_classes=None the number of classes is
# inferred from the largest id present.
categorical_labels = to_categorical(labels, num_classes=None)

## Format input values

In [None]:
# Collect the per-post sequences from the "input" column into one NumPy array
# (assumes every entry has the same length -- TODO confirm against the
# dataframe-building notebook).
data = np.array(df["input"].values.tolist())

## Build model
Use the scikit-learn wrapper so that we can perform cross-validation.

In [None]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from functools import partial

# Bind the fixed configuration to the builder so the scikit-learn wrapper can
# construct a fresh model without being given any arguments.
model_fn = partial(
    build_rnn_model,
    VOCAB_SIZE,
    EMBEDDING_VECTOR_SIZE,
    INPUT_LENGTH,
    embedding_matrix,
    N_AUTHORS,
)
# epochs and batch_size are forwarded by the wrapper to the training call.
model = KerasClassifier(build_fn=model_fn, epochs=1, batch_size=128)

## Set up kfolds and run cross validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Fix the random seeds for reproducibility. NumPy's seed alone does not cover
# TensorFlow's own generator (weight initialisation, dropout), so seed both.
seed = 7
np.random.seed(seed)
tensorflow.random.set_seed(seed)

# Shuffle before splitting so that each fold mixes posts from across the
# dataframe instead of taking contiguous slices, and tie the shuffle to the
# seed so the folds are the same on every run.
kfold = KFold(n_splits=5, shuffle=True, random_state=seed)
scores = cross_val_score(model, data, categorical_labels, cv=kfold)
scores