### Preprocessing

In [None]:
import os, os.path
import numpy as np
import gensim
from gensim.parsing.preprocessing import preprocess_string, remove_stopwords, strip_punctuation
import pickle as pkl
import itertools
from sklearn.model_selection import train_test_split
import io

In [None]:
# Lowercase, strip punctuation and remove stopwords
def preprocess(sentence_list):
    """Turn the lines of one document into cleaned tokens.

    The strings in ``sentence_list`` are concatenated without a separator
    and handed to gensim's ``preprocess_string`` together with three
    filters that are applied in order: lowercasing, ``strip_punctuation``
    and ``remove_stopwords``.

    Returns whatever ``preprocess_string`` returns for that document.
    """
    document = ''.join(sentence_list)
    pipeline = (str.lower, strip_punctuation, remove_stopwords)
    return preprocess_string(document, pipeline)


In [None]:
#Build Word2Vector model
def w2v_training(total_corpus, embedding_size, min_word_count=2, num_workers=4,
                 context=5,
                 save_path='/Users/Hannah/ML_project/Enron/trained_word2vector'):
    """Train a gensim Word2Vec model on the corpus and optionally save it.

    Args:
        total_corpus: iterable of tokenized documents (one token list each).
        embedding_size: dimensionality of the learned word vectors.
        min_word_count: passed to Word2Vec as ``min_count``.
        num_workers: passed to Word2Vec as ``workers``.
        context: passed to Word2Vec as ``window``.
        save_path: where the trained model is written; pass ``None`` to skip
            saving. The default is the location the notebook always used.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    print("Train gensim word2vector...")
    # NOTE: ``size=`` is the keyword this notebook was written against; the
    # rest of the file (``model.wv.vocab``) relies on the same gensim API.
    w2vModel = gensim.models.Word2Vec(
        total_corpus,
        size=embedding_size,
        window=context,
        min_count=min_word_count,
        workers=num_workers,
    )
    if save_path is not None:
        w2vModel.save(save_path)
    return w2vModel

In [None]:
#Build embedding matrix for word2vector or Fasttext
def build_embedding(input_model, total_corpus, word_index, EMBEDDING_SIZE,
                    path='/Users/Hannah/ML_project/Enron/'):
    """Build (or load from cache) the embedding matrix for ``word_index``.

    Args:
        input_model: ``'trained_word2vector'`` trains Word2Vec on
            ``total_corpus`` via ``w2v_training``; ``'Fasttext'`` reads the
            pre-trained ``crawl-300d-2M.vec`` file found in ``path``.
            Any other value leaves the matrix all-zero.
        total_corpus: tokenized documents; only used for Word2Vec training.
        word_index: mapping ``word -> row index`` (row 0 is left as zeros).
        EMBEDDING_SIZE: number of columns of the matrix.
        path: directory holding the Fasttext vectors and the ``.npy`` caches.
            Defaults to the location the notebook always used.

    Returns:
        ``(embedding_matrix, num_words)`` where ``num_words`` is
        ``len(word_index) + 1`` and the matrix has shape
        ``(num_words, EMBEDDING_SIZE)``. Rows of words without a vector
        stay all-zero.
    """
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_SIZE))

    if input_model == 'trained_word2vector':
        model = w2v_training(total_corpus, EMBEDDING_SIZE)
        for word, i in word_index.items():
            if word in model.wv.vocab:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = model.wv[word]
        np.save(os.path.join(path, 'trained_word2vector_embedding_matrix.npy'),
                embedding_matrix)

    elif input_model == 'Fasttext':
        # The cache check must use the same file name that is written and
        # loaded below, otherwise the cache can never be hit.
        cache_file = os.path.join(path, 'fasettext_embedding_matrix.npy')

        cached_matrix = None
        if os.path.isfile(cache_file):
            print('Load embedding matrix...')
            cached_matrix = np.load(cache_file)
            if cached_matrix.shape != embedding_matrix.shape:
                # Cache was built for another vocabulary/dimension: rebuild.
                cached_matrix = None

        if cached_matrix is not None:
            embedding_matrix = cached_matrix
        else:
            print('Build embedding matrix...')
            fasttext_path = os.path.join(path, 'crawl-300d-2M.vec')
            with open(fasttext_path, encoding='utf-8') as vec_file:
                for line_no, line in enumerate(vec_file):
                    values = line.split()
                    if not values:
                        continue
                    if (line_no == 0 and len(values) == 2
                            and values[0].isdigit() and values[1].isdigit()):
                        # "<vocab size> <dimension>" header, not a word vector.
                        continue
                    word = values[0]
                    # Only keep vectors for words we need; avoids holding the
                    # full pre-trained vocabulary in memory.
                    if word in word_index:
                        embedding_matrix[word_index[word]] = np.asarray(
                            values[1:], dtype='float32')
            np.save(cache_file, embedding_matrix)

    return embedding_matrix, num_words


In [None]:
#Load data
# Expected layout: <path>/<class name>/text/<one file per document>.
path='/Users/Hannah/ML_project/Enron/data/'
dataset=[]       #one list of raw lines per document
class_name=[]    #class folder names, position == label id
count_labels=[]  #per class: the label id repeated once per document
# Ignore stray entries (e.g. hidden files) that are not class folders with a
# text/ sub-directory; listing them would raise and shift the label ids.
class_dirs=[entry for entry in os.listdir(path)
            if os.path.isdir(path+entry+'/text/')]
for i, names in enumerate(class_dirs):
    class_name.append(names)
    text_dir=path+names+'/text/'
    # List the directory once so labels and documents cannot get out of step.
    text_files=[texts for texts in os.listdir(text_dir)
                if os.path.isfile(text_dir+texts)]
    count_labels.append([i]*len(text_files))
    for texts in text_files:
        with open(text_dir+texts) as file:
            dataset.append(file.readlines())
labels=list(itertools.chain.from_iterable(count_labels)) #flatten labels
labels_index={name: idx for idx, name in enumerate(class_name)}

In [None]:
#Build Corpus
# One preprocessed token list per document, in the same order as `dataset`.
total_corpus=[preprocess(sentence_list) for sentence_list in dataset]

### Model

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
from keras import backend as K
from sklearn.model_selection import train_test_split
from keras.layers import Input, Flatten, Reshape, LSTM, Dense, Embedding, merge, Dropout, dot, Activation, Bidirectional,GRU
from sklearn.utils import class_weight

In [None]:
#Tokenize corpus
MAX_NB_WORDS=45000    # value handed to the Tokenizer as `num_words`
EMBEDDING_SIZE = 300  # width of the embedding matrix / Embedding layer

# Fit the tokenizer on the preprocessed corpus; 'UNK' is its out-of-vocabulary token.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token='UNK')
tokenizer.fit_on_texts(total_corpus)

# word -> integer id mapping, and the corpus encoded with those ids
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(total_corpus)

In [None]:
#Choose fasttext or word2vec as embedding
# The first argument selects the source: 'Fasttext' reads the pre-trained
# crawl-300d-2M.vec vectors, 'trained_word2vector' trains Word2Vec on
# total_corpus. build_embedding returns the matrix together with
# num_words = len(word_index) + 1 (its number of rows).
embedding_matrix, num_words= build_embedding('Fasttext', total_corpus,  word_index,  EMBEDDING_SIZE)

In [None]:
#Set max sequence length
# Used as `maxlen` when padding the encoded documents and as the input
# length of the network, so every document is represented by 50 token ids.
MAX_SEQUENCE_LENGTH = 50

In [None]:
#Pad sequences to same length
# Brings every encoded document to exactly MAX_SEQUENCE_LENGTH ids.
# NOTE(review): no padding/truncating mode is given, so the Keras defaults
# apply (documented as padding and truncating at the front) - confirm this is intended.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [None]:
#One hot labels
# Integer class ids -> one-hot rows, then report both tensor shapes.
label_ids = np.asarray(labels)
one_hot_labels = to_categorical(label_ids)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', one_hot_labels.shape)

In [None]:
#Split data to training set and validation
# 70/30 split, stratified on the one-hot labels, with a fixed seed so the
# split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    data,
    one_hot_labels,
    test_size=0.3,
    random_state=66,
    stratify=one_hot_labels,
)

In [None]:
#Define evaluation metrics
def recall_m(y_true, y_pred):
    """Recall computed with Keras backend ops.

    Counts the rounded, clipped entries of ``y_true * y_pred`` as true
    positives and the rounded, clipped entries of ``y_true`` as possible
    positives; ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    actual_mask = K.round(K.clip(y_true, 0, 1))
    hit_count = K.sum(hit_mask)
    actual_count = K.sum(actual_mask)
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision computed with Keras backend ops.

    Counts the rounded, clipped entries of ``y_true * y_pred`` as true
    positives and the rounded, clipped entries of ``y_pred`` as predicted
    positives; ``K.epsilon()`` in the denominator avoids division by zero.
    """
    hit_mask = K.round(K.clip(y_true * y_pred, 0, 1))
    predicted_mask = K.round(K.clip(y_pred, 0, 1))
    hit_count = K.sum(hit_mask)
    predicted_count = K.sum(predicted_mask)
    return hit_count / (predicted_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 score: harmonic mean of ``precision_m`` and ``recall_m``.

    ``K.epsilon()`` is added to the denominator so the result is defined
    when both precision and recall are zero.
    """
    p = precision_m(y_true, y_pred)
    r = recall_m(y_true, y_pred)
    numerator = p * r
    denominator = p + r + K.epsilon()
    return 2 * (numerator / denominator)

In [None]:
#Use class weight to deal with imbalanced data
# 'balanced' weights are inversely proportional to class frequency.
# Keyword arguments are used because newer scikit-learn releases reject the
# positional form of compute_class_weight.
label_array = np.asarray(labels)
present_classes = np.unique(label_array)
balanced_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=present_classes, y=label_array)
# Keras' fit(class_weight=...) expects a {class index: weight} dict, not the
# bare array returned by scikit-learn.
class_weights = {int(cls): float(weight)
                 for cls, weight in zip(present_classes, balanced_weights)}

In [None]:
# Feed the pre-trained word embeddings into an Embedding layer.
# trainable=False keeps the embedding weights fixed during training.

print('Training model.')

embedding_layer = Embedding(
    num_words,
    EMBEDDING_SIZE,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Bidirectional GRU: two identical stages, each a summed bidirectional GRU
# that returns the full sequence, followed by 50% dropout.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
x = embedded_sequences
for _ in range(2):
    x = Bidirectional(
        GRU(128, return_sequences=True, recurrent_dropout=0.1),
        merge_mode="sum",
    )(x)
    x = Dropout(0.5)(x)

# Flatten the per-timestep outputs and classify with one softmax unit per class.
x = Flatten()(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=[recall_m, precision_m, f1_m],
)

# Train for 10 epochs with class weighting, validating on the held-out split.
model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
    batch_size=128,
    class_weight=class_weights,
)