# Author__ Hussam Qassim__

# San Francisco Crime Classifier using BLSTM neural network

# Setup

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Import the main necessary libraries
import os
import warnings
from distutils.version import LooseVersion
from datetime import datetime
import numpy as np
import csv
import re
import tensorflow as tf

# To make this notebook's output stable across runs
def rset_graph(seed=42):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

# Check TensorFlow Version. Please use TensorFlow version 1.0 or newer
assert LooseVersion(tf.__version__) >= LooseVersion('1.0')
print('TensorFlow Version: {}'.format(tf.__version__))

TensorFlow Version: 1.3.0


# Load and preprocessing the data 

In [2]:
# Create a matrix of features (x) and a vector of class labels (y)
x = []
y = []
with open('dataset/train.csv', 'r') as csvfile:
    reader = csv.reader(csvfile)
    for i, row in enumerate(reader):
        if i == 0:
            pass
        else:
            date = re.search("([0-9]{4})-([0-9]{2})-([0-9]{2})", row[0]).groups()
            date = [int(x) for x in date]
            time = re.search("([0-9]{2}):([0-9]{2}):([0-9]{2})", row[0]).groups()
            time = [int(x) for x in time]
            category_string = row[1]
            dayofweek_string = row[3]
            pddistrict_string = row[4]
            longitude = float(row[7])
            latitude = float(row[8])
            x_row = date + time + [longitude, latitude, \
                    dayofweek_string, pddistrict_string]
            y_label = category_string
            x.append(x_row)
            y.append(y_label)

# One-hot encoding for dayofweek and pddistrict vars
dayofweek_set = set()
pddistrict_set = set()
for row in x:
    dayofweek_set.add(row[-2])
    pddistrict_set.add(row[-1])
dayofweek_dict = {item: i for i, item in enumerate(dayofweek_set)}
pddistrict_dict = {item: i for i, item in enumerate(pddistrict_set)}
num_unique_dayofweek = len(dayofweek_dict)
num_unique_pddistrict = len(pddistrict_dict)
for i, row in enumerate(x):
    encoded_dayofweek = [0]*num_unique_dayofweek
    encoded_pddistrict = [0]*num_unique_pddistrict
    current_dayofweek = row[-2]
    current_pddistrict = row[-1]
    encoded_dayofweek[dayofweek_dict[current_dayofweek]] = 1
    encoded_pddistrict[pddistrict_dict[current_pddistrict]] = 1
    x[i] = row[:-2] + encoded_dayofweek + encoded_pddistrict

print(y)

# label binarization
category_set = set()
for label in y:
    category_set.add(label)
category_dict = {item: i for i, item in enumerate(sorted(category_set))}
num_unique_category = len(category_dict)
for i, label in enumerate(y):
    y[i] = category_dict[label]

# Convert x, y to numpy array
x = np.asarray(x, dtype=np.float32)
y = np.asarray(y, dtype=np.int32)

print('Done..')

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.


Done..


# Normalizing and shaping the data 

In [None]:
from sklearn.preprocessing import MinMaxScaler

n_features = 25 
seq_len = 1

scaler = MinMaxScaler()
x = scaler.fit_transform(x)

print('Done..')

# Build the Training, Validation and Test datasets

In [None]:
# Divide the dataset into 80% Training, 10% Validation, and 10% Testing  
split_idx = int(len(x)*0.8)
train_x, val_x = x[:split_idx], x[split_idx:]
train_y, val_y = y[:split_idx], y[split_idx:]

test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]

print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

# Build the Neural Network

In [None]:
# Define the Neural Network parameters
lstm_size = 256
batch_size = 250
n_layers = 2
drop_out = 0.5
learning_rate = 0.001
epochs = 100

print('Done..')

### Create TF Placeholders for the Neural Network

In [None]:
# Create the graph object
graph = tf.Graph()
# Add nodes to the graph
with graph.as_default():
    inputs_ = tf.placeholder(tf.float32, [None, seq_len, n_features], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')   
print('Done..')

# Construction phase_ Build the TF graph 

### Build NN Cell and Initialize

In [None]:
'''
Bidirectional LSTMs train two instead of one LSTMs on the input sequence. The first on the input sequence 
as-is and the second on a reversed copy of the input sequence. This can provide additional context to the 
network and result in faster and even fuller learning on the problem
'''
# Build one BLSTM and Fully_connected layer with Softmax classifier
with graph.as_default():
    
# Using He initialization can significantly reduce the vanishing/exploding gradients problems   
    he_init = tf.contrib.layers.variance_scaling_initializer()

# Create the graph of 2 hidden BLSTM layer and one output layer
    with tf.name_scope("BLSTM"):
        def lstm_cell():
            '''
This LSTM variant with extra connections called peephole connections: the previous long-term state is added as
an input to the controllers of the forget gate and the input gate, and the current long-term tate is added as
input to the controller of the output gate
'''
            cell = tf.contrib.rnn.LSTMCell(num_units=lstm_size, initializer=he_init, use_peepholes=True)
            '''
            Applying dropout between the LSTM layers to prevent overfitting the training set.The following 
code applies dropout to the inputs of each layer in the LSTM, dropping each input with a 50% probability
'''
            cell_drop = tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=keep_prob)
            return cell_drop
    
        cell_fw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(n_layers)])
        cell_bw = tf.contrib.rnn.MultiRNNCell([lstm_cell() for _ in range(n_layers)])
   
        outputs, final_state = tf.nn.bidirectional_dynamic_rnn(cell_fw=cell_fw, cell_bw=cell_bw, 
                                                          inputs=inputs_, dtype=tf.float32, 
                                                          scope="BiLSTM")
        outputs = tf.concat(axis = 2, values = outputs)
        last_output = outputs[:,-1,:]
    
        logits = tf.contrib.layers.fully_connected(last_output, 39, activation_fn=None, scope="logits")
        
# Create the cost function     
    with tf.name_scope("loss"): 

# Computes the cross entropy, it is equivalent to applying the softmax activation function and then
# computing the cross entropy, but it is more efficient, and it properly takes care of corner cases like logits 
# equal to 0

        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels_, logits=logits)
# Computes the mean of elements across dimensions of a tensor
        loss = tf.reduce_mean(xentropy, name="loss")
                
# Craete the optimizer 
    with tf.name_scope("train"): 
# Applying clip the gradients technique to lessen the exploding gradient problem in the LSTM
        threshold = 0.5
# Using Adam as optimizer because it is combines the ideas of Momentum optimization and RMSProp
# Adam is an adaptive learning rate algorithm, it requires less tuning of the learning rate hyperparameter η. 
# We can often use the default value η = 0.001
        optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
        grads_and_vars = optimizer.compute_gradients(loss)
        capped_gvs = [(tf.clip_by_value(grad, -threshold, threshold), var)
                                     for grad, var in grads_and_vars]
        training_op = optimizer.apply_gradients(capped_gvs)
        
# Evaluate the NN     
    with tf.name_scope("eval"):    
        correct = tf.nn.in_top_k(logits, labels_, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32), name="accuracy" )

print('Done..')

### Baching the dataset

In [None]:
def get_batches(x, y, batch_size=100):
    
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]
        
print('Done..')

# Execution phase_Execute the TF graph 

### Training the model

In [None]:
# initialize a name and file directory for TensorBoard
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = "tf_logs"
logdir = "{}/run-{}/".format(root_logdir, now)

# Create a Saver node
with graph.as_default():
    saver = tf.train.Saver()
    
# Create early stopping      
early_stopping = 0 

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer()) # initialize all variables
    
# Creates a node in the graph that will evaluate the reduce_mean value and write it to a TensorBoard 
# compatible binary log string called a summary 
    loss_summary = tf.summary.scalar("accuracy", accuracy)

# Creates a FileWriter that you will use to write summaries to logfiles in the log directory
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

    print('training ..')
    
    for epoch in range(epochs):
        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            x = x[:,np.newaxis,:]
            #x = x.reshape(-1, seq_len, n_features)
# Update the execution phase to evaluate the loss_summary node regularly during training(every 10 mini-batches)
            if ii % 10 == 0:
                summary_str = loss_summary.eval(feed_dict={inputs_: x, labels_: y, keep_prob: drop_out})
                step = epochs * batch_size + ii
                file_writer.add_summary(summary_str, step)
            sess.run(training_op, feed_dict={inputs_: x, labels_: y, keep_prob: drop_out})
        acc_train = accuracy.eval(feed_dict={inputs_: x, labels_: y, keep_prob: drop_out})
        for x, y in get_batches(val_x, val_y, batch_size):
            x = x[:,np.newaxis,:]
            #x = x.reshape(-1, seq_len, n_features)
            acc_val = accuracy.eval(feed_dict={inputs_: x, labels_: y, keep_prob: 1})
        print(epoch, "Train accuracy:", acc_train, "Validation accuracy:", acc_val)
        if acc_val >= early_stopping:
            # Save the best trained model
            saver.save(sess, "checkpoints/sentiment.ckpt")
            early_stopping = acc_val
                        
file_writer.close()

### Testing the model

In [None]:
with tf.Session(graph=graph) as sess:

    print('Loading the saved checkpoint..')
# Load the saved model 
    saver.restore(sess, "checkpoints/sentiment.ckpt")
    print('Testing..')
# Baching the test dataset
    for ii, (x, y) in enumerate(get_batches(test_x, test_y, batch_size), 1):
        x = x[:,np.newaxis,:]
        #x = x.reshape(-1, seq_len, n_features)
        acc_test = accuracy.eval(feed_dict={inputs_: x, labels_: y, keep_prob: 1})
    print("Test accuracy:", acc_test)