In [1]:
DATA_FOLDER = './sub_data_2/'

import matplotlib.pyplot as plt
%matplotlib inline

import pickle
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
# Load the feature matrix and the integer label list from DATA_FOLDER.
# The files are opened with `with` so the handles are closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(DATA_FOLDER + 'featureset.pickle', 'rb') as features_file:
    X = pickle.load(features_file)
with open(DATA_FOLDER + 'labels_int.pickle', 'rb') as labels_file:
    Y = pickle.load(labels_file)

In [3]:
# Two-stage split: hold out 30% of the data first, then halve the held-out part
# into a dev set and a test set. Both stages use the same fixed seed.
train_and_holdout = train_test_split(X, Y, test_size=0.3, random_state=42)
X_train_np, X_dev_test_np, y_train_np, y_dev_test_np = train_and_holdout

dev_and_test = train_test_split(
    X_dev_test_np, y_dev_test_np, test_size=0.5, random_state=42
)
X_dev_np, X_test_np, y_dev_np, y_test_np = dev_and_test

In [4]:
# Normalization
#
# The normalizer is fitted on the training split only. The transform calls
# below are commented out, so none of the splits is actually rescaled and the
# raw features are what gets converted to tensors later on.
# NOTE(review): the scratch cell further down has to transpose its input to get
# unit-norm feature columns out of Normalizer, which suggests it rescales each
# row (sample) rather than each feature — confirm before re-enabling these lines.

normalizer = preprocessing.Normalizer().fit(X_train_np)

#X_train_np = normalizer.transform(X_train_np)
#X_dev_np = normalizer.transform(X_dev_np)
#X_test_np = normalizer.transform(X_test_np)

In [5]:
print(len(X_train_np), len(X_dev_np), len(X_test_np))

47773 10237 10238


In [6]:
X_train_np.shape

(47773, 2920)

In [7]:
# Network dimensions: one input per feature column, one output per distinct label.
feature_length = X_train_np.shape[1]
class_length = np.unique(Y).size

print(feature_length, class_length)

2920 2699


In [8]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F

In [9]:
# Training data: float features and int64 class indices, both wrapped in Variables.
X_train = torch.from_numpy(X_train_np).float()
y_train = torch.LongTensor(y_train_np)
x = Variable(X_train)
y = Variable(y_train)

# Dev data: only the features are wrapped in a Variable; y_dev stays a plain
# LongTensor because it is only compared against predictions as a NumPy array.
X_dev = torch.from_numpy(X_dev_np).float()
y_dev = torch.LongTensor(y_dev_np)
x_dev = Variable(X_dev)

In [10]:
# Two-layer perceptron: features -> 200 hidden units (ReLU) -> log-softmax over
# the classes.
# NOTE(review): LogSoftmax is constructed without an explicit dim, exactly as
# before; newer PyTorch releases may want one — confirm against the installed version.
layers = [
    torch.nn.Linear(feature_length, 200),
    torch.nn.ReLU(),
    torch.nn.Linear(200, class_length),
    torch.nn.LogSoftmax(),
]
net = torch.nn.Sequential(*layers)
print(net)

Sequential (
  (0): Linear (2920 -> 200)
  (1): ReLU ()
  (2): Linear (200 -> 2699)
  (3): LogSoftmax ()
)


In [11]:
# Adam over all parameters of `net`, with a fixed learning rate of 0.02.
optimizer = torch.optim.Adam(net.parameters(), lr=0.02)
# Negative log-likelihood loss; it is fed the output of `net`, whose last layer
# is LogSoftmax.
loss_func = torch.nn.NLLLoss()

In [12]:
# Full-batch training: each iteration pushes the whole training set through the
# network once and takes a single optimizer step.
for t in range(40):
    out = net(x)                 # forward pass on the full training set
    loss = loss_func(out, y)     # arguments are (network output, target); targets are class indices, NOT one-hot

    optimizer.zero_grad()   # clear gradients left over from the previous step
    loss.backward()         # backpropagation, compute gradients
    optimizer.step()        # apply gradients

    # Report on the first three iterations, then on every fifth one.
    if t % 5 == 1 or t in [0, 2]:
        # Training predictions come from `out`, which was computed BEFORE this
        # iteration's update, whereas the dev predictions below are computed
        # AFTER it. The two accuracies are therefore one step apart.
        _, prediction = torch.max(out, 1)
        pred_y = prediction.data.numpy().squeeze()
        target_y = y.data.numpy()

        out_dev = net(x_dev)
        _, prediction_dev = torch.max(out_dev, 1)
        pred_y_dev = prediction_dev.data.numpy().squeeze()
        target_y_dev = y_dev.numpy()

        # Fraction of matching labels. np.mean over the boolean array gives the
        # same value as sum(...)/len(...) without looping over elements in Python.
        acc_train = np.mean(pred_y == target_y)
        acc_dev = np.mean(pred_y_dev == target_y_dev)

        # NOTE(review): `loss.data[0]` is kept as written; reading a scalar loss
        # this way may depend on the installed PyTorch version — confirm.
        print('Epoch{0:3d} -> Loss={1:.6f}, Train Acc={2:.3f}, Dev Acc={3:.3f}'.format(t+1, loss.data[0], acc_train, acc_dev))


Epoch  1 -> Loss=7.912150, Train Acc=0.000, Dev Acc=0.278
Epoch  2 -> Loss=5.771262, Train Acc=0.287, Dev Acc=0.418
Epoch  3 -> Loss=4.177922, Train Acc=0.432, Dev Acc=0.583
Epoch  7 -> Loss=1.454784, Train Acc=0.801, Dev Acc=0.810
Epoch 12 -> Loss=0.488628, Train Acc=0.927, Dev Acc=0.900
Epoch 17 -> Loss=0.196956, Train Acc=0.962, Dev Acc=0.929
Epoch 22 -> Loss=0.103036, Train Acc=0.978, Dev Acc=0.940
Epoch 27 -> Loss=0.063277, Train Acc=0.986, Dev Acc=0.942
Epoch 32 -> Loss=0.042062, Train Acc=0.989, Dev Acc=0.947
Epoch 37 -> Loss=0.030290, Train Acc=0.993, Dev Acc=0.948


KeyboardInterrupt: 

In [165]:
from sklearn import preprocessing
import numpy as np
# Scratch check of what Normalizer does to a small matrix.
# The matrix gets its own name so that running this cell does not overwrite the
# training tensor `X_train` built earlier in the notebook.
X_demo = np.array([[ 51, -1.,  2000.],
                   [ 50.,  2.,  2500.],
                   [ 30.,  1., 200.]])

# Transposing on the way in and out makes every COLUMN of X_demo unit-norm
# (see the printed result: the squares of each column sum to 1).
scaler = preprocessing.Normalizer().fit(X_demo.T)
scaler.transform(X_demo.T).T

array([[ 0.65835231, -0.40824829,  0.62347969],
       [ 0.64544344,  0.81649658,  0.77934961],
       [ 0.38726606,  0.40824829,  0.06234797]])

## Test accuracy:

In [71]:
from random import randint
print()

# Evaluate the trained network on the held-out test split.
X_test = torch.from_numpy(X_test_np).float()
y_test = torch.LongTensor(y_test_np)

# Only the features need to be wrapped; y_test stays a plain LongTensor.
x_test = Variable(X_test)

out_test = net(x_test)
_, prediction_test = torch.max(out_test, 1)
# Disabled random-guess baseline, kept for reference.
# NOTE(review): it draws from the number of test samples, not the number of
# classes — confirm the intended range before re-enabling.
#pred_y_test = [randint(0, y_test.shape[0]) for x in range(y_test.shape[0])]
pred_y_test = prediction_test.data.numpy().squeeze()
target_y_test = y_test.numpy()

# Fraction of correct predictions. np.mean over the boolean array replaces the
# element-by-element Python sum(...)/len(...) and gives the same value.
acc_test = np.mean(pred_y_test == target_y_test)
print(acc_test)


0.948427427232


## Test with an example

In [62]:
y_test_np[0:5]

[394, 2685, 2579, 1584, 779]

In [None]:
# Load the lookup tables used to inspect individual predictions.
# The files are opened with `with` so the handles are closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(DATA_FOLDER + 'doc_sender_dict.pickle', 'rb') as doc_sender_file:
    doc_sender_dict = pickle.load(doc_sender_file)
with open(DATA_FOLDER + 'sender_id_all.pickle', 'rb') as sender_ids_file:
    sender_id_all = pickle.load(sender_ids_file)

In [None]:
# 1584 is the fourth label printed from y_test_np[0:5] above.
# NOTE(review): presumably sender_id_all maps an integer label to a sender id — confirm.
sender_id = sender_id_all[1584]
print(sender_id)

In [None]:
# Print the id of every document whose entry in doc_sender_dict equals sender_id.
print('Documents of sender {} are:'.format(sender_id))
for doc_id, _sender_id in doc_sender_dict.items():
    if _sender_id != sender_id:
        continue
    print(doc_id)

In [22]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

# Load the lexicon used to turn words into feature indices.
# Opened with `with` so the file handle is closed right after loading.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(DATA_FOLDER + 'lexicon.pickle', 'rb') as lexicon_file:
    lexicon = pickle.load(lexicon_file)

def is_valid_word(word):
    """Return True if ``word`` is non-empty text containing an ASCII letter.

    Args:
        word: Candidate token text. ``None`` and the empty string are rejected
            instead of raising.

    Returns:
        bool: True when at least one character of ``word`` matches ``[a-zA-Z]``.
    """
    import re  # function-local, as the notebook does not import re at the top

    if not word:
        return False
    return re.search('[a-zA-Z]', word) is not None

def parse_single_file(j):
    """Collect lower-cased tokens from words in the outer thirds of a page.

    A word is kept when its relative position (``bottom`` divided by
    ``pageHeight``) is below 1/3 or above 2/3, and ``is_valid_word`` accepts
    its text. Kept words are lower-cased and split by the module-level
    ``tokenizer``.

    Args:
        j: Parsed document dict. Each entry of ``j['describedWords']`` is read
            for ``'text'``, ``'bottom'`` and ``'features'['pageHeight']``.

    Returns:
        list: Tokens in document order. Entries with missing or malformed
        fields are skipped; a missing ``'describedWords'`` key yields ``[]``.
    """
    words = []
    # `or []` treats a missing 'describedWords' key as a document without words.
    for word_info in j.get('describedWords') or []:
        features = word_info.get('features')
        try:
            word_bottom = word_info.get('bottom')
            page_height = float(features.get('pageHeight'))
            relative_position = word_bottom / page_height

            if relative_position < 1.0 / 3 or relative_position > 2.0 / 3:
                word = word_info.get('text')

                if is_valid_word(word):
                    words += tokenizer.tokenize(word.lower())
        except (AttributeError, TypeError, ValueError, ZeroDivisionError):
            # Best effort: skip entries whose fields are missing, of the wrong
            # type, non-numeric, or have a zero page height. Unlike a bare
            # `except`, this does not swallow KeyboardInterrupt or other
            # unrelated errors.
            continue
    return words

def get_sender_name_via_id(doc_id):
    """Fetch a document's ``.training`` object from S3 and return its sender.

    Args:
        doc_id: Document identifier; it is spliced into the S3 object key.

    Returns:
        The value stored under ``'sender'`` in the decoded JSON document.
    """
    import boto3
    import botocore
    import json

    bucket_name = 'ts-dev-cs-training-data'
    key_prefix = 'cache_v2_individual/ubl1/hocr13/ubllight1/hocrlight1/sendertrainingmatrices20170819/data/'
    s3_object = boto3.resource('s3').Object(bucket_name, key_prefix + doc_id + '.training')

    # The object body is UTF-8 encoded JSON.
    raw_body = s3_object.get()['Body'].read()
    return json.loads(raw_body.decode('utf-8'))['sender']

def get_feature_via_words(words):
    """Build a bag-of-words count vector over the module-level ``lexicon``.

    Args:
        words: Iterable of tokens. Tokens that are not in ``lexicon`` are ignored.

    Returns:
        numpy.ndarray: Float array of shape ``(1, len(lexicon))`` where entry
        ``[0, i]`` counts how often ``lexicon[i]`` occurs in ``words``. If the
        lexicon contains duplicates, counts go to the first occurrence, as
        ``lexicon.index`` would do.
    """
    # Map each lexicon entry to its first position once, instead of scanning
    # the lexicon twice per word (`in` followed by `.index`).
    first_index = {}
    for position, entry in enumerate(lexicon):
        first_index.setdefault(entry, position)

    features = np.zeros(len(lexicon))
    for word in words:
        position = first_index.get(word)
        if position is not None:
            features[position] += 1
    return np.array([features])

def get_words_via_id(doc_id):
    """Download a document's ``.training.json`` from S3 and return its lexicon words.

    The document is parsed with ``parse_single_file``; only tokens that also
    occur in the module-level ``lexicon`` are returned.

    Args:
        doc_id: Document identifier; it is spliced into the S3 object key.

    Returns:
        list: Tokens from the document that are in ``lexicon``, in their
        original order and with repeats preserved.
    """
    import boto3
    import botocore
    import json

    bucket_name = 'ts-dev-cs-training-data'
    object_key = 'rbp-research/data/e2e/2017-09-11/' + doc_id + '.training.json'

    # The object body is UTF-8 encoded JSON.
    response = boto3.resource('s3').Object(bucket_name, object_key).get()
    doc_json = json.loads(response['Body'].read().decode('utf-8'))

    return [word for word in parse_single_file(doc_json) if word in lexicon]


In [None]:
# End-to-end check on one document: print its recorded sender, then run its
# words through the trained network and print the predicted class index.
doc_id = '0da51d45-2069-406c-ae3c-631c62bef891'

print(get_sender_name_via_id(doc_id))

# Words of the document that also occur in the lexicon.
words = get_words_via_id(doc_id)
#words = ['paletten', 'paletten', 'rechnungsbetrag']
print(words)

# Count vector with one row, so it can be fed to the network as a batch of one.
f = get_feature_via_words(words)

f_tensor = torch.from_numpy(f).float()
f_variable = Variable(f_tensor)

# NOTE: this rebinds `out`, `prediction` and `pred_y`, which the training cell
# also uses.
out = net(f_variable)

# Index of the highest-scoring class for the single input row.
_, prediction = torch.max(out, 1)
pred_y = prediction.data.numpy().squeeze()
print(pred_y)