In [174]:
import boto3
import botocore

import json
import os
import pickle

import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
# Shared tokenizer: every run of word characters (regex \w+) becomes one token,
# so punctuation and whitespace only act as separators.
tokenizer = RegexpTokenizer(r'\w+')

In [175]:
# Name of the S3 bucket opened below.
BUCKET_NAME = 'ts-dev-cs-training-data'
# Key prefix handed to get_a_train_file_key() when listing documents.
PREFIX = 'rbp-research/data/e2e/2017-09-11/'
# NOTE(review): not referenced by any cell visible in this notebook; confirm it is still needed.
FILES_PREFIX = 'dumpData/'

# Despite its name, `s3` is the Bucket resource for BUCKET_NAME, not a service client.
s3 = boto3.resource('s3').Bucket(BUCKET_NAME)

In [165]:
def download_file(k, diskFilename):
    """Download object ``k`` from the module-level ``s3`` bucket to ``diskFilename``.

    A ``ClientError`` whose error code is "404" is reported on stdout and
    swallowed, so the function then returns normally even though the download
    failed; any other ``ClientError`` is re-raised.
    """
    try:
        s3.download_file(k, diskFilename)
    except botocore.exceptions.ClientError as err:
        error_code = err.response['Error']['Code']
        if error_code != "404":
            raise
        print("The object does not exist.")

def get_a_train_file_key(s3, prefix):
    """Lazily yield the ``key`` of every object the bucket lists for ``prefix``.

    Args:
        s3: bucket-like object exposing ``objects.filter(Prefix=..., Delimiter=...)``.
        prefix: key prefix passed to the listing call.

    Yields:
        The ``key`` attribute of each listed object, in listing order.
    """
    # Delimiter='/' is forwarded unchanged; presumably it keeps the listing to
    # objects directly under the prefix (confirm against the boto3 docs).
    listing = s3.objects.filter(Prefix=prefix, Delimiter='/')
    yield from (entry.key for entry in listing)
        
        
def parse_file(k):
    """Download the S3 object with key ``k`` and return its parsed JSON content.

    The object is written to the scratch file ``test.json`` in the current
    working directory; that file is replaced on every call.

    Args:
        k: S3 object key to download.

    Returns:
        The value produced by ``json.load`` for the downloaded file.

    Raises:
        FileNotFoundError: if the download did not produce the scratch file
            (``download_file`` only prints a message on a 404).
    """
    filename = 'test.json'
    # Remove any copy left by an earlier call: download_file() swallows 404s,
    # so a missing object would otherwise be parsed as the previous document.
    if os.path.exists(filename):
        os.remove(filename)
    download_file(k, filename)
    if not os.path.exists(filename):
        raise FileNotFoundError('No file was downloaded for S3 key {!r}'.format(k))
    # JSON is UTF-8 by specification; do not depend on the locale's default encoding.
    with open(filename, encoding='utf-8') as json_data:
        return json.load(json_data)

def is_valid_word(word):
    """Return True when ``word`` is a non-empty string containing an ASCII letter.

    Args:
        word: candidate token text. ``None`` and other non-string values are
            treated as invalid instead of raising.

    Returns:
        bool: False for non-strings, for the empty string, and for strings with
        no character in ``[a-zA-Z]`` (for example ``'16.01.17'``); True otherwise.
    """
    import re  # kept local: the notebook's import cell does not import re

    if not isinstance(word, str) or not word:
        return False
    return re.search('[a-zA-Z]', word) is not None

def parse_single_file(s):
    """Download one document and collect tokens from the outer thirds of its page.

    For every entry of the document's ``'describedWords'`` list, the entry's
    ``'bottom'`` value is divided by ``features['pageHeight']``. Entries whose
    ratio is below 1/3 or above 2/3 have their ``'text'`` lower-cased and split
    with the module-level ``tokenizer``, provided ``is_valid_word`` accepts it.
    Entries with missing or malformed data are skipped.

    Args:
        s: S3 key of the document, e.g. ``'.../<id>.training.json'``.

    Returns:
        tuple: ``(key, words)``. ``key`` is the third-from-last piece of ``s``
        after splitting on ``'.'`` and ``'/'`` (``<id>`` in the example above);
        ``words`` is the list of tokens in document order, duplicates kept.
    """
    j = parse_file(s)
    key = s.replace('.', '/').split('/')[-3]
    words = []
    # A document without a 'describedWords' entry simply contributes no words.
    for word_info in j.get('describedWords') or []:
        try:
            features = word_info.get('features')
            # presumably 'bottom' and 'pageHeight' share the same unit; verify against the data
            word_bottom = word_info.get('bottom')
            page_height = float(features.get('pageHeight'))
            relative_position = word_bottom / page_height

            if relative_position < 1.0 / 3 or relative_position > 2.0 / 3:
                word = word_info.get('text')

                if is_valid_word(word):
                    words += tokenizer.tokenize(word.lower())
        except (AttributeError, TypeError, ValueError, ArithmeticError):
            # Best effort: skip entries with missing or malformed fields. A bare
            # ``except`` here would also swallow KeyboardInterrupt/SystemExit.
            continue
    return key, words

In [166]:
# Lazy generator over the S3 keys under PREFIX; nothing is listed until it is iterated.
gen = get_a_train_file_key(s3, PREFIX)

In [167]:
# Pull the next key from the generator and show it.
n = next(gen)
print(n)
# Sanity-check the id extraction used by parse_single_file():
# third-from-last piece after splitting the key on '.' and '/'.
n.replace('.','/').split('/')[-3]

rbp-research/data/e2e/2017-09-11/00001593-256e-46f5-b0a1-bd0c09981c3a.training.json


'00001593-256e-46f5-b0a1-bd0c09981c3a'

In [168]:
# Parse the document behind key `n`; only the word list is needed here.
# NOTE(review): binding `_` looks like it shadows IPython's "last output" variable; confirm.
_, word_list = parse_single_file(n)
print(len(word_list))
# Bare last expression: displays every extracted token.
word_list

142


['registration',
 'kuehne',
 'no',
 'road',
 'signed',
 'our',
 'invoice',
 'trafford',
 'england',
 'fax',
 '1at',
 'of',
 'business',
 'trafford',
 'sole',
 'ltd',
 'info',
 'utopia',
 'tableware',
 'com',
 'chesterfield',
 'florence',
 'tableware',
 'to',
 'account',
 'no',
 'phone',
 'way',
 'chesterfield',
 'ocean',
 'vat',
 'to',
 'reg',
 'park',
 's42',
 'm',
 'utopia',
 'road',
 'shipment',
 'technical',
 'ltd',
 'road',
 '5uy',
 'the',
 'issued',
 'no',
 '0009',
 'meet',
 'uxbridge',
 'glass',
 'the',
 'tableware',
 '2ls',
 '5uy',
 'derbyshire',
 'of',
 'director',
 'notified',
 's42',
 'middlesex',
 'park',
 'despatch',
 'copied',
 'm17',
 'invoice',
 'road',
 'date',
 'invoice',
 'whse',
 'nagel',
 'spirit',
 'registered',
 'no',
 'body',
 'rockingham',
 'e',
 'mail',
 'the',
 'no',
 'park',
 'union',
 'ord',
 'tableware',
 'to',
 'page',
 '5uy',
 'estate',
 'no',
 'home',
 'this',
 'vat',
 'is',
 'date',
 '2014',
 '22',
 'eu',
 'no',
 'park',
 'notified',
 'works',
 'body',

In [181]:
# Build the per-document word lists and the flat list of all tokens.
# Set MAX_DOCS to a small integer for a quick trial run; None processes every key.
MAX_DOCS = None

gen = get_a_train_file_key(s3, PREFIX)

i = 0
sender_words_dict = {}  # key returned by parse_single_file() -> that document's tokens

lexicon_all = []  # every token of every document, duplicates included

for i, n in enumerate(gen, start=1):
    key, word_list = parse_single_file(n)
    sender_words_dict[key] = word_list

    lexicon_all += word_list

    if i % 1000 == 1:
        print('Doc number {}. lexicon length {}'.format(i, len(lexicon_all)))

    if MAX_DOCS is not None and i >= MAX_DOCS:
        break

# Context managers make sure both pickle files are flushed and closed.
with open('sender_words_dict.pickle', 'wb') as dict_file:
    pickle.dump(sender_words_dict, dict_file)
with open('lexicon_all.pickle', 'wb') as lexicon_file:
    pickle.dump(lexicon_all, lexicon_file)

Doc number 1. lexicon length 142
Doc number 1001. lexicon length 208520
Doc number 2001. lexicon length 393213
Doc number 3001. lexicon length 571619
Doc number 4001. lexicon length 777602
Doc number 5001. lexicon length 963956
Doc number 6001. lexicon length 1157962
Doc number 7001. lexicon length 1341242
Doc number 8001. lexicon length 1536156
Doc number 9001. lexicon length 1730414
Doc number 10001. lexicon length 1936677
Doc number 11001. lexicon length 2113356
Doc number 12001. lexicon length 2308220
Doc number 13001. lexicon length 2501513
Doc number 14001. lexicon length 2682624
Doc number 15001. lexicon length 2884595
Doc number 16001. lexicon length 3061323
Doc number 17001. lexicon length 3249505
Doc number 18001. lexicon length 3438725
Doc number 19001. lexicon length 3632700
Doc number 20001. lexicon length 3828292
Doc number 21001. lexicon length 4014558
Doc number 22001. lexicon length 4206671
Doc number 23001. lexicon length 4384602
Doc number 24001. lexicon length 45772

KeyboardInterrupt: 

In [182]:
# Persist whatever has been collected so far (e.g. after interrupting the loop);
# the context managers make sure both files are flushed and closed.
with open('sender_words_dict.pickle', 'wb') as dict_file:
    pickle.dump(sender_words_dict, dict_file)
with open('lexicon_all.pickle', 'wb') as lexicon_file:
    pickle.dump(lexicon_all, lexicon_file)

In [180]:
# Bare expression: displays the entire key -> word-list mapping.
sender_words_dict

{'00001593-256e-46f5-b0a1-bd0c09981c3a': ['registration',
  'kuehne',
  'no',
  'road',
  'signed',
  'our',
  'invoice',
  'trafford',
  'england',
  'fax',
  '1at',
  'of',
  'business',
  'trafford',
  'sole',
  'ltd',
  'info',
  'utopia',
  'tableware',
  'com',
  'chesterfield',
  'florence',
  'tableware',
  'to',
  'account',
  'no',
  'phone',
  'way',
  'chesterfield',
  'ocean',
  'vat',
  'to',
  'reg',
  'park',
  's42',
  'm',
  'utopia',
  'road',
  'shipment',
  'technical',
  'ltd',
  'road',
  '5uy',
  'the',
  'issued',
  'no',
  '0009',
  'meet',
  'uxbridge',
  'glass',
  'the',
  'tableware',
  '2ls',
  '5uy',
  'derbyshire',
  'of',
  'director',
  'notified',
  's42',
  'middlesex',
  'park',
  'despatch',
  'copied',
  'm17',
  'invoice',
  'road',
  'date',
  'invoice',
  'whse',
  'nagel',
  'spirit',
  'registered',
  'no',
  'body',
  'rockingham',
  'e',
  'mail',
  'the',
  'no',
  'park',
  'union',
  'ord',
  'tableware',
  'to',
  'page',
  '5uy',
  'e

In [115]:
import re

# Scratch check of the letter test used by is_valid_word(): a date-like string
# contains no ASCII letter, so the search finds nothing and this prints True.
print(re.search('[a-zA-Z]', '16.01.17') is None)

True


In [179]:
ls -alh

total 720
drwxr-xr-x  13 fuyangliu  staff   442B Sep 28 12:29 ./
drwxr-xr-x  39 fuyangliu  staff   1.3K Sep 27 14:33 ../
-rw-r--r--   1 fuyangliu  staff    76B Sep 27 14:35 .env
drwxr-xr-x  15 fuyangliu  staff   510B Sep 28 12:29 .git/
-rw-r--r--   1 fuyangliu  staff   1.1K Sep 27 14:37 .gitignore
drwxr-xr-x   4 fuyangliu  staff   136B Sep 28 11:17 .ipynb_checkpoints/
-rwxr-xr-x   1 fuyangliu  staff   364B Sep 27 14:34 init_python3_ml_env.sh*
-rw-r--r--   1 fuyangliu  staff    26K Sep 28 12:29 lexicon_all.pickle
drwxr-xr-x   8 fuyangliu  staff   272B Sep 27 14:35 p3ml-venv/
-rw-r--r--   1 fuyangliu  staff    66K Sep 28 12:28 senderClassifier-fuyang.ipynb
-rw-r--r--   1 fuyangliu  staff    13K Sep 28 11:17 senderClassifier.ipynb
-rw-r--r--   1 fuyangliu  staff    26K Sep 28 12:29 sender_words_dict.pickle
-rw-r--r--   1 fuyangliu  staff   206K Sep 28 12:29 test.json
