In [4]:
from google.cloud import storage
from google.cloud.storage import Blob

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import numpy as np

import json
import time


def DownloadWeights(modelname,bucket):
    '''Downloads all of the model dependencies from Google Cloud Storage
    
    Args:
        modelname (string) - Where in google cloud a particular models dependencies are stored.
        bucket (obj) - Which bucket is it in?
    Returns:
        Indx2Word (dict) - A mapping between numerical indicies and words (or characters).
        Word2Indx (dict) - A mapping between words (or characters) to a numerical index.
        Language2Indx (dict) - A mapping between languages to a numerical index.
        Indx2Language (dict) - An inverse of ^.
    '''
    client = storage.Client()
    bucket = client.bucket('...')
    
    blob = bucket.blob('models/nlp/language_classification/{}/lexicons/Indx2Word.json'.format(modelname))
    Indx2Word = json.loads(blob.download_as_string().decode())
    
    blob = bucket.blob('models/nlp/language_classification/{}/lexicons/Word2Indx.json'.format(modelname))
    Word2Indx = json.loads(blob.download_as_string().decode())
    
    blob = bucket.blob('models/nlp/language_classification/{}/lexicons/Language2Indx.json'.format(modelname))
    Language2Indx = json.loads(blob.download_as_string().decode())
    
    blob = bucket.blob('models/nlp/language_classification/{}/lexicons/Indx2Language.json'.format(modelname))
    Indx2Language = json.loads(blob.download_as_string().decode())
    
    blob = bucket.blob('models/nlp/language_classification/{}/model_weights/Weights.h5'.format(modelname))
    blob.download_to_filename('Weights.h5')
        
    return Indx2Word,Word2Indx,Language2Indx,Indx2Language
    
def CreateMatricies(data, Word2Indx, sequence_length):
    '''Creates a numerical representation (matrix) for our source words (or characters).

    Args:
        data (list) - A list of strings.
        Word2Indx (dict) - A mapping between words (or characters) to a numerical index.
        sequence_length (int) - How many tokens in a sequence?
    Returns:
        X (matrix) - A matrix containing the data needed by Keras/TF.
        '''
    X = []
    for row in data:
        sentence = [1]
        row = row.strip()
        for word in row:
            word = word.lower()
            if word in Word2Indx:
                sentence.append(Word2Indx[word])
            else:
                sentence.append(Word2Indx['<unk>'])
        sentence.append(Word2Indx['<end>'])
        X.append(sentence)
    X = pad_sequences(X,sequence_length,padding='post',truncating='post')
    for row_idx, row in enumerate(X):
        if 2 not in row:
            X[row_idx][-1] = 2
    return X
    

def ReturnModel(Word2Indx, Language2Indx, sequence_length, Embedding_Dim, Neurons):
    model = Sequential()
    model.add(Embedding(len(Word2Indx), Embedding_Dim, input_length = sequence_length))
    model.add(Bidirectional(LSTM(Neurons)))
    model.add(Dense(len(Language2Indx), activation='softmax'))
    model.compile('adam', 'categorical_crossentropy', metrics=['accuracy'])
    return model


def PredictBatch(data, threshold = 0.3, sequence_length = 50):
    '''Makes a batch prediction on an array of strings. Returns all (multiple languages given the nature of the text )
    of the languages where p > threshold.
    
    Args:
        data (list) - List of strings you want to classify
        threshold (float) - Probablity for language detection cut off
        sequence_length (int) - How many characters/words are each sequence padded to?
    Returns:
        predictions (pandas Series) - A pandas series containing the predicted languages per string in data.
    '''
    batch = CreateMatricies(data,Word2Indx,sequence_length)
    results = model.predict(batch)
    indicies = [np.where(r > threshold)[0] + 1 if sum(r > threshold) > 0  else '0' for r in results]
    predictions = [[Indx2Language[str(indx)] for indx in row] for row in indicies]
    return pd.Series(predictions)

In [2]:
client = storage.Client()
bucket = client.bucket('...')

Lookups = {'Afghanistan/json': 'fas',
           'Argentina/json':'spa',
           'Brazil/json':'por',
           'China/json':'zho',
           'Greece/json':'ell',
           'Qatar/json':'ara',
           'Russia/json':'rus',
           'Ukraine/json':'ukr',
           'North Korea/json':'kor',
           'USA/json':'eng'
}
Names = set()
X = []
Y = []
for kk,vv in Lookups.items():
    print('Starting {}...'.format(vv))
    count = 0
    while count < 100:
        for blob in bucket.list_blobs(prefix=kk):
            if count > 100:
                break
            data = json.loads(blob.download_as_string().decode())
            for d in data:
                Name = d['EntityName'].strip()
                if Name!='' and Name not in Names:
                    count = count + 1
                    X.append(Name)
                    Y.append(vv)
                    Names.add(Name)
                    if count > 100:
                        break

Starting fas...
Starting spa...
Starting por...
Starting zho...
Starting ell...
Starting ara...
Starting rus...
Starting ukr...
Starting kor...
Starting eng...


In [5]:
modelname = 'OneBidirectionalLSTM_Char_2019-04-12_128_50_250_512_0.5'

sequence_length = 50
Embedding_Dim = 250
Neurons = 512

Indx2Word,Word2Indx,Language2Indx,Indx2Language = DownloadWeights(modelname,bucket)
Indx2Language['0'] = 'Unknown'
model =  ReturnModel(Word2Indx, Language2Indx, sequence_length, Embedding_Dim, Neurons)
model.load_weights('Weights.h5')

Times = []
while len(Times)<10:
    t0 = time.time()
    Yhat = PredictBatch(X, threshold = 0.3, sequence_length = 50)
    t1 = time.time()
    Times.append(t1-t0)
    
Accuracy = []
for index,pred in enumerate(Yhat):
    if len(pred)==1:
        Accuracy.append(pred == Yhat[index])
    else:
        acc = False
        for p in pred:
            if p == Yhat[index]:
                acc = True
        Accuracy.append(acc)

AvgTime = np.mean(Times)
AvgPerUnitTime = len(X)/AvgTime
AvgAccuracy = np.mean(Accuracy)

print('Average Time {:.3f}'.format(AvgTime))
print('Average Rate {:.3f} per second'.format(AvgPerUnitTime))
print('Average Accuracy {:.3f}'.format(AvgAccuracy))


Average Time 8.767
Average Rate 115.201
Average Accuracy 0.985


In [6]:
modelname = 'OneBidirectionalLSTM_Char_2019-04-15_128_50_150_256_0.5'

sequence_length = 50
Embedding_Dim = 150
Neurons = 256

Indx2Word,Word2Indx,Language2Indx,Indx2Language = DownloadWeights(modelname,bucket)
Indx2Language['0'] = 'Unknown'
model =  ReturnModel(Word2Indx, Language2Indx, sequence_length, Embedding_Dim, Neurons)
model.load_weights('Weights.h5')

Times = []
while len(Times)<10:
    t0 = time.time()
    Yhat = PredictBatch(X, threshold = 0.3, sequence_length = 50)
    t1 = time.time()
    Times.append(t1-t0)
    
Accuracy = []
for index,pred in enumerate(Yhat):
    if len(pred)==1:
        Accuracy.append(pred == Yhat[index])
    else:
        acc = False
        for p in pred:
            if p == Yhat[index]:
                acc = True
        Accuracy.append(acc)

AvgTime = np.mean(Times)
AvgPerUnitTime = len(X)/AvgTime
AvgAccuracy = np.mean(Accuracy)

print('Average Time {:.3f}'.format(AvgTime))
print('Average Rate {:.3f} per second'.format(AvgPerUnitTime))
print('Average Accuracy {:.3f}'.format(AvgAccuracy))

Average Time 2.588
Average Rate 390.241 per second
Average Accuracy 0.976
