In [177]:
from __future__ import unicode_literals, print_function, division

import os
from io import open
import sys
import math
import random
import argparse
import operator
import pdb

import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


from collections import defaultdict
from collections import Counter

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Kyle's attempt
import faker
from faker import Faker
import pandas as pd
import numpy as np
import re
from string import punctuation
import glob
import unicodedata
import string
import random
import time

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Shared Faker generator instance. NOTE(review): none of the cells visible in
# this notebook reference `fake` -- confirm it is still needed before keeping it.
fake = Faker()

In [178]:
# Need to have the class of the model in local memory to load a saved model in pytorch
class LSTMClassifier(nn.Module):
    """Single-layer LSTM classifier over padded batches of token ids.

    Pipeline: embedding -> packed LSTM -> dropout on the final hidden state
    -> linear projection -> log-softmax over ``output_size`` classes.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_size: number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(LSTMClassifier, self).__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1)

        self.hidden2out = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

        self.dropout_layer = nn.Dropout(p=0.2)

    def init_hidden(self, batch_size):
        """Return a fresh ``(h0, c0)`` pair, each of shape (1, batch_size, hidden_dim).

        The state is drawn from a standard normal on every call, so two forward
        passes over the same input only match if the torch RNG is seeded.
        The tensors are created on the same device and with the same dtype as
        the model weights, so the model keeps working after ``.to(device)``,
        ``.cuda()`` or ``.double()``.
        """
        reference = self.embedding.weight
        state_shape = (1, batch_size, self.hidden_dim)
        h0 = torch.randn(state_shape, device=reference.device, dtype=reference.dtype)
        c0 = torch.randn(state_shape, device=reference.device, dtype=reference.dtype)
        return (h0, c0)

    def forward(self, batch, lengths):
        """Classify a padded batch.

        Args:
            batch: integer tensor of token ids laid out as (seq_len, batch_size);
                the last dimension is used as the batch size.
            lengths: true length of every sequence in the batch, in the order
                expected by ``pack_padded_sequence`` (longest first).

        Returns:
            Tensor of shape (batch_size, output_size) with log-probabilities.
        """
        # Kept as an attribute because the original implementation exposed it.
        self.hidden = self.init_hidden(batch.size(-1))

        embeds = self.embedding(batch)
        packed_input = pack_padded_sequence(embeds, lengths)
        outputs, (ht, ct) = self.lstm(packed_input, self.hidden)
        # ht holds the last hidden state of every sequence:
        #   ht     -> (1, batch_size, hidden_dim)
        #   ht[-1] -> (batch_size, hidden_dim)
        output = self.dropout_layer(ht[-1])
        output = self.hidden2out(output)
        output = self.softmax(output)

        return output

In [179]:
        
class PaddedTensorDataset(Dataset):
    """Dataset wrapping data, target and length tensors plus the raw values.

    Every sample is obtained by indexing all four containers with the same
    index along their first dimension.

    Arguments:
        data_tensor (Tensor): contains sample data.
        target_tensor (Tensor): contains sample targets (labels).
        length_tensor (Tensor): contains sample lengths.
        raw_data (Any): the data that was turned into tensors; handy for debugging.
    """

    def __init__(self, data_tensor, target_tensor, length_tensor, raw_data):
        # The three tensors must describe the same number of samples.
        n_rows = data_tensor.size(0)
        assert n_rows == target_tensor.size(0) == length_tensor.size(0)

        self.data_tensor = data_tensor
        self.target_tensor = target_tensor
        self.length_tensor = length_tensor
        self.raw_data = raw_data

    def __len__(self):
        return self.data_tensor.size(0)

    def __getitem__(self, index):
        fields = (self.data_tensor, self.target_tensor, self.length_tensor, self.raw_data)
        return tuple(field[index] for field in fields)

In [181]:
class DF_To_Tensors():
    """Encodes string values as padded id tensors and classifies them with a model.

    Typical use: ``predictions(model, path_to_csv)`` reads a CSV, samples a few
    values from every column, runs the model on each value and reports the
    per-value and averaged predictions per column.

    Attributes:
        tag2id: category name -> class index (31 categories).
        n_categories: number of categories in ``tag2id`` at construction time.
        token_set: set of characters considered part of the alphabet.
        token2id: character -> embedding index ('PAD' = 0, 'UNK' = 1).
        df: the last DataFrame loaded by ``read_in_csv``.
        last_predictions: result of the most recent ``predictions`` call.
    """

    def __init__(self):
        # Class indices follow the order of this tuple (0..30).
        tag_names = (
            'city', 'first_name', 'geo', 'percent', 'year', 'ssn',
            'language_name', 'country_name', 'phone_number', 'month',
            'zipcode', 'iso8601', 'paragraph', 'pyfloat', 'email', 'prefix',
            'pystr', 'isbn', 'boolean', 'country_code', 'country_GID',
            'continent', 'date', 'day_of_month', 'day_of_week',
            'date_long_dmdy', 'date_long_mdy', 'date_long_dmdyt',
            'date_long_mdyt_m', 'city_suffix', 'month_name',
        )
        self.tag2id = defaultdict(int, {tag: idx for idx, tag in enumerate(tag_names)})
        self.n_categories = len(self.tag2id)

        self.token_set = set(string.ascii_letters + string.digits
                             + "',.;*!@#$%^&()_=-:+/\\")

        # The ids below reproduce the original hand-written table so that
        # existing encodings stay stable:
        #   * 'M' has no entry and therefore encodes as 'UNK';
        #   * '!' and '@' share id 68, and id 69 is unused.
        # The only change: the table used to contain a second '*' entry with
        # id 84, which overrode id 67 and lies outside an 84-row embedding
        # (valid indices 0..83). '*' now keeps its in-range id 67.
        # NOTE(review): a model trained with the old table never saw id 67 --
        # consider retraining if '*' matters.
        token2id = {'PAD': 0, 'UNK': 1}
        consecutive = (string.ascii_lowercase                    # 2..27
                       + string.ascii_uppercase.replace('M', '')  # 28..52
                       + '1234567890'                             # 53..62
                       + "',.;*!")                                # 63..68
        for token_id, char in enumerate(consecutive, start=2):
            token2id[char] = token_id
        token2id['@'] = 68
        for token_id, char in enumerate('#$%^&()_=-:+/\\', start=70):  # 70..83
            token2id[char] = token_id
        self.token2id = defaultdict(int, token2id)

    def _tag_from_id(self, idx):
        """Return the first category whose class index equals ``idx``, else None."""
        idx = int(idx)
        for tag, tag_id in self.tag2id.items():
            if tag_id == idx:
                return tag
        return None

    def vectorized_string(self, string):
        """Encode ``str(string)`` character by character; unknown characters -> 'UNK'."""
        unknown_id = self.token2id['UNK']
        # .get() avoids inserting unseen characters into the defaultdict.
        return [self.token2id.get(char, unknown_id) for char in str(string)]

    def vectorized_array(self, array):
        """Encode every value of ``array`` with ``vectorized_string``."""
        return [self.vectorized_string(value) for value in array]

    def pad_sequences(self, vectorized_seqs, seq_lengths):
        """Right-pad the id sequences with 0 ('PAD') into one LongTensor.

        Args:
            vectorized_seqs: list of id lists.
            seq_lengths: LongTensor with the length of every sequence.

        Returns:
            LongTensor of shape (len(vectorized_seqs), max length).
        """
        seq_tensor = torch.zeros((len(vectorized_seqs), int(seq_lengths.max()))).long()
        for row, (seq, seq_len) in enumerate(zip(vectorized_seqs, seq_lengths.tolist())):
            seq_tensor[row, :seq_len] = torch.LongTensor(seq)
        return seq_tensor

    def create_dataset(self, data, batch_size=1):
        """Wrap ``data`` (an iterable of values) in a DataLoader.

        Each batch yields ``(padded ids, targets, lengths, raw values)``.
        Targets are looked up in ``tag2id``; values that are not category
        names get target 0, exactly as before, but the lookup no longer adds
        them to ``tag2id`` as a side effect.
        """
        vectorized_seqs = self.vectorized_array(data)
        seq_lengths = torch.LongTensor([len(seq) for seq in vectorized_seqs])
        seq_tensor = self.pad_sequences(vectorized_seqs, seq_lengths)
        target_tensor = torch.LongTensor([self.tag2id.get(value, 0) for value in data])
        raw_data = list(data)

        dataset = PaddedTensorDataset(seq_tensor, target_tensor, seq_lengths, raw_data)
        return DataLoader(dataset, batch_size=batch_size)

    def sort_batch(self, batch, targets, lengths):
        """Sort a batch by descending length and put the time dimension first.

        Returns:
            (sequences transposed to (seq_len, batch), targets, sorted lengths)
        """
        seq_lengths, perm_idx = lengths.sort(0, descending=True)
        seq_tensor = batch[perm_idx]
        target_tensor = targets[perm_idx]

        return seq_tensor.transpose(0, 1), target_tensor, seq_lengths

    def evaluate_test_set(self, model, test):
        """Run ``model`` on every value of ``test``, one value per batch.

        Args:
            model: callable taking ``(batch, lengths)`` and returning a
                (1, n_classes) score tensor.
            test: sequence of values to classify.

        Returns:
            One dict per value with keys 'top_pred' (category name, or None
            if the predicted index has no category), 'tensor' (model output)
            and 'pred_idx' (argmax as a tensor). Empty input gives ``[]``.
        """
        if len(test) == 0:
            return []

        results = []
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            for batch, targets, lengths, raw_data in self.create_dataset(test, batch_size=1):
                batch, targets, lengths = self.sort_batch(batch, targets, lengths)
                pred = model(batch, lengths.cpu().numpy())
                pred_idx = torch.max(pred, 1)[1]
                results.append({
                    'top_pred': self._tag_from_id(pred_idx[0]),
                    'tensor': pred,
                    'pred_idx': pred_idx,
                })
        return results

    def read_in_csv(self, path):
        """Load the CSV at ``path`` into ``self.df``."""
        self.df = pd.read_csv(path)

    def get_arrayOfValues_df(self, n_samples=9):
        """Sample values from every column of ``self.df``.

        Args:
            n_samples: number of values drawn per column (with replacement,
                one ``np.random.choice`` call per value). Defaults to 9.

        Returns:
            Dict mapping column name -> list of sampled values as strings.
        """
        return {
            column: [str(np.random.choice(self.df[column])) for _ in range(n_samples)]
            for column in self.df.columns
        }

    def averaged_predictions(self, all_predictions):
        """Average the model outputs of several values element-wise.

        Args:
            all_predictions: list as returned by ``evaluate_test_set``.

        Returns:
            Dict with 'averaged_tensor' (numpy array) and
            'averaged_top_category' (category at the argmax of the average).
            Both are None when ``all_predictions`` is empty.
        """
        if not all_predictions:
            return {"averaged_tensor": None, 'averaged_top_category': None}

        # .cpu() keeps this working when the model output lives on another device.
        outputs = [pred['tensor'].detach().cpu().numpy() for pred in all_predictions]
        out = np.mean(outputs, axis=0)

        return {
            "averaged_tensor": out,
            'averaged_top_category': self._tag_from_id(np.argmax(out))
        }

    def predictions(self, model, path_to_csv):
        """Classify every column of a CSV file.

        Args:
            model: model passed on to ``evaluate_test_set``.
            path_to_csv: path of the CSV file to read.

        Returns:
            List with one dict per column: 'column', 'values' (sampled
            strings), 'avg_predictions' and 'model_predictions'. The same
            list is kept on ``self.last_predictions``.
        """
        self.read_in_csv(path=path_to_csv)
        column_value_object = self.get_arrayOfValues_df()

        results = []
        for column, values in column_value_object.items():
            all_predictions = self.evaluate_test_set(model, values)
            results.append({
                'column': column,
                'values': values,
                'avg_predictions': self.averaged_predictions(all_predictions),
                'model_predictions': all_predictions
            })

        # Stored under a separate name: assigning to ``self.predictions``
        # would replace this method on the instance and break the next call.
        self.last_predictions = results
        return results


In [187]:
# Create the converter and load the saved model.
# NOTE: torch.load unpickles a complete model object here. That requires the
# LSTMClassifier class to be defined in this session, and unpickling can run
# arbitrary code -- only load checkpoints that come from a trusted source.
# map_location='cpu' keeps the weights on the CPU, which is where
# DF_To_Tensors builds its input tensors, even if the checkpoint was saved
# from a GPU session.
dft_tensor=DF_To_Tensors()
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.03.pth', map_location='cpu')
model2.eval()

LSTMClassifier(
  (embedding): Embedding(84, 128)
  (lstm): LSTM(128, 32)
  (hidden2out): Linear(in_features=32, out_features=31, bias=True)
  (softmax): LogSoftmax(dim=1)
  (dropout_layer): Dropout(p=0.2, inplace=False)
)

In [183]:
# Classify every column of the Africa test CSV with the loaded model.
preds2 = dft_tensor.predictions(
    model=model2,
    path_to_csv='datasets/data/africa_test.csv',
)
preds2

[{'column': 'fid', 'values': ['74', '18', '63', '65', '65', '30', '23', '7', '55'], 'avg_predictions': {'averaged_tensor': array([[-10.788871  , -10.494506  , -10.303161  ,  -7.91339   ,
         -5.21803   , -11.681477  ,  -8.329886  ,  -9.050607  ,
        -10.108499  ,  -1.9565302 ,  -7.057903  ,  -9.5367155 ,
        -15.821407  , -11.807798  , -10.538514  , -16.338898  ,
        -10.174675  , -10.465824  , -17.919344  ,  -9.250635  ,
        -12.091434  , -12.061408  ,  -8.1639185 ,  -0.32153168,
        -14.977112  ,  -8.228518  , -10.286186  , -10.116358  ,
        -10.592918  , -14.308406  , -13.515233  ]], dtype=float32), 'averaged_top_category': 'day_of_month'}, 'model_predictions': [{'top_pred': 'day_of_month', 'tensor': tensor([[ -7.8788, -10.8678, -10.5702,  -6.5144,  -5.6173,  -9.6748,  -7.9067,
          -9.8661, -10.0517,  -1.3489,  -7.0374,  -6.8425, -16.1601, -11.9128,
         -10.5351, -16.8671,  -9.3043, -10.5509, -16.1444,  -8.7825, -11.6716,
         -13.5191,  -

In [186]:
# Classify every column of the four-column test CSV.
# The loaded model is bound to `model2`; no `model` variable is defined in
# this notebook, so referencing it fails on a fresh kernel.
preds3 = dft_tensor.predictions(
    model=model2,
    path_to_csv='datasets/data/four_col_test.csv',
)
preds3

[{'column': 'iso3', 'values': ['SLV', 'ARE', 'NGA', 'IRQ', 'DMA', 'RUS', 'CRI', 'IRN', 'ZAF'], 'avg_predictions': {'averaged_tensor': array([[-15.489687  , -10.634596  , -14.069369  , -14.052122  ,
        -14.437083  , -13.246917  , -13.529269  , -12.18584   ,
        -12.190811  , -11.929234  , -12.029672  , -13.091918  ,
        -15.939815  , -16.643188  , -15.408449  , -11.590191  ,
         -6.9313097 , -15.311035  , -12.981564  ,  -2.3261125 ,
         -0.18979633,  -6.996067  , -13.499749  , -12.922486  ,
        -12.478847  , -11.649639  , -13.796424  , -14.8552685 ,
        -12.671888  , -17.875643  , -11.405562  ]], dtype=float32), 'averaged_top_category': 'country_GID'}, 'model_predictions': [{'top_pred': 'country_GID', 'tensor': tensor([[-15.3560,  -8.9228, -12.2865, -14.1857, -13.1579, -11.6872, -13.3790,
         -12.7212, -10.1936, -14.4785, -10.3313, -14.4151, -16.0978, -14.1855,
         -15.5876, -10.8290,  -8.0176, -15.3312, -13.0326,  -1.3645,  -0.2961,
          -8

In [188]:
# Classify every column of the cshape Africa CSV.
# The loaded model is bound to `model2`; no `model` variable is defined in
# this notebook, so referencing it fails on a fresh kernel.
preds4 = dft_tensor.predictions(
    model=model2,
    path_to_csv='datasets/data/cshape_africa.csv',
)
preds4

[{'column': 'country', 'values': ['Zambia', 'Liberia', 'Rwanda', 'Egypt', 'Saudi Arabia', 'Guinea-Bissau', 'The Gambia', 'Israel', 'Equatorial Guinea'], 'avg_predictions': {'averaged_tensor': array([[ -4.430699  ,  -3.4931073 , -18.022495  , -14.277381  ,
        -12.642521  , -14.933206  ,  -2.9969385 ,  -0.55397224,
        -15.657098  ,  -9.320038  , -17.297201  , -13.390749  ,
        -10.028104  , -18.216972  , -11.068612  , -16.259323  ,
         -8.840366  , -14.575265  , -10.401575  , -12.39828   ,
        -15.327362  ,  -9.153301  , -15.053763  ,  -8.945912  ,
         -9.090069  ,  -9.435516  , -14.266641  , -13.459404  ,
        -14.361956  , -10.552912  , -10.768927  ]], dtype=float32), 'averaged_top_category': 'country_name'}, 'model_predictions': [{'top_pred': 'country_name', 'tensor': tensor([[ -6.0386,  -2.8219, -16.5380, -14.3044, -13.9985, -14.2122,  -2.0446,
          -0.2139, -14.9084, -10.2137, -17.6548, -12.0147,  -9.5135, -16.7261,
         -10.3072, -15.3954,  -

In [189]:
# Just test a random array by itself.
# Uses `model2`, the model loaded above; no `model` variable is defined in
# this notebook, so referencing it fails on a fresh kernel.
dft_tensor.evaluate_test_set(
    model=model2,
    test=['01/02/2020', '09', 'USA', 'WOWOOWOWOWOWOWOWO', '1999', '1200', '402', '.3934', 'iiii', '12', 'USA', "ETH", 'South America'],
)

[{'top_pred': 'date',
  'tensor': tensor([[-1.8405e+01, -1.3475e+01, -1.0843e+01, -1.3926e+01, -9.8927e+00,
           -4.1174e+00, -1.3473e+01, -1.5847e+01, -8.4487e+00, -7.4543e+00,
           -1.0893e+01, -6.7751e+00, -2.0338e+01, -1.3331e+01, -1.9767e+01,
           -1.5002e+01, -1.4928e+01, -8.4395e+00, -1.4448e+01, -8.6970e+00,
           -1.0094e+01, -1.1057e+01, -1.9654e-02, -8.0456e+00, -1.5556e+01,
           -9.3909e+00, -9.4819e+00, -8.5697e+00, -1.0287e+01, -1.7160e+01,
           -1.4445e+01]], grad_fn=<LogSoftmaxBackward>),
  'pred_idx': tensor([22])},
 {'top_pred': 'month',
  'tensor': tensor([[-10.4091, -12.7464, -12.3732,  -7.0635,  -5.8465, -16.1444,  -7.0775,
            -9.0475, -13.9970,  -0.5013, -11.1098, -11.0008, -14.9622, -12.5449,
           -10.6254, -17.3869, -10.4383, -12.3830, -18.6079, -12.5750, -15.1814,
           -11.7973, -10.2138,  -0.9437, -16.1655,  -9.1353, -11.2784,  -9.8058,
           -12.1363, -15.2351, -16.6754]], grad_fn=<LogSoftmaxBackwar

In [41]:

# for later use
def randomChoice(self, values):
    """Return one element of ``values`` picked uniformly at random.

    ``self`` is accepted but not used; the function is written in method form
    although it is defined at module level here.
    """
    last_index = len(values) - 1
    picked = random.randint(0, last_index)
    return values[picked]

def getRandomSet(self):
    """Pick a random category, then a random value belonging to it.

    Reads ``self.all_categories`` and ``self.category_values[category]['obj']``
    and draws from both with ``self.randomChoice``.

    Returns:
        Tuple ``(line, category)``.
    """
    chosen_category = self.randomChoice(self.all_categories)
    candidates = list(self.category_values[chosen_category]['obj'])
    return (self.randomChoice(candidates), chosen_category)