In [140]:
from __future__ import unicode_literals, print_function, division

import os
from io import open
import sys
import math
import random
import argparse
import operator
import pdb

import torch
import torch.autograd as autograd
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


from collections import defaultdict
from collections import Counter

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Kyle's attempt
import faker
from faker import Faker
import pandas as pd
import numpy as np
import re
from string import punctuation
import glob
import unicodedata
import string
import random
import time
import dateutil.parser
import datetime
import arrow
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

fake = Faker()

In [141]:
# Need to have the class of the model in local memory to load a saved model in pytorch
class LSTMClassifier(nn.Module):
    """Single-layer LSTM classifier over sequences of token ids.

    The network embeds each token id, runs the packed sequence through one
    LSTM layer, applies dropout to the final hidden state and projects it to
    ``output_size`` log-probabilities (``LogSoftmax`` over dim 1).

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        output_size: number of output classes.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_size):
        super(LSTMClassifier, self).__init__()

        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size

        # Sub-module attribute names are part of the saved state dict keys;
        # they must not be renamed or previously saved models will not load.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=1)

        self.hidden2out = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

        self.dropout_layer = nn.Dropout(p=0.2)

    def init_hidden(self, batch_size):
        """Return a random initial ``(h0, c0)`` pair of shape (1, batch_size, hidden_dim).

        The state is created on the same device and with the same dtype as the
        embedding weights, so the model keeps working after ``.to(device)`` /
        ``.double()`` (previously it was always CPU/float32).
        """
        weight = self.embedding.weight
        shape = (1, batch_size, self.hidden_dim)
        h0 = torch.randn(shape, device=weight.device, dtype=weight.dtype)
        c0 = torch.randn(shape, device=weight.device, dtype=weight.dtype)
        return (h0, c0)

    def forward(self, batch, lengths):
        """Classify a padded batch of sequences.

        Args:
            batch: LongTensor of token ids laid out as (seq_len, batch_size);
                the batch size is read from the last dimension.
            lengths: true length of every sequence, passed straight to
                ``pack_padded_sequence`` (which by default expects them in
                decreasing order).

        Returns:
            Tensor of shape (batch_size, output_size) holding log-probabilities.
        """
        # A fresh random state is drawn on every call and kept on the instance.
        self.hidden = self.init_hidden(batch.size(-1))

        embeds = self.embedding(batch)
        packed_input = pack_padded_sequence(embeds, lengths)
        outputs, (ht, ct) = self.lstm(packed_input, self.hidden)
        # ht is the last hidden state of the sequences
        # ht = (1 x batch_size x hidden_dim)
        # ht[-1] = (batch_size x hidden_dim)
        output = self.dropout_layer(ht[-1])
        output = self.hidden2out(output)
        output = self.softmax(output)

        return output

In [142]:
        
class PaddedTensorDataset(Dataset):
    """Dataset wrapping data, target and length tensors plus the raw inputs.

    Each sample is retrieved by indexing every tensor (and ``raw_data``) with
    the same index along the first dimension.

    Arguments:
        data_tensor (Tensor): contains sample data.
        target_tensor (Tensor): contains sample targets (labels).
        length_tensor (Tensor): contains sample lengths.
        raw_data (Any): the data that was transformed into tensors, indexable
            like the tensors; useful for debugging.
    """

    def __init__(self, data_tensor, target_tensor, length_tensor, raw_data):
        # All three tensors must describe the same number of samples.
        sample_count = data_tensor.size(0)
        assert sample_count == target_tensor.size(0) == length_tensor.size(0)
        self.data_tensor, self.target_tensor = data_tensor, target_tensor
        self.length_tensor, self.raw_data = length_tensor, raw_data

    def __getitem__(self, index):
        sample = self.data_tensor[index]
        label = self.target_tensor[index]
        length = self.length_tensor[index]
        original = self.raw_data[index]
        return sample, label, length, original

    def __len__(self):
        # The number of samples is the size of the first data dimension.
        return self.data_tensor.size(0)

In [531]:
class DF_To_Tensors():
    def __init__(self):
        """Set up sampling/threshold settings, lookup data and the tag/token vocabularies.

        Reads 'datasets/lookups/country.csv' (relative to the working directory)
        at construction time, so instantiation fails if that file is missing.
        """
        # Sample-count setting read by get_arrayOfValues_df when it draws random
        # values from each column.
        self.number_of_random_samples=50
        # averaged_predictions reports the category 'None' when the best averaged
        # score of a column is below this limit.
        self.predictionLimit=-4.5
        # NOTE(review): not referenced by the code visible in this section (the
        # nested lookup helpers re-read the CSV themselves) - confirm it is still needed.
        self.country_lookup= pd.read_csv('datasets/lookups/country.csv')
        self.day_of_week = ['Monday', 'Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday','Sunday']
        # Full month names followed by abbreviations; 'May' appears twice because
        # its abbreviation equals its full name.
        self.month_of_year =['January', 'February', 'March','April', 'May', 'June','July', 'August', 'September', 'October', 'November', 'December', 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sept','Oct','Nov','Dec']
        # Tag name -> class id. Because this is a defaultdict(int), indexing with an
        # unknown tag returns 0 (the id of 'city') AND inserts that tag as a new key.
        # NOTE(review): ids presumably have to match the output layer of the saved
        # model - confirm before reordering.
        self.tag2id = defaultdict(int,
                        {'city': 0,
                         'first_name': 1,
                         'geo': 2,
                         'percent': 3,
                         'year': 4,
                         'ssn': 5,
                         'language_name': 6,
                         'country_name': 7,
                         'phone_number': 8,
                         'month': 9,
                         'zipcode': 10,
                         'iso8601': 11,
                         'paragraph': 12,
                         'pyfloat': 13,
                         'email': 14,
                         'prefix': 15,
                         'pystr': 16,
                         'isbn': 17,
                         'boolean': 18,
                         'country_code':19,
                         'country_GID':20,
                         'continent':21,
                         'date_%Y-%m-%d': 22,
                         'date_%Y_%m_%d': 23,
                         'date_%Y/%m/%d':24,
                         'date_%Y.%m.%d': 25,
                         'date_%m-%d-%Y': 26,
                         'date_%m-%d-%y': 27,
                         'date_%m_%d_%Y': 28,
                         'date_%m_%d_%y': 29,
                         'date_%m/%d/%Y': 30,
                         'date_%m/%d/%y': 31,
                         'date_%m.%d.%Y': 32,
                         'date_%m.%d.%y': 33,
                         'date_%d-%m-%Y': 34,
                         'date_%d-%m-%y': 35,
                         'date_%d_%m_%Y': 36 ,
                         'date_%d_%m_%y': 37,
                         'date_%d/%m/%Y': 38,
                         'date_%d/%m/%y': 39,
                         'date_%d.%m.%Y': 40,
                         'date_%d.%m.%y': 41, 
                         'date_%Y%m%d': 42,
                         'date_%Y%d': 43,
                         'date_%Y-%m':44,
                         'date_%Y/%m':45,
                         'date_%Y.%m': 46,
                         'day_of_month':47,
                         'day_of_week':48,
                         'date_long_dmdy':49,
                         'date_long_mdy': 50,
                         'date_long_dmdyt':51,
                         'date_long_mdyt_m':52,
                         'date_long_dmonthY': 53,
                         'date_long_dmonthy': 54,
                         'city_suffix':55,
                         'month_name':56

                         })
        # Number of tags at construction time (57 entries, ids 0..56).
        self.n_categories = len(self.tag2id)
        # Characters the tokenizer knows about. '*' is listed twice, which a set
        # collapses. NOTE(review): not referenced by the methods visible in this
        # section; it contains 'M', which token2id below does not.
        self.token_set={'a','b','c','d','e',
                        'f','g','h','i','j','k','l',
                        'm','n','o','p','q','r','s',
                        't','u','v','w','x','y','z',
                        'A','B','C','D','E','F','G',
                        'H','I','J','K','L','M','N',
                        'O','P','Q','R','S','T','U',
                        'V','W','X','Y','Z','1','2',
                        '3','4','5','6','7','8','9','0',
                        "'",',','.',';','*','!','@',
                        '#','$','%','^','&','(',')',
                        '_','=','-',':','+','/',"\\", '*'}
        # Character -> embedding index, used by vectorized_string.
        # NOTE(review): three irregularities, left untouched because the ids
        # presumably must match the vocabulary the saved model was trained with
        # (confirm against the training notebook before "fixing" any of them):
        #   * 'M' has no entry ('L' is 39, 'N' is 40), so vectorized_string encodes
        #     'M' with the 'UNK' id.
        #   * '!' and '@' share id 68; id 69 is never assigned.
        #   * '*' appears twice in the literal; the later value wins, so '*' maps
        #     to 84 and id 67 is never assigned.
        self.token2id = defaultdict(int,
            {'PAD': 0,
             'UNK': 1,
             'a':2,
             'b':3,
             'c': 4,
             'd': 5,
             'e': 6,
             'f': 7,
             'g':8,
             'h': 9,
             'i': 10,
             'j':11,
             'k':12,
             'l':13,
             'm':14,
             'n':15,
             'o':16,
             'p':17,
             'q':18,
             'r':19,
             's':20,
             't':21,
             'u':22,
             'v':23,
             'w':24,
             'x':25,
             'y':26,
             'z':27,
             'A':28,
             'B':29,
             'C':30,
             'D':31,
             'E':32,
             'F':33,
             'G':34,
             'H':35,
             'I':36,
             'J':37,
             'K':38,
             'L':39,
             'N':40,
             'O':41,
             'P':42,
             'Q':43,
             'R':44,
             'S':45,
             'T':46,
             'U':47,
             'V':48,
             'W':49,
             'X':50,
             'Y':51,
             'Z':52,
             '1':53,
             '2':54,
             '3':55,
             '4':56,
             '5':57,
             '6':58,
             '7':59,
             '8':60,
             '9':61,
             '0':62,
             "'":63,
             ',':64,
             '.':65,
             ';':66,
             '*':67,
             '!':68,
             '@':68,
             '#':70,
             '$':71,
             '%':72,
             '^':73,
             '&':74,
             '(':75,
             ')':76,
             '_':77,
             '=':78,
             '-':79,
             ':':80,
             '+':81,
             '/':82,
             '\\':83,
             '*': 84})
    
    def vectorized_string(self, string):
            return [self.token2id[token] if token in self.token2id else self.token2id['UNK'] for token in str(string)]
        
    def vectorized_array(self, array):
        vecorized_array=[]
        for stringValue in array:
            vecorized_array.append(self.vectorized_string(str(stringValue)))
        return vecorized_array
    
    def pad_sequences(self, vectorized_seqs, seq_lengths):
        """Right-pad token-id sequences with zeros into one LongTensor.

        Args:
            vectorized_seqs: list of token-id lists.
            seq_lengths: LongTensor with the length of each sequence.

        Returns:
            LongTensor of shape (len(vectorized_seqs), max(seq_lengths)); row i
            holds sequence i followed by zeros. An empty input yields a (0, 0)
            tensor instead of raising on the empty ``max()``.
        """
        n_seqs = len(vectorized_seqs)
        if n_seqs == 0:
            # max() of an empty tensor raises, so short-circuit the empty batch.
            return torch.zeros((0, 0), dtype=torch.long)

        max_len = int(seq_lengths.max())
        # create a zero matrix
        seq_tensor = torch.zeros((n_seqs, max_len), dtype=torch.long)

        # fill the index
        for idx, (seq, seqlen) in enumerate(zip(vectorized_seqs, seq_lengths)):
            seq_tensor[idx, :int(seqlen)] = torch.LongTensor(seq)
        return seq_tensor


    def create_dataset(self, data, batch_size=1):
        """Wrap raw values in a DataLoader of padded token-id tensors.

        Args:
            data: iterable of raw values; each is encoded character by character.
            batch_size: batch size handed to the DataLoader.

        Returns:
            DataLoader yielding ``(padded_ids, target, length, raw_value)`` batches.
            The target is the tag id found for the raw value in ``self.tag2id``,
            or 0 when the value is not a tag name.
        """
        # Materialise once so that a one-shot iterable is not exhausted by the
        # first pass and empty for the second.
        raw_data = list(data)

        vectorized_seqs = self.vectorized_array(raw_data)
        seq_lengths = torch.LongTensor([len(s) for s in vectorized_seqs])
        seq_tensor = self.pad_sequences(vectorized_seqs, seq_lengths)
        # .get() instead of [] : indexing the defaultdict would insert every raw
        # value as a new tag with id 0 and grow the tag table on each call.
        target_tensor = torch.LongTensor([self.tag2id.get(y, 0) for y in raw_data])

        return DataLoader(PaddedTensorDataset(seq_tensor, target_tensor, seq_lengths, raw_data), batch_size=batch_size)

    def sort_batch(self,batch, targets, lengths):
        """Reorder a batch by decreasing sequence length.

        Returns the reordered batch with its first two dimensions swapped, the
        targets in the same new order, and the sorted lengths.
        """
        sorted_lengths, order = lengths.sort(dim=0, descending=True)
        return batch[order].transpose(0, 1), targets[order], sorted_lengths


    def evaluate_test_set(self, model, test):
        y_pred = list()
        all_predictionsforValue=[]

        for batch, targets, lengths, raw_data in self.create_dataset(test, batch_size=1):
            batch, targets, lengths = self.sort_batch(batch, targets, lengths)
            pred = model(torch.autograd.Variable(batch), lengths.cpu().numpy())
            pred_idx = torch.max(pred, 1)[1]
            def get_key(val):
                for key, value in self.tag2id.items():
                     if val == value:
                            return {'top_pred':key, 'tensor':pred, 'pred_idx':pred_idx}
#                             all_predictionsforValue.append({'key':key, 'tensor':pred, 'pred_idx':pred_idx})

            all_predictionsforValue.append(get_key(pred_idx[0]))
        return all_predictionsforValue
        
    def read_in_csv(self,path):
        self.df = pd.read_csv(path)
        df = pd.read_csv(path)
        return df

    
    def get_arrayOfValues_df(self, df):
        column_value_object={}

        for column in df.columns:
            guesses=[]
            column_value_object[column]=[]
            for _ in range(1,self.number_of_random_samples):
                random_values = str(np.random.choice(df[column]))
                random_col = column
                column_value_object[column].append(random_values)

        return column_value_object
    def averaged_predictions(self, all_predictions):
        all_arrays=[]
        for pred in all_predictions:
                all_arrays.append(pred['tensor'].detach().numpy())
        
        out = np.mean(all_arrays, axis=0)
        maxValue = np.amax(out)
        def get_key(val):
  
                for key, value in self.tag2id.items():
                     if val == value:
                        return key
        
        topcat=get_key(np.argmax(out))

        return {
            "averaged_tensor":out,
            'averaged_top_category': {True: 'None', False: topcat}[maxValue < self.predictionLimit]
        }
    
    def predictions(self, model, path_to_csv):
        df = self.read_in_csv(path=path_to_csv)
        column_value_object = self.get_arrayOfValues_df(df)
        self.column_value_object = column_value_object
        predictions=[]
        for column in column_value_object:

            all_predictions = self.evaluate_test_set(model, column_value_object[column])
            avg_predictions = self.averaged_predictions(all_predictions)
            predictions.append({
                    'column': column,
                    'values': column_value_object[column],
                    'avg_predictions': avg_predictions,
                    'model_predictions': all_predictions
                   
                })
    
        self.predictions = predictions
        return self.predictions
    
    # match two words return true based on ratio
    def fuzzyMatch(self,word1, word2, ratio=95):
        """Return True when the two words are similar enough, ignoring case.

        Both inputs are converted to ``str`` first, so non-string values (for
        example NaN cells from a lookup CSV) no longer raise. The similarity is
        ``fuzz.ratio`` of the lower-cased words and must be strictly greater
        than ``ratio``. The result is always a bool (a non-match used to return
        ``None``).
        """
        score = fuzz.ratio(str(word1).lower(), str(word2).lower())
        return score > ratio
    
    # map function to model prediction category
    def assign_heuristic_function(self, predictions):
        

        def none_f(values):
            return {'Category': 'None'}
                
        
        
        def city_f(values):
            """Label the column 'City Name' when enough values fuzzy-match a known city.

            Each value is compared against the 'city' column of
            'datasets/lookups/city.csv'; a best match scoring above 85 counts as a
            hit. At least 30% hits yields 'City Name', otherwise 'Proper Noun'.
            """
            print('start city lookup')
            city_lookup = pd.read_csv('datasets/lookups/city.csv')
            # Plain list of strings: empty cells would otherwise reach the scorer as NaN floats.
            known_cities = city_lookup['city'].dropna().astype(str).tolist()

            n_matches = 0
            for city in values:
                # `process` is the name imported from fuzzywuzzy; the package name
                # itself is not bound, so `fuzzywuzzy.process` raised NameError.
                match = process.extractOne(str(city), known_cities, scorer=fuzz.token_sort_ratio)
                if match is not None and match[1] > 85:
                    n_matches += 1

            if n_matches >= (len(values) * .30):
                return {'Category':'City Name'}
            else:
                return {'Category':'Proper Noun'}
             
        def state_f(values):
            print('start state lookup')
            country_match_bool=[]
            c_lookup=pd.read_csv('datasets/lookups/NA_states_provinces.csv')
            c_lookup = np.asarray(c_lookup['state_name'])
            for state in values:
                 for c in c_lookup:
                    country_match_bool.append(self.fuzzyMatch(state,c, ratio=85))
            
            if np.count_nonzero(country_match_bool) >= (len(values) * .40):  
                return {'Category':'State Name'} 
            else: 
                print('Starting fuzzy match on cities...')
                return city_f(values)
        
        def country_f(values):
            print('start country lookup')
            country_match_bool=[]
            country_lookup=pd.read_csv('datasets/lookups/country.csv')

            c_lookup=np.asarray(country_lookup['country_name'])

            for country in values:
                for c in c_lookup:
                    country_match_bool.append(self.fuzzyMatch(country,c, ratio=85))
            
            if np.count_nonzero(country_match_bool) >= (len(values) * .40):                
                return {'Category':'Country Name'} 
            else: 
                return state_f(values)
       
        
        
        def country_iso3(values):
            print('start iso3 lookup')
            ISO_in_lookup=[]
            country_lookup=pd.read_csv('datasets/lookups/country.csv')
            c_lookup=np.asarray(country_lookup['Alpha-3_Code'])

            for iso in values:
                for cc in c_lookup:
                    ISO_in_lookup.append(self.fuzzyMatch(str(iso),str(cc), ratio=85))


            if np.count_nonzero(ISO_in_lookup) >= (len(values) * .65):
                return {'Category':'ISO3'} 
            else:
                return country_iso2(values)
                
                
        def country_iso2(values):
            print('start iso2 lookup')
            ISO2_in_lookup=[]
            country_lookup=pd.read_csv('datasets/lookups/country.csv')
            iso2_lookup=np.asarray(country_lookup['Alpha-2_Code'])
            for iso in values:
                for cc in iso2_lookup:
                    ISO2_in_lookup.append(self.fuzzyMatch(str(iso),str(cc), ratio=85))
                           

            if np.count_nonzero(ISO2_in_lookup) >= (len(values) * .65):

                return {'Category':'ISO2'} 
            else:
                return  {'Category':'Unknown code'}
                
                
        def continent_f(values):
            """Label the column 'Continent' when enough values fuzzy-match a continent name.

            Every value is compared with every entry of the 'continent_name' column
            of 'datasets/lookups/continent_code.csv'. When the number of matching
            pairs reaches 65% of the number of values the result is 'Continent',
            otherwise 'Proper Noun'.
            """
            print('start continent lookup')
            continent_lookup = pd.read_csv('datasets/lookups/continent_code.csv')
            known_continents = np.asarray(continent_lookup['continent_name'])

            # Compare the loop variables themselves; the previous code passed the
            # undefined names `iso` and `cc` and raised NameError.
            cont_in_lookup = [
                self.fuzzyMatch(str(cont), str(known), ratio=85)
                for cont in values
                for known in known_continents
            ]

            if np.count_nonzero(cont_in_lookup) >= (len(values) * .65):
                return {'Category':'Continent'}
            else:
                return {'Category':'Proper Noun'}
                
        def geo_f(values):
            """Decide whether numeric values look like latitudes, longitudes or plain numbers.

            Values that cannot be converted to float are reported and skipped (the
            misspelled exception name used to turn this into a NameError).

            Returns:
                {'Category': 'Number'} if any value lies outside [-180, 180];
                otherwise 'Number/Geo' if at least 95% of the values lie in [-1, 1];
                otherwise 'Geo' / longitude if any value lies outside [-90, 90];
                otherwise 'Geo' / latitude if any value was numeric;
                otherwise {'Category': 'Number'}.
            """
            print('start geo test')
            geo_valid=[]
            percent_array=[]
            for geo in values:
                try:
                    number = float(geo)
                except (TypeError, ValueError) as e:
                    print(e)
                    continue

                if -180 <= number <= 180:
                    if -90 <= number <= 90:
                        # Fits both a latitude and a longitude.
                        geo_valid.append('latlng')
                        if -1 <= number <= 1:
                            percent_array.append("true")
                    else:
                        print('lng', geo)
                        geo_valid.append('lng')
                else:
                    geo_valid.append('failed')

            if "failed" in geo_valid:
                return {'Category':'Number'}
            elif len(percent_array) >= len(values)*.95:
                return {'Category':'Number/Geo', 'type': 'Unknown-mostly between -1 and 1'}
            elif "lng" in geo_valid:
                return {'Category':'Geo', 'type': 'Longitude (number)'}
            elif 'latlng' in geo_valid:
                return {'Category':'Geo', 'type': 'Latitude (number)'}
            else:
                return {'Category':'Number'}
                
        
                
        def year_f(values):
            """Decide whether the values look like calendar years.

            A value counts as a valid year when it consists of digits and lies
            strictly between 1300 and 2500; other digit-only values are "strange",
            everything else is a failure.

            Returns:
                {'Category': 'Year'} when more than 75% of the values are valid and
                neither failures nor strange values exceed 15%; otherwise
                {'Category': 'None'} (the strange-value threshold used to be
                ``len(values)*15`` and the function could fall through returning None).
            """
            print('start year test')
            n_valid = 0
            n_failed = 0
            n_strange = 0
            for year in values:
                if str(year).isdigit():
                    if 1300 < int(year) < 2500:
                        n_valid += 1
                    else:
                        n_strange += 1
                else:
                    n_failed += 1

            total = len(values)
            if n_failed > total*.15:
                return {'Category':'None'}
            elif n_strange > total*.15:
                return {'Category':'None'}
            elif n_valid > total*.75:
                return {'Category':'Year'}
            # No rule fired: report the same explicit "no category" as the other branches.
            return {'Category':'None'}
                
        
        def bool_f(values):
            print('start boolian test')
            bool_arr=['true', 'false', 'T', 'F']
            bool_array=[]
            for bools in values:
                for b in bool_arr:
                    bool_array.append(self.fuzzyMatch(bools,b, ratio=85))
                
            if np.count_nonzero(bool_array) >= (len(values) * .85):
                
                return {'Category':'Boolian'} 
            else:
                return {'Category':'None'} 
        
        def dayFirstCheck(Values, seperator):
            """Return True when some date can only be read with the day before the month.

            Each date is split on ``seperator``. If the first field has four
            characters (taken to be a year) the second field is inspected,
            otherwise the first field is; a number above 12 in that position
            cannot be a month, so the dates are day-first. Dates that cannot be
            split or parsed are reported and skipped.
            """
            for date in Values:
                try:
                    fields = date.split(seperator)
                    candidate = fields[1] if len(fields[0]) == 4 else fields[0]
                    if int(candidate) > 12:
                        return True
                except (AttributeError, IndexError, TypeError, ValueError):
                    # Only malformed-input errors are swallowed; a bare except
                    # would also hide KeyboardInterrupt and programming errors.
                    print('error occured')

            return False
                
                    
        
        def date_arrow(values, seperator):
            utils_array=[]
            for date in values:
                try:
                    dateArrow = arrow.get(str(date), normalize_whitespace=True).datetime
                    print('dateArrow', dateArrow)

                    if isinstance(dateArrow, datetime.date):
                        utils_array.append('true')
                    else:
                        print('Not a valid date format')
                except Exception as e:
                    print(e, 'Error from Arrow: Date had an error')
            return utils_array
        


    
            
        def date_arrow_1(values ):
            array_valid=date_arrow(values, seperator='none')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'ydd', 'Parser': 'arrow'} 
            else:
                return {'Category':'Unknown Date'} 
        
        def date_arrow_2(values ):
            array_valid=date_arrow(values,seperator='none')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'y-MM', 'Parser': 'arrow'} 
            else:
                return {'Category':'Unknown Date'} 
            
        def date_arrow_3(values ):
            array_valid=date_arrow(values,seperator='none')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'y/MM', 'Parser': 'arrow'} 
            else:
                return {'Category':'Unknown Date'} 
            
        def date_arrow_4(values):
            array_valid=date_arrow(values, seperator='none')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'y.MM', 'Parser': 'arrow'} 
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util(values, seperator):
            util_dates=[]
            if seperator != "none":
                dayFirst=dayFirstCheck(values, seperator)
            else:
                dayFirst=False
            print('dayFirst', dayFirst)
            for date in values:
                try:
                    print('dayFirst', dayFirst, 'date', date)

                    dateUtil = dateutil.parser.parse(str(date), dayfirst=dayFirst)
                    print('dateUtil', dateUtil)
                    if isinstance(dateUtil, datetime.date):
                        util_dates.append({'value': date, 'standard':dateUtil})
                    else:
                        print('failed')
                except Exception as e:
                    print(e)
            
            return util_dates, dayFirst
            

        def iso_time(values):
            array_valid, dayFirst=date_util(values, seperator='none')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'iso8601', 'Parser': 'Util', 'DayFirst': dayFirst}  
            else:
                return {'Category':'Unknown Date'} 

        def date_util_1(values):
            array_valid, dayFirst=date_util(values, seperator='-')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'MM-dd-y', 'Parser': 'Util', 'DayFirst': dayFirst}   
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_2(values):
            """Label '-'-separated month-day dates with a two-digit year.

            Returns a 'Date' description when more than 85% of the values are
            parsed by ``date_util``, otherwise 'Unknown Date'.
            """
            array_valid, dayFirst=date_util(values, seperator='-')
            if len(array_valid) > len(values)*.85:
                # Two-digit-year variant of date_util_1, mirroring the y / yy pairs
                # of the following helpers; it used to repeat 'MM-dd-y'.
                return {'Category':'Date', 'Format':'MM-dd-yy', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'}
        
        def date_util_3(values):
            array_valid, dayFirst=date_util(values, seperator='_')
            if len(array_valid) > len(values)*.85:
                return  {'Category':'Date', 'Format':'MM_dd_y', 'Parser': 'Util', 'DayFirst': dayFirst} 
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_4(values):
            array_valid, dayFirst=date_util(values, seperator='_')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'MM_dd_yy', 'Parser': 'Util', 'DayFirst': dayFirst} 
            else:
                return {'Category':'Unknown Date'} 
         
        def date_util_5(values):
            array_valid, dayFirst=date_util(values, seperator='/')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'MM/dd/y', 'Parser': 'Util', 'DayFirst': dayFirst} 
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_6(values):
            array_valid, dayFirst=date_util(values, seperator='/')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'MM/dd/yy', 'Parser': 'Util', 'DayFirst': dayFirst} 
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_7(values):
            array_valid, dayFirst=date_util(values, seperator='.')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'MM.dd.y', 'Parser': 'Util', 'DayFirst': dayFirst} 
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_8(values):
            array_valid, dayFirst=date_util(values, seperator='.')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'MM.dd.yy', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_9(values):
            array_valid, dayFirst=date_util(values, seperator='-')
            if len(array_valid) > len(values)*.85:
                return  {'Category':'Date', 'Format':'d-MM-y', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_10(values):
            array_valid, dayFirst=date_util(values, seperator='-')
            if len(array_valid) > len(values)*.85:
                return  {'Category':'Date', 'Format':'d-MM-yy', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
                
        
        def date_util_11(values):
            array_valid, dayFirst=date_util(values, seperator='_')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format':'d_MM_y', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
            
        def date_util_12(values):
            array_valid, dayFirst=date_util(values, seperator='_')
            if len(array_valid) > len(values)*.85:
                return  {'Category':'Date', 'Format':'d_MM_yy', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 

        
        def date_util_13(values):
            array_valid, dayFirst=date_util(values, seperator='/')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'd/MM/y', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_14(values):

            array_valid, dayFirst=date_util(values, seperator='/')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'd/MM/yy', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_15(values ):
            array_valid, dayFirst=date_util(values,seperator='.')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'd.MM.y', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_16(values):
            array_valid, dayFirst=date_util(values,seperator='.')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'd.MM.yy', 'Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
        def date_util_17(values):
            array_valid, dayFirst=date_util(values, seperator='_')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'y_MM_dd','Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
                    
        def date_util_18(values):
            array_valid, dayFirst=date_util(values,  seperator='.')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'y.MM.dd','Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
            
            
        def date_util_19(values):
            array_valid, dayFirst=date_util(values, seperator='-')
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'y-MM-dd','Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
        
        def date_util_20(values):
            array_valid, dayFirst=date_util(values,seperator='/' )
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'y/MM/dd','Parser': 'Util', 'DayFirst': dayFirst}
            else:
                return {'Category':'Unknown Date'} 
        

            
            
        
        def date_long_format(values):
            for v in values:
                vs=v.split()
                print('vs', vs)
                
        def date_long_1(values):
#              #  01 April 2008 
            print('trying long1')
            array_valid, dayFirst=date_util(values, seperator='none')
            print('array_valid', array_valid)
            array_format_y = date_long_format(values)
            if len(array_valid) > len(values)*.85:
                return {'Category':'Date', 'Format': 'dd LLLL y', 'Parser': 'Util'}
            else:
                return {'Category':'Unknown Date'} 
            
        def date_long_2(values):
            """Check for long-form dates with a short year, e.g. '02 April 20'.

            Prints debug output, runs date_util (seperator='none') and
            date_long_format over `values`, and reports the 'dd LLLL yy'
            format when the collection returned by date_util is strictly
            larger than 85% of the number of input values.
            """
            print('tring long 2')
            parsed, _day_first = date_util(values, seperator='none')
            print('array_valid2', parsed)
            # The return value is not used; the call is kept for its printed output.
            date_long_format(values)
            parsed_count = len(parsed)
            print(parsed_count)
            if parsed_count > len(values)*.85:
                return {'Category':'Date', 'Format': 'dd LLLL yy', 'Parser': 'Util'}
            return {'Category':'Unknown Date'}
        

        
        def date_long_3(values):
            """Check for dates reported with the 'EEEE, LLLL dd,yy' format.

            Runs date_util (seperator='none') and date_long_format over
            `values`, prints how many entries date_util returned, and reports
            a date when that count is strictly larger than 85% of the number
            of input values.
            """
            parsed, _day_first = date_util(values, seperator='none')
            # The return value is not used; the call is kept for its printed output.
            date_long_format(values)
            parsed_count = len(parsed)
            print(parsed_count)
            if not parsed_count > len(values)*.85:
                return {'Category':'Unknown Date'}
            return {'Category':'Date', 'Format': 'EEEE, LLLL dd,yy', 'Parser': 'Util'}
            
        def date_long_4(values):
            """Check for month-first long dates such as 'April 10, 2008' ('LLLL dd, y').

            Runs date_util (seperator='none') and date_long_format over
            `values`, prints how many entries date_util returned, and reports
            a date when that count is strictly larger than 85% of the number
            of input values.
            """
            parsed, _day_first = date_util(values, seperator='none')
            # The return value is not used; the call is kept for its printed output.
            date_long_format(values)
            print(len(parsed))
            is_date = len(parsed) > len(values)*.85
            if is_date:
                return {'Category':'Date', 'Format': 'LLLL dd, y', 'Parser': 'Util'}
            return {'Category':'Unknown Date'}

            
            
        def date_long_5(values):
            """Check for long dates with a time, e.g. 'Thursday, April 10, 2008 6:30:00 AM'.

            Runs date_util (seperator='none') and date_long_format over
            `values`, prints how many entries date_util returned, and reports
            the 'EEEE, LLLL dd,yy HH:mm:ss' format when that count is strictly
            larger than 85% of the number of input values.
            """
            # NOTE(review): the example above has a four-digit year and an AM/PM
            # marker, while the reported pattern uses 'yy' and 'HH' with no
            # period field - confirm against whatever consumes 'Format'.
            parsed, _day_first = date_util(values, seperator='none')
            # The return value is not used; the call is kept for its printed output.
            date_long_format(values)
            parsed_count = len(parsed)
            print(parsed_count)
            if parsed_count > len(values)*.85:
                return {'Category':'Date', 'Format': 'EEEE, LLLL dd,yy HH:mm:ss', 'Parser': 'Util'}
            return {'Category':'Unknown Date'}
            
            
        def date_long_6(values):
            """Check for short dates with a time, e.g. '03/23/21 01:55 PM'.

            Runs date_util (seperator='none') and date_long_format over
            `values`, prints how many entries date_util returned, and reports
            the 'MM/dd/yy HH:mm' format when that count is strictly larger
            than 85% of the number of input values.

            Returns:
                dict: {'Category': 'Date', 'Format': ..., 'Parser': 'Util'} on
                success, otherwise {'Category': 'Unknown Date'}.
            """
            # NOTE(review): the example carries an AM/PM marker that the reported
            # pattern does not spell out - confirm against the consumer of 'Format'.
            array_valid, dayFirst=date_util(values, seperator='none')
            # The return value is not used; the call is kept for its printed output.
            date_long_format(values)
            print(len(array_valid))
            if len(array_valid) > len(values)*.85:
                # 'Parser' is included so this result has the same keys as the other
                # date_long_* results, which validate through the same date_util call.
                return {'Category':'Date', 'Format': 'MM/dd/yy HH:mm', 'Parser': 'Util'}
            else:
                return {'Category':'Unknown Date'}
            
            
        def month_day_f(values):
            """Decide whether `values` hold month numbers or day-of-month numbers.

            Every value is converted with str() and must consist of decimal
            digits to be considered; anything else prints 'not a valid digit'
            and is ignored.

            Returns:
                dict: {'Category': 'None'} if any number falls outside 1..31 or
                no value was numeric, {'Category': 'Day Number'} if at least one
                number is in 13..31, otherwise {'Category': 'Month Number'}
                (all numbers in 1..12).
            """
            saw_out_of_range = False
            saw_day_only = False
            saw_month_or_day = False
            for md in values:
                text = str(md)
                # isdecimal() rather than isdigit(): isdigit() also accepts
                # characters such as superscript digits that int() rejects.
                if not text.isdecimal():
                    print('not a valid digit')
                    continue
                number = int(text)
                if 1 <= number <= 12:
                    saw_month_or_day = True
                elif 12 < number <= 31:
                    saw_day_only = True
                else:
                    saw_out_of_range = True

            if saw_out_of_range:
                return {'Category':'None'}
            if saw_day_only:
                return {'Category':'Day Number'}
            if saw_month_or_day:
                return {'Category':'Month Number'}
            return {'Category':'None'}
        
        
        
        def month_name_f(values):
            """Decide whether `values` are month names, else fall back to day names.

            A value counts as a match when self.fuzzyMatch (ratio=85) is truthy
            for it against at least one entry of self.month_of_year. The column
            is a month-name column when at least 65% of the values match.

            Returns:
                dict: {'Category': 'Month Name'} on success, {'Category': 'None'}
                for an empty input, otherwise whatever day_name_f(values) returns.
            """
            print('start month lookup')
            # An empty column would otherwise satisfy the threshold (0 >= 0).
            if len(values) == 0:
                return {'Category':'None'}

            # Count each value at most once, even if it resembles several names.
            matched_values = sum(
                1 for month in values
                if any(self.fuzzyMatch(str(month), str(m), ratio=85)
                       for m in self.month_of_year)
            )

            if matched_values >= (len(values) * .65):
                return  {'Category': 'Month Name'}
            else:
                return day_name_f(values)
                
        def day_name_f(values):
            """Decide whether `values` are day-of-week names.

            A value counts as a match when self.fuzzyMatch (ratio=85) is truthy
            for it against at least one entry of self.day_of_week. The column is
            a day-name column when at least 65% of the values match.

            Returns:
                dict: {'Category': 'Day Name'} on success, otherwise
                {'Category': 'None'} (including for an empty input).
            """
            print('start day lookup')
            # An empty column would otherwise satisfy the threshold (0 >= 0).
            if len(values) == 0:
                return {'Category':'None'}

            # Count each value at most once, even if it resembles several names.
            matched_values = sum(
                1 for day in values
                if any(self.fuzzyMatch(str(day), str(d), ratio=85)
                       for d in self.day_of_week)
            )

            if matched_values >= (len(values) * .65):
                return {'Category': 'Day Name'}
            else:
                return {'Category':'None'} 
            
            

        functionlist = defaultdict(int,
            { 
                 'None':none_f,
                
                 'country_name': country_f,
                 'city': country_f,
                 'language_name': country_f,

                 'city_suffix':country_f,

                 

                 'country_GID':country_iso3,
                 'country_code':country_iso2,

                 'continent':continent_f,

                 'geo': geo_f,
                 'pyfloat': geo_f,
                 'percent': geo_f,


                 'first_name': none_f,
                

                 'ssn': none_f,
                 'phone_number': none_f,
                 'zipcode': none_f,
                 'paragraph': none_f,
                 'email': none_f,
                 'prefix': none_f,
                 'pystr': none_f,
                 'isbn': none_f,

                 'boolean': bool_f,

                 'iso8601': iso_time,


                 'year': year_f,

                 'day_of_month':month_day_f,
                 'month': month_day_f,

                 'month_name':month_name_f,
                 'day_of_week':month_name_f,


                 'date_%Y%d': date_arrow_1,
                 'date_%Y-%m': date_arrow_2,
                 'date_%Y/%m': date_arrow_3,
                 'date_%Y.%m': date_arrow_4,


                 'date_%m-%d-%Y': date_util_1,
                 'date_%m-%d-%y': date_util_2,
                 'date_%m_%d_%Y': date_util_3,
                 'date_%m_%d_%y': date_util_4,
                 'date_%m/%d/%Y': date_util_5,
                 'date_%m/%d/%y': date_util_6,
                 'date_%m.%d.%Y': date_util_7,
                 'date_%m.%d.%y': date_util_8,
                 'date_%d-%m-%Y': date_util_9,
                 'date_%d-%m-%y': date_util_10,
                 'date_%d_%m_%Y': date_util_11 ,
                 'date_%d_%m_%y': date_util_12,
                
                 'date_%d/%m/%Y': date_util_13,
                 'date_%d/%m/%y': date_util_14,
                 'date_%d.%m.%Y': date_util_15,
                 'date_%d.%m.%y': date_util_16,
                 'date_%Y_%m_%d': date_util_17,
                 'date_%Y.%m.%d': date_util_18,
                 'date_%Y-%m-%d': date_util_19,
                 'date_%Y/%m/%d': date_util_20,

                 'date_long_dmonthY': date_long_1,
                 'date_long_dmonthy': date_long_2,

                 'date_long_dmdy':date_long_3,
                 'date_long_mdy': date_long_4,
                 'date_long_dmdyt':date_long_5,
                 'date_long_mdyt_m':date_long_6

                 })
        # One result entry per prediction, in the same order as `predictions`.
        final_column_classification=[]

        for pred in predictions:

            # Stays empty when the heuristic below raises.
            fun = []
            try:
                # Look up the heuristic for the predicted category and run it on the
                # column's stored values. functionlist is a defaultdict(int), so an
                # unlisted category yields 0; calling it raises TypeError, which is
                # caught and printed below.
                fun.append(functionlist[pred['avg_predictions']['averaged_top_category']](self.column_value_object[pred['column']]))
            except Exception as e:
                # Best effort: report the error and keep going with an empty result.
                print(e)
            
            final_column_classification.append({'column': pred['column'], 'classification': fun})
#         print('f_response', fun)
        
        return final_column_classification

    
    def fuzzymatchColumns(self,classifications):
        """Tag entries whose column name resembles a known geo/time header.

        Each entry's 'column' value is compared (via self.fuzzyMatch with a
        threshold of 85) against every known header word. When a word matches,
        it is stored under 'fuzzyColumn'; if several words match, the one
        listed last wins. Entries are modified in place and the same list is
        returned.
        """
        known_headers = (
            'Date', 'Datetime', 'Timestamp', 'Epoch', 'Time', 'Year', 'Month',
            'Lat', 'Latitude', 'lng', 'Longitude', 'Geo', 'Coordinates',
            'Location', 'location',
            'West', 'South', 'East', 'North',
            'Country', 'CountryName', 'CC', 'CountryCode',
            'State', 'City', 'Town', 'Region', 'Province', 'Territory', 'Address',
            'ISO2', 'ISO3', 'ISO_code',
            'Results',
        )

        for position in range(len(classifications)):
            entry = classifications[position]
            for candidate in known_headers:
                matched = self.fuzzyMatch(str(entry['column']), str(candidate), 85)
                if matched:
                    classifications[position]['fuzzyColumn'] = candidate
        return classifications
    
    
#     Example entry:
#     {'column': 'Ydm',
#      'classification': [{'Category': 'Date',
#        'Format': 'y/MM/dd',
#        'Parser': 'Util',
#        'DayFirst': True}]}
    
    def add_standard_dateColumn(self, fuzzyOutput):
        """Print each column's first category and the names of date columns.

        Args:
            fuzzyOutput: iterable of dicts with a 'column' name and a
                'classification' list whose first item, when present, is a dict
                with a 'Category' key.

        Entries whose 'classification' list is empty (which happens when a
        heuristic raised for that column) or whose first item is not a dict
        with a 'Category' are skipped instead of raising.

        Raises:
            KeyError: if an entry has no 'classification' key at all.
        """
        for out in fuzzyOutput:
            results = out['classification']
            first = results[0] if results else None
            if not isinstance(first, dict) or 'Category' not in first:
                continue
            category = first['Category']
            print(category)
            if category=='Date':
                print(out['column'])
                
                           
            


In [590]:
# can be handled by arrow
# YYYY-MM-DD,
# YYYY-M-DD, 
# YYYY-M-D, 
# YYYY/MM/DD, 
# YYYY/M/DD, 
# YYYY/M/D, 
# YYYY.MM.DD,
# YYYY.M.DD, 
# YYYY.M.D, 
# YYYYMMDD,  date_%Y%m%d
# YYYY-DDDD, 
# YYYYDDDD, date_%Y%d
# YYYY-MM, date_%Y-%m
# YYYY/MM, date_%Y/%m
# YYYY.MM, date_%Y.%m
# YYYY,
# W


# util
#%d/%m/%y
#%m-%d-%Y'
#%m-%d-%y'
#%m_%d_%Y': failed
#%m_%d_%y': failed
#%m/%d/%Y'
#%m/%d/%y'
# %m.%d.%Y'
# %m.%d.%y'
# %d-%m-%Y'
# %d-%m-%y'
# %d_%m_%Y': failed
# %d_%m_%y': failed,
# '%d/%m/%Y'
# %d/%m/%y'
# %d.%m.%Y'
# %d.%m.%y'

In [521]:
# Build the helper object that turns CSV columns into model input.
dft_tensor=DF_To_Tensors()
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Put the module in evaluation mode (affects layers such as dropout) before predicting.
model2.eval()
# Run the model on the Africa test CSV; the result is passed to assign_heuristic_function in the next cell.
preds4=dft_tensor.predictions(model=model2, path_to_csv='datasets/data/africa_test.csv')


In [522]:
output=dft_tensor.assign_heuristic_function(preds4)

start country lookup
start geo test
start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
match ('Porto-Novo', 100)
match ('Lusaka', 100)
match ('Riyadh', 100)
match ('Ouagadougou', 100)
match ('Dar es Salaam', 100)
match ('Pretoria', 100)
match ('Tripoli', 100)
match ('Kunwi', 73)
match ('Aden', 100)
match ('Algiers', 100)
match ('Tripoli', 100)
match ('Khartoum', 100)
match ('Monrovia', 100)
match ('Cairo', 100)
match ('Antananarivo', 100)
match ('Malabo', 100)
match ('Abu Dhabi', 100)
match ('Lome', 100)
match ('Windhoek', 100)
match ('Cairo', 100)
match ('Nouakchott', 100)
match ('Aden', 100)
match ('Accra', 100)
match ('Dar es Salaam', 100)
match ('La Solana', 80)
match ('La Solana', 80)
match ('Aden', 100)
match ('Riyadh', 100)
match ("N'Djamena", 100)
match ('Nouakchott', 100)
match ('Banjul', 100)
match ('Moroni', 100)
match ('Moroni', 100)
match ('Accra', 100)
match ('Nouakchott', 100)
match ('Addis Ababa', 100)
match ('Abuja', 100)
match 

In [398]:
dft_tensor.fuzzymatchColumns(output)

[{'column': 'fid', 'classifcation': [{'Category': 'None'}]},
 {'column': 'CNTRY_NAME',
  'classifcation': [{'Category': 'Country Name'}],
  'fuzzyColumn': 'CountryName'},
 {'column': 'AREA', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'CAPNAME', 'classifcation': [{'Category': 'City Name'}]},
 {'column': 'CAPLONG',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}]},
 {'column': 'CAPLAT',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}]},
 {'column': 'FEATUREID', 'classifcation': [{'Category': 'None'}]},
 {'column': 'COWCODE', 'classifcation': [{'Category': 'None'}]},
 {'column': 'COWSYEAR', 'classifcation': [{'Category': 'Year'}]},
 {'column': 'COWSMONTH', 'classifcation': [{'Category': 'Month Number'}]},
 {'column': 'COWSDAY', 'classifcation': [{'Category': 'Day Number'}]},
 {'column': 'COWEYEAR', 'classifcation': [{'Category': 'Year'}]},
 {'column': 'COWEMONTH', 'classifcation': [{'Category': 'Month Number'}]},
 {'column': 'COWEDAY'

In [532]:
# Build the helper object that turns CSV columns into model input.
dft_tensor=DF_To_Tensors()
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Put the module in evaluation mode (affects layers such as dropout) before predicting.
model2.eval()
# Run the model on the date-format test CSV.
preds3=dft_tensor.predictions(model=model2, path_to_csv='datasets/datesToBeDeleted.csv')
# Show the predictions as the cell output.
preds3


[{'column': 'Ydm',
  'values': ['2020/13/2',
   '2020/15/02',
   '2020/15/11',
   '2020/15/10',
   '2020/13/2',
   '2020/15/05',
   '2020/01/12',
   '2020/12/7',
   '2020/15/04',
   '2020/15/03',
   '2020/12/8',
   '2020/12/9',
   '2020/13/4',
   '2020/13/3',
   '2020/15/09',
   '2020/13/2',
   '2020/12/10',
   '2020/15/06',
   '2020/12/2',
   '2020/13/3',
   '2020/01/11',
   '2020/15/03',
   '2020/15/02',
   '2020/01/09',
   '2020/13/12',
   '2020/12/11',
   '2020/12/3',
   '2020/12/11',
   '2020/12/9',
   '2020/13/5',
   '2020/12/10',
   '2020/13/5',
   '2020/15/02',
   '2020/12/8',
   '2020/13/6',
   '2020/13/12',
   '2020/13/11',
   '2020/12/3',
   '2020/12/9',
   '2020/13/3',
   '2020/13/12',
   '2020/15/03',
   '2020/15/03',
   '2020/01/09',
   '2020/13/12',
   '2020/15/05',
   '2020/13/2',
   '2020/15/01',
   '2020/12/9'],
  'avg_predictions': {'averaged_tensor': array([[-1.71238251e+01, -1.87360096e+01, -1.77494602e+01,
           -1.43447104e+01, -1.65098095e+01, -1.54972134e+

In [533]:
output=dft_tensor.assign_heuristic_function(preds3)

dayFirst True
dayFirst True date 2020/13/2
dateUtil 2020-02-13 00:00:00
dayFirst True date 2020/15/02
dateUtil 2020-02-15 00:00:00
dayFirst True date 2020/15/11
dateUtil 2020-11-15 00:00:00
dayFirst True date 2020/15/10
dateUtil 2020-10-15 00:00:00
dayFirst True date 2020/13/2
dateUtil 2020-02-13 00:00:00
dayFirst True date 2020/15/05
dateUtil 2020-05-15 00:00:00
dayFirst True date 2020/01/12
dateUtil 2020-12-01 00:00:00
dayFirst True date 2020/12/7
dateUtil 2020-07-12 00:00:00
dayFirst True date 2020/15/04
dateUtil 2020-04-15 00:00:00
dayFirst True date 2020/15/03
dateUtil 2020-03-15 00:00:00
dayFirst True date 2020/12/8
dateUtil 2020-08-12 00:00:00
dayFirst True date 2020/12/9
dateUtil 2020-09-12 00:00:00
dayFirst True date 2020/13/4
dateUtil 2020-04-13 00:00:00
dayFirst True date 2020/13/3
dateUtil 2020-03-13 00:00:00
dayFirst True date 2020/15/09
dateUtil 2020-09-15 00:00:00
dayFirst True date 2020/13/2
dateUtil 2020-02-13 00:00:00
dayFirst True date 2020/12/10
dateUtil 2020-10-12 

In [534]:
out2=dft_tensor.fuzzymatchColumns(output)

In [536]:
out2

[{'column': 'Ydm',
  'classifcation': [{'Category': 'Date',
    'Format': 'y/MM/dd',
    'Parser': 'Util',
    'DayFirst': True}]},
 {'column': 'Ydm.1',
  'classifcation': [{'Category': 'Date',
    'Format': 'y-MM-dd',
    'Parser': 'Util',
    'DayFirst': True}]},
 {'column': 'dmy',
  'classifcation': [{'Category': 'Date',
    'Format': 'd/MM/y',
    'Parser': 'Util',
    'DayFirst': True}]},
 {'column': 'dmY',
  'classifcation': [{'Category': 'Date',
    'Format': 'd/MM/y',
    'Parser': 'Util',
    'DayFirst': True}]},
 {'column': 'normal',
  'classifcation': [{'Category': 'Date',
    'Format': 'MM/dd/yy',
    'Parser': 'Util',
    'DayFirst': False}]},
 {'column': 'normal-',
  'classifcation': [{'Category': 'Date',
    'Format': 'MM.dd.y',
    'Parser': 'Util',
    'DayFirst': False}]},
 {'column': 'normallast',
  'classifcation': [{'Category': 'Date',
    'Format': 'MM-dd-y',
    'Parser': 'Util',
    'DayFirst': False}]},
 {'column': 'Unnamed: 7',
  'classifcation': [{'Category':

In [535]:
dft_tensor.add_standard_dateColumn(out2)

Date


KeyError: 'classification'

In [383]:
# Build the helper object that turns CSV columns into model input.
dft_tensor=DF_To_Tensors()
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Put the module in evaluation mode (affects layers such as dropout) before predicting.
model2.eval()
# Run the model on the cshape Africa CSV.
preds3=dft_tensor.predictions(model=model2, path_to_csv='datasets/data/cshape_africa.csv')
# Show the predictions as the cell output.
preds3

[{'column': 'country',
  'values': ['Libya',
   'Gabon',
   'Botswana',
   'Saudi Arabia',
   'Sao Tome and Principe',
   'Burkina Faso',
   'Tanzania',
   'Libya',
   'South Sudan',
   'Libya',
   'Rwanda',
   'Libya',
   'Ethiopia',
   'Guinea-Bissau',
   'Angola',
   'Morocco',
   'South Sudan',
   'Equatorial Guinea',
   'Mauritania',
   'United Arab Emirates',
   'Burundi',
   'Yemen Arab Republic',
   'Zanzibar',
   'Tanzania',
   'Namibia',
   'Somalia',
   'Sao Tome and Principe',
   'Somalia',
   'Nigeria',
   'Sudan',
   'Algeria',
   'Bahrain',
   'Yemen',
   'Libya',
   'Tanzania',
   'Togo',
   'Jordan',
   "Cote d'Ivoire",
   'Egypt',
   'Burkina Faso',
   'Ethiopia',
   'Israel',
   'South Sudan',
   'Tanzania',
   'Lesotho',
   'Central African Republic',
   'Israel',
   'Malawi',
   'Libya'],
  'avg_predictions': {'averaged_tensor': array([[ -6.6600237,  -3.2380085, -18.32253  , -16.359993 , -21.709024 ,
           -11.254369 ,  -2.7101576,  -0.6610362, -15.137761 , -1

In [384]:
output=dft_tensor.assign_heuristic_function(preds3)

start country lookup
dayFirst False
dayFirst False date 02/06/21
dateUtil 2021-02-06 00:00:00
dayFirst False date 04/28/21
dateUtil 2021-04-28 00:00:00
dayFirst False date 04/29/21
dateUtil 2021-04-29 00:00:00
dayFirst False date 03/19/21
dateUtil 2021-03-19 00:00:00
dayFirst False date 02/23/21
dateUtil 2021-02-23 00:00:00
dayFirst False date 04/02/21
dateUtil 2021-04-02 00:00:00
dayFirst False date 04/12/21
dateUtil 2021-04-12 00:00:00
dayFirst False date 04/09/21
dateUtil 2021-04-09 00:00:00
dayFirst False date 03/06/21
dateUtil 2021-03-06 00:00:00
dayFirst False date 04/21/21
dateUtil 2021-04-21 00:00:00
dayFirst False date 02/12/21
dateUtil 2021-02-12 00:00:00
dayFirst False date 03/17/21
dateUtil 2021-03-17 00:00:00
dayFirst False date 03/23/21
dateUtil 2021-03-23 00:00:00
dayFirst False date 03/31/21
dateUtil 2021-03-31 00:00:00
dayFirst False date 04/09/21
dateUtil 2021-04-09 00:00:00
dayFirst False date 02/15/21
dateUtil 2021-02-15 00:00:00
dayFirst False date 03/09/21
dateUti

dayFirst False date 2019/02/25
dateUtil 2019-02-25 00:00:00
dayFirst True
dayFirst True date 2017/22/2
dateUtil 2017-02-22 00:00:00
dayFirst True date 2017/21/1
dateUtil 2017-01-21 00:00:00
dayFirst True date 2017/22/8
dateUtil 2017-08-22 00:00:00
dayFirst True date 2017/20/5
dateUtil 2017-05-20 00:00:00
dayFirst True date 2017/18/5
dateUtil 2017-05-18 00:00:00
dayFirst True date 2018/15/1
dateUtil 2018-01-15 00:00:00
dayFirst True date 2017/20/12
dateUtil 2017-12-20 00:00:00
dayFirst True date 2018/15/3
dateUtil 2018-03-15 00:00:00
dayFirst True date 2018/15/12
dateUtil 2018-12-15 00:00:00
dayFirst True date 2017/19/5
dateUtil 2017-05-19 00:00:00
dayFirst True date 2018/15/3
dateUtil 2018-03-15 00:00:00
dayFirst True date 2018/15/1
dateUtil 2018-01-15 00:00:00
dayFirst True date 2017/22/7
dateUtil 2017-07-22 00:00:00
dayFirst True date 2017/19/11
dateUtil 2017-11-19 00:00:00
dayFirst True date 2018/15/1
dateUtil 2018-01-15 00:00:00
dayFirst True date 2018/15/7
dateUtil 2018-07-15 00:0

In [385]:
dft_tensor.fuzzymatchColumns(output)

[{'column': 'country',
  'classifcation': [{'Category': 'Country Name'}],
  'fuzzyColumn': 'Country'},
 {'column': 'value', 'classifcation': [{'Category': 'None'}]},
 {'column': 'date-mdy',
  'classifcation': [{'Category': 'Date',
    'Format': 'MM/dd/yy',
    'Parser': 'Util',
    'DayFirst': False}]},
 {'column': 'date-m-d-Y',
  'classifcation': [{'Category': 'Date',
    'Format': 'd-MM-y',
    'Parser': 'Util',
    'DayFirst': False}]},
 {'column': 'dmY',
  'classifcation': [{'Category': 'Date',
    'Format': 'd/MM/yy',
    'Parser': 'Util',
    'DayFirst': True}]},
 {'column': 'Long-date-Y',
  'classifcation': [{'Category': 'Date',
    'Format': 'dd LLLL y',
    'Parser': 'Util'}]},
 {'column': 'long-date-y',
  'classifcation': [{'Category': 'Date',
    'Format': 'dd LLLL yy',
    'Parser': 'Util'}]},
 {'column': 'Long-date-day',
  'classifcation': [{'Category': 'Date',
    'Format': 'dd LLLL y',
    'Parser': 'Util'}]},
 {'column': 'Iso-date-time',
  'classifcation': [{'Category':

In [510]:
# create our class and load the saved model
dft_tensor=DF_To_Tensors()
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Put the module in evaluation mode (affects layers such as dropout) before predicting.
model2.eval()
# Run the model on Test_1.csv.
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_1.csv')
# Show the predictions as the cell output.
test1

[{'column': 'COUNTRY',
  'values': ['Palau',
   'New Zealand',
   'Macedonia',
   'Yemen',
   'Colombia',
   'Grenada',
   'Pakistan',
   'Mongolia',
   'Gambia',
   'Reunion',
   'Armenia',
   'Mauritania',
   'Cyprus',
   'Mexico',
   'Mauritania',
   'Liechtenstein',
   'Marshall Islands',
   'France',
   'Luxembourg',
   'New Caledonia',
   'Eritrea',
   'Tunisia',
   'Romania',
   'Netherlands Antilles',
   'Ethiopia',
   'Gabon',
   'Vietnam',
   'Bermuda',
   'Saint Lucia',
   'Niue',
   'Anguilla',
   'United Arab Emirates',
   'Jamaica',
   'Equatorial Guinea',
   'Dominica',
   'Taiwan',
   'Saint Barthelemy',
   'Greenland',
   'Chad',
   'Vatican',
   'Pakistan',
   'Kyrgyzstan',
   'Macau',
   'Montenegro',
   'Samoa',
   'Saint Lucia',
   'Puerto Rico',
   'Saudi Arabia',
   'Austria'],
  'avg_predictions': {'averaged_tensor': array([[ -6.7405195,  -3.1593876, -18.588604 , -16.60516  , -21.74318  ,
           -11.40748  ,  -3.0858593,  -0.5027756, -15.399698 , -15.878954 

In [511]:
output=dft_tensor.assign_heuristic_function(test1)

start country lookup
not a valid digit
not a valid digit
not a valid digit
not a valid digit
start iso3 lookup
start iso2 lookup
start iso2 lookup
start iso3 lookup


In [512]:
out=dft_tensor.fuzzymatchColumns(output)

COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
cc Country
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
cc CountryCode
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
COUNTRY CODE
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES
ISO CODES


In [528]:
out


[{'column': 'COUNTRY',
  'classifcation': [{'Category': 'Country Name'}],
  'fuzzyColumn': 'Country'},
 {'column': 'COUNTRY CODE',
  'classifcation': [{'Category': 'None'}],
  'fuzzyColumn': 'CountryCode'},
 {'column': 'ISO CODES', 'classifcation': [{'Category': 'Unknown code'}]},
 {'column': 'iso2',
  'classifcation': [{'Category': 'ISO2'}],
  'fuzzyColumn': 'ISO2'},
 {'column': 'iso3',
  'classifcation': [{'Category': 'ISO3'}],
  'fuzzyColumn': 'ISO3'},
 {'column': 'iso2_lower', 'classifcation': [{'Category': 'None'}]},
 {'column': 'iso3_lower', 'classifcation': [{'Category': 'None'}]},
 {'column': 'POPULATION ', 'classifcation': [{'Category': 'None'}]},
 {'column': 'AREA KM2', 'classifcation': [{'Category': 'None'}]},
 {'column': 'GDP $USD', 'classifcation': [{'Category': 'None'}]}]

In [529]:
dft_tensor.add_standard_dateColumn(out)

Country Name


KeyError: 'classification'

In [732]:
# Just test a random array by itself.


In [515]:
# create our class and load the saved model
dft_tensor=DF_To_Tensors()
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Put the module in evaluation mode (affects layers such as dropout) before predicting.
model2.eval()
# Run the model on Test_2_kaggle.csv.
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_2_kaggle.csv')
# Show the predictions as the cell output.
test1

[{'column': 'country',
  'values': ['Luxembourg',
   'Oman',
   'England',
   'United Arab Emirates',
   'Oman',
   'China',
   'Slovakia',
   'Canada',
   'Croatia',
   'Saint Helena',
   'Chile',
   'Nepal',
   'Gibraltar',
   'Oman',
   'Portugal',
   'Wales',
   'Netherlands',
   'Cayman Islands',
   'Turkey',
   'Slovakia',
   'Argentina',
   'Bolivia',
   'Ireland',
   'Isle of Man',
   'Spain',
   'Norway',
   'Belgium',
   'Norway',
   'Norway',
   'Greece',
   'England',
   'India',
   'Hungary',
   'Italy',
   'Russia',
   'Oman',
   'United Kingdom',
   'Wales',
   'Iceland',
   'India',
   'Israel',
   'Germany',
   'Estonia',
   'Slovakia',
   'Sweden',
   'Costa Rica',
   'Chile',
   'Lithuania',
   'Dominican Republic'],
  'avg_predictions': {'averaged_tensor': array([[ -6.387923 ,  -2.4504757, -17.895676 , -16.164276 , -21.163279 ,
           -11.910635 ,  -3.307516 ,  -0.6687734, -15.109624 , -15.451242 ,
           -18.018345 , -11.937752 , -11.4250555, -16.152845 , -

In [516]:
output=dft_tensor.assign_heuristic_function(test1)

start country lookup
start iso3 lookup
dayFirst False
dayFirst False date 2021-01-17
dateUtil 2021-01-17 00:00:00
dayFirst False date 2021-02-19
dateUtil 2021-02-19 00:00:00
dayFirst False date 2021-02-16
dateUtil 2021-02-16 00:00:00
dayFirst False date 2021-02-07
dateUtil 2021-02-07 00:00:00
dayFirst False date 2020-12-30
dateUtil 2020-12-30 00:00:00
dayFirst False date 2021-02-14
dateUtil 2021-02-14 00:00:00
dayFirst False date 2021-01-21
dateUtil 2021-01-21 00:00:00
dayFirst False date 2021-01-12
dateUtil 2021-01-12 00:00:00
dayFirst False date 2021-01-02
dateUtil 2021-01-02 00:00:00
dayFirst False date 2021-01-16
dateUtil 2021-01-16 00:00:00
dayFirst False date 2021-02-20
dateUtil 2021-02-20 00:00:00
dayFirst False date 2021-01-31
dateUtil 2021-01-31 00:00:00
dayFirst False date 2021-02-01
dateUtil 2021-02-01 00:00:00
dayFirst False date 2021-01-20
dateUtil 2021-01-20 00:00:00
dayFirst False date 2021-01-26
dateUtil 2021-01-26 00:00:00
dayFirst False date 2021-02-12
dateUtil 2021-0

In [517]:
out2=dft_tensor.fuzzymatchColumns(output)

country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
cc Country
country
country
country
country
country
country
country
country
country
country
country
country
country
country
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
iso_code
cc ISO_code
iso_code
date
cc Date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
date
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccinations
total_vaccin

In [518]:
dft_tensor.add_standard_dateColumn(out2)

[{'Category': 'Country Name'}]
[{'Category': 'ISO3'}]
[{'Category': 'Date', 'Format': 'y-MM-dd', 'Parser': 'Util', 'DayFirst': False}]
[{'Category': 'Number'}]
[{'Category': 'None'}]
[{'Category': 'None'}]
[{'Category': 'None'}]
[{'Category': 'Number'}]
[{'Category': 'Number'}]
[{'Category': 'None'}]
[{'Category': 'None'}]
[{'Category': 'Number'}]
[{'Category': 'Proper Noun'}]
[{'Category': 'Proper Noun'}]
[{'Category': 'None'}]


In [493]:
# Build the helper object that turns CSV columns into model input.
dft_tensor=DF_To_Tensors()
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Put the module in evaluation mode (affects layers such as dropout) before predicting.
model2.eval()
# Run the model on Test_4_kaggle.csv.
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_4_kaggle.csv')
# Show the predictions as the cell output.
test1

[{'column': 'show_id',
  'values': ['s4843',
   's6645',
   's7480',
   's2738',
   's540',
   's7552',
   's512',
   's812',
   's976',
   's3641',
   's3548',
   's3653',
   's5458',
   's896',
   's3616',
   's2697',
   's358',
   's3742',
   's5407',
   's1902',
   's5359',
   's2631',
   's1573',
   's4280',
   's3153',
   's4308',
   's1391',
   's4530',
   's4181',
   's3613',
   's1017',
   's6087',
   's5034',
   's2155',
   's6284',
   's3607',
   's4797',
   's3543',
   's2186',
   's4972',
   's6155',
   's3750',
   's5720',
   's189',
   's3707',
   's7313',
   's1914',
   's1548',
   's6617'],
  'avg_predictions': {'averaged_tensor': array([[-21.262375 , -26.359993 ,  -8.340513 , -12.957201 ,  -5.1821313,
           -11.402237 , -22.55517  , -24.163303 ,  -9.215255 ,  -9.235647 ,
            -0.7063665, -14.646448 , -22.325527 ,  -9.18665  , -14.952071 ,
           -18.623032 , -18.262072 , -18.934841 , -18.589119 ,  -9.954639 ,
           -19.547401 , -18.047634 , -15.46

In [494]:
output=dft_tensor.assign_heuristic_function(test1)

start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
match ('Mons-en-Baroeul', 59)
match ('Steeles', 67)
match ('Saint-Pol-sur-Mer', 53)
match ('Gold Coast', 64)
match ("Ra's Bayrut", 62)
match ('The Bronx', 64)
match ('North Little Rock', 69)
match ('Thousand Oaks', 69)
match ('Alegrete', 64)
match ('Ives Estates', 56)
match ('Juan Jose Rios', 55)
match ('Cholet', 67)
match ('Cisterna di Latina', 57)
match ('Lancut', 71)
match ('Nguigmi', 73)
match ('Santa Teresa del Tuy', 60)
match ('Barra do Corda', 50)
match ('Espirito Santo do Pinhal', 52)
match ('Sainthia', 74)
match ('The Hammocks', 59)
match ('Lake in the Hills', 52)
match ('Aix-les-Bains', 56)
match ('Romano Banco', 64)
match ('Dayton', 77)
match ('Zuerich (Kreis 9) / Albisrieden', 51)
match ('The Beaches', 59)
match ('Asahikawa', 76)
match ('Boyle Heights', 55)
match ('Bountiful', 74)
match ('Vitry-sur-Seine', 65)
match ('Facatativa', 70)
match ('Balzar', 77)
match ('Los Angeles', 67)
ma

match ('Islington-City Centre West', 53)
match ('Santa Teresa del Tuy', 47)
match ('San Vicente', 59)
match ('San Vicente', 59)
match ('San Juan de los Lagos', 44)
match ('San Antonio de los Banos', 44)
match ('Carrieres-sous-Poissy', 52)
match ('Dumfries', 67)
match ('Olesa de Montserrat', 51)
match ('East Independence', 67)
match ('Cormeilles-en-Parisis', 56)
match ('Islington-City Centre West', 53)
match ('Les Clayes-sous-Bois', 46)
match ('Cormeilles-en-Parisis', 56)
match ('Montigny-les-Cormeilles', 59)
match ('Olesa de Montserrat', 51)
match ('East Independence', 59)
match ('Delegacion Cuajimalpa de Morelos', 49)
match ('Santo Antonio dos Olivais', 53)
match ('Hot Springs National Park', 49)
match ('Dumfries', 67)
match ('Santo Antonio do Amparo', 46)
match ('Coney Island', 67)
match ('Mariners Harbor', 64)
match ('East Independence', 67)
match ('Sao Mateus do Maranhao', 45)
match ('Montigny-les-Cormeilles', 64)
match ('Delegacion Cuajimalpa de Morelos', 49)
match ('East Independ

In [495]:
out2=dft_tensor.fuzzymatchColumns(output)

show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
show_id
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
type
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
title
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
director
cast
cast
cast
cast
cast
cast
cast
cast
cast
cas

In [496]:
dft_tensor.add_standard_dateColumn(out2)

{'column': 'show_id', 'classifcation': ['None']}
{'column': 'type', 'classifcation': ['None']}
{'column': 'title', 'classifcation': ['Proper Noun']}
{'column': 'director', 'classifcation': [{'Category': 'City Name'}]}
{'column': 'cast', 'classifcation': ['Proper Noun']}
{'column': 'country', 'classifcation': [{'Category': 'Country Name'}], 'fuzzyColumn': 'Country'}
{'column': 'date_added', 'classifcation': [{'Category': 'Date', 'Format': 'LLLL dd, y', 'Parser': 'Util'}]}
{'column': 'release_year', 'classifcation': [{'Category': 'Year'}]}
{'column': 'rating', 'classifcation': ['None']}
{'column': 'duration', 'classifcation': ['None']}
{'column': 'listed_in', 'classifcation': ['Proper Noun']}
{'column': 'description', 'classifcation': ['None']}


In [498]:
# Build the helper object that turns CSV columns into model input.
dft_tensor=DF_To_Tensors()
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a trusted source.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Put the module in evaluation mode (affects layers such as dropout) before predicting.
model2.eval()
# Run the model on Test_6_kaggle.csv.
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_6_kaggle.csv')
# Show the predictions as the cell output.
test1

[{'column': 'state',
  'values': ['Ohio',
   'Massachusetts',
   'Iowa',
   'Massachusetts',
   'Texas',
   'Massachusetts',
   'South Carolina',
   'California',
   'New York',
   'Minnesota',
   'California',
   'Texas',
   'Illinois',
   'Alabama',
   'Utah',
   'Indiana',
   'Texas',
   'New Mexico',
   'California',
   'Michigan',
   'Texas',
   'West Virginia',
   'Texas',
   'Virginia',
   'California',
   'California',
   'Georgia',
   'Colorado',
   'California',
   'California',
   'Michigan',
   'Missouri',
   'Colorado',
   'Arkansas',
   'Maryland',
   'Indiana',
   'California',
   'New Jersey',
   'Illinois',
   'California',
   'Texas',
   'California',
   'Arizona',
   'Texas',
   'Florida',
   'Georgia',
   'Texas',
   'Colorado',
   'Illinois'],
  'avg_predictions': {'averaged_tensor': array([[ -6.56467  ,  -2.4235218, -16.931906 , -14.734173 , -20.693874 ,
           -11.95195  ,  -3.2233186,  -1.3043985, -14.689971 , -15.327658 ,
           -17.996292 , -11.800094 

In [499]:
# Run the heuristic step over the model predictions; it prints lookup/fuzzy-match
# progress lines while it works (see output below). Result is kept for the next cell.
output=dft_tensor.assign_heuristic_function(test1)

start country lookup
start state lookup
start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
match ('Collinsville', 82)
match ('Columbia', 100)
match ('Oxon Hill', 100)
match ('Westminster', 100)
match ('Kearny', 100)
match ('Monterey Park', 100)
match ('Chesterfield', 100)
match ('Florence', 100)
match ('Folsom', 100)
match ('Austin', 100)
match ('Chicago', 100)
match ('Framingham', 74)
match ('Overland Park', 100)
match ('Hillsboro', 100)
match ('Brooklyn', 100)
match ('Akron', 100)
match ('Mobile', 100)
match ('Westmont', 100)
match ('Fort Lauderdale', 100)
match ('Portage', 100)
match ('Dallas', 100)
match ('Spring', 100)
match ('Lehi', 100)
match ('Vina del Mar', 77)
match ('York', 100)
match ('Plantation', 100)
match ('Salinas', 100)
match ('East Meadow', 100)
match ('Henderson', 100)
match ('Berkeley', 100)
match ('Sterling', 100)
match ('Dallas', 100)
match ('Tucson', 100)
match ('Lakeland', 100)
match ('West New York', 76)
match ('Inglewo

In [500]:
# Fuzzy-match the column headers of the heuristic output; the call prints each column
# name repeatedly (plus "cc ..." lines) as it compares. Result is stored in `out2`.
# NOTE(review): the "cc" lines presumably mark a header match — confirm in fuzzymatchColumns.
out2=dft_tensor.fuzzymatchColumns(output)

state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
state
cc State
state
state
state
state
state
state
state
state
state
state
location
location
location
location
location
location
location
location
location
location
location
location
location
location
cc Location
location
cc location
location
location
location
location
location
location
location
location
location
location
location
location
location
location
location
location
location
location
location
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
address
cc Address
address
address
address
address
latitude
latitude
latitude
latitude
latitude
latitude
latitude
latitude
latitude
cc Latitude
latitude
latitude
latitude
latitude
latitude
latitude
latitude
latitude
latitude


In [501]:
# Feed the fuzzy-matched result `out2` to add_standard_dateColumn; in this run it prints
# the classification list of each column (see output below).
dft_tensor.add_standard_dateColumn(out2)

[{'Category': 'State Name'}]
[{'Category': 'City Name'}]
['Unknown code']
[{'Category': 'Geo', 'type': 'Latitude (number)'}]
[{'Category': 'Geo', 'type': 'Longitude (number)'}]


In [423]:
# Test 7: build a fresh helper, load the saved classifier and predict on Test_7_kaggle.csv.
dft_tensor=DF_To_Tensors()
# torch.load unpickles the whole model object, so LSTMClassifier must already be defined
# in the kernel. NOTE(review): unpickling executes code — only load trusted .pth files.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Evaluation mode: turns off the dropout layer for inference.
model2.eval()
# Result is a list with one dict per CSV column ('column', 'values', 'avg_predictions', ...).
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_7_kaggle.csv')
# Bare last expression so the notebook displays the predictions.
test1

[{'column': 'event_id',
  'values': ['2270096323',
   '2270676870',
   '2259240500',
   '2259220313',
   '2259206234',
   '2270660400',
   '2270095253',
   '2259245110',
   '2270662528',
   '2259198101',
   '2270655986',
   '2269725405',
   '2270201127',
   '2269947984',
   '2269936043',
   '2270078697',
   '2269957181',
   '2270648013',
   '2269725723',
   '2269967749',
   '2270684658',
   '2270084590',
   '2270673783',
   '2270128514',
   '2269832183',
   '2269976426',
   '2269936043',
   '2270123748',
   '2270118190',
   '2269944085',
   '2270679588',
   '2270103017',
   '2259206668',
   '2270075416',
   '2270670170',
   '2270677130',
   '2270206414',
   '2270106427',
   '2269724460',
   '2270075938',
   '2269945357',
   '2270665473',
   '2270074571',
   '2259218306',
   '2270645179',
   '2259239960',
   '2259224379',
   '2270647977',
   '2270048787'],
  'avg_predictions': {'averaged_tensor': array([[-29.11158   , -18.977388  ,  -5.1936903 , -14.435608  ,
           -13.68825   ,  -

In [425]:
# Run the heuristic step over the Test 7 predictions; it prints debug lines such as
# "not a valid digit" and the date-parsing trace while it works (see output below).
output=dft_tensor.assign_heuristic_function(test1)

not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
dayFirst False
dayFirst False date 2006-06-24T11:03:08Z
dateUtil 2006-06-24 11:03:08+00:00
dayFirst False date 2007-11

In [426]:
# Fuzzy-match the column headers; the return value is displayed as the cell result and
# carries a 'fuzzyColumn' key on the columns that matched (see output below).
dft_tensor.fuzzymatchColumns(output)

event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
event_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
animal_id
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
study_site
season
sea

[{'column': 'event_id', 'classifcation': ['None']},
 {'column': 'animal_id', 'classifcation': [{'Category': 'None'}]},
 {'column': 'study_site', 'classifcation': ['None']},
 {'column': 'season', 'classifcation': ['None']},
 {'column': 'timestamp',
  'classifcation': [{'Category': 'Date',
    'Format': 'iso8601',
    'Parser': 'Util',
    'DayFirst': False}],
  'fuzzyColumn': 'Timestamp'},
 {'column': 'longitude',
  'classifcation': [{'Category': 'Geo', 'type': 'Longitude (number)'}],
  'fuzzyColumn': 'Longitude'},
 {'column': 'latitude',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}],
  'fuzzyColumn': 'Latitude'}]

In [427]:
# Test 8: build a fresh helper, load the saved classifier and predict on Test_8_kaggle.csv.
dft_tensor=DF_To_Tensors()
# torch.load unpickles the whole model object, so LSTMClassifier must already be defined
# in the kernel. NOTE(review): unpickling executes code — only load trusted .pth files.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Evaluation mode: turns off the dropout layer for inference.
model2.eval()
# Result is a list with one dict per CSV column ('column', 'values', 'avg_predictions', ...).
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_8_kaggle.csv')
# Bare last expression so the notebook displays the predictions.
test1

[{'column': 'geonameid',
  'values': ['6690786',
   '7302825',
   '1512449',
   '2515698',
   '108512',
   '1710357',
   '483551',
   '3101950',
   '5794559',
   '550280',
   '1734810',
   '3177300',
   '4166638',
   '3091831',
   '4397340',
   '3935572',
   '1261580',
   '6691831',
   '1261342',
   '569223',
   '1168412',
   '1793899',
   '3591181',
   '1508161',
   '1262240',
   '3864331',
   '1790371',
   '1619650',
   '1221714',
   '2664203',
   '1734651',
   '1817990',
   '4888892',
   '320557',
   '2853928',
   '7284885',
   '2521335',
   '1863592',
   '4514746',
   '481985',
   '3450272',
   '1181096',
   '2525790',
   '5127536',
   '9972964',
   '481985',
   '616629',
   '1859094',
   '5007531'],
  'avg_predictions': {'averaged_tensor': array([[-24.001446 , -19.39893  ,  -5.668418 , -14.497209 ,  -9.146205 ,
            -7.858959 , -17.095558 , -21.638857 ,  -3.8127851, -11.738388 ,
            -0.9869548, -11.940026 , -21.661816 ,  -6.0119796, -13.23709  ,
           -16.51137

In [428]:
# Run the heuristic step over the Test 8 predictions; it prints country/state/city lookup,
# geo-test, ISO-code and date-parsing progress lines while it works (see output below).
output=dft_tensor.assign_heuristic_function(test1)

start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
start geo test
start geo test
lng 116.82887
lng -123.43074
lng -94.5569
lng 120.73168
lng 120.97137
lng 107.44333
lng 140.05
lng 125.31667
lng 116.16869
start iso2 lookup
start iso3 lookup
start iso2 lookup
start iso2 lookup
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
not a valid digit
start geo test
lng 149.0
lng 145.0
lng 122.0
lng 93.0
lng 148.0
lng 169.0
lng 164.0
start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
dayFirst False
dayFirst False date 10/14/16
dateUtil 2016-10-14 00:00:00
dayFirst False date 2/18/16
dateUtil 2016-02-18 00:00:00
dayFirst False date 10/14/14
dateUtil 2014-10-14 00:00:00
dayFirst 

In [429]:
# Fuzzy-match the column headers; the return value is displayed as the cell result and
# carries a 'fuzzyColumn' key on the columns that matched (see output below).
dft_tensor.fuzzymatchColumns(output)

geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
geonameid
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
name
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
asciiname
alternatenames
alternatenames
alternatenames
alternatenames
alternatenames
alternatenames
alternatenames
alternatenames
alternatenames
alternatenames
alternatenames
alternaten

[{'column': 'geonameid', 'classifcation': ['None']},
 {'column': 'name', 'classifcation': [{'Category': 'City Name'}]},
 {'column': 'asciiname', 'classifcation': [{'Category': 'City Name'}]},
 {'column': 'alternatenames', 'classifcation': ['Proper Noun']},
 {'column': 'latitude',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}],
  'fuzzyColumn': 'Latitude'},
 {'column': 'longitude',
  'classifcation': [{'Category': 'Geo', 'type': 'Longitude (number)'}],
  'fuzzyColumn': 'Longitude'},
 {'column': 'feature class', 'classifcation': ['Unknown code']},
 {'column': 'feature code', 'classifcation': ['Unknown code']},
 {'column': 'country code',
  'classifcation': [{'Category': 'ISO2'}],
  'fuzzyColumn': 'CountryCode'},
 {'column': 'cc2', 'classifcation': ['None']},
 {'column': 'admin1 code', 'classifcation': [{'Category': 'None'}]},
 {'column': 'admin2 code', 'classifcation': ['None']},
 {'column': 'admin3 code', 'classifcation': ['None']},
 {'column': 'admin4 code', 'cla

In [430]:
# Test 9: build a fresh helper, load the saved classifier and predict on Test_9_kaggle.csv.
dft_tensor=DF_To_Tensors()
# torch.load unpickles the whole model object, so LSTMClassifier must already be defined
# in the kernel. NOTE(review): unpickling executes code — only load trusted .pth files.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Evaluation mode: turns off the dropout layer for inference.
model2.eval()
# Result is a list with one dict per CSV column ('column', 'values', 'avg_predictions', ...).
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_9_kaggle.csv')
# Bare last expression so the notebook displays the predictions.
test1

[{'column': 'Rank',
  'values': ['20',
   '50',
   '17',
   '16',
   '39',
   '49',
   '25',
   '35',
   '33',
   '45',
   '12',
   '43',
   '19',
   '39',
   '16',
   '42',
   '44',
   '50',
   '45',
   '35',
   '32',
   '43',
   '5',
   '26',
   '41',
   '30',
   '3',
   '47',
   '14',
   '31',
   '24',
   '50',
   '47',
   '39',
   '34',
   '33',
   '42',
   '48',
   '1',
   '4',
   '30',
   '37',
   '35',
   '3',
   '12',
   '36',
   '14',
   '25',
   '7'],
  'avg_predictions': {'averaged_tensor': array([[-15.094974 , -21.421902 , -10.716511 ,  -8.314037 ,  -8.149385 ,
           -11.129508 , -17.99052  , -19.156961 ,  -8.981533 ,  -1.8884243,
            -6.8089247, -11.014648 , -18.395702 , -10.401285 , -11.165259 ,
           -16.428493 , -12.69373  , -15.38595  , -18.107597 ,  -9.461809 ,
           -17.264366 , -16.548    , -12.251311 , -11.77058  , -11.319714 ,
           -10.4890585, -13.15642  , -11.141179 , -12.119599 , -12.570188 ,
           -12.632447 , -10.399108 , -11

In [431]:
# Run the heuristic step over the Test 9 predictions. NOTE(review): the trace below shows
# percentage strings (e.g. "28.2%") being tried as dates and failing to parse.
output=dft_tensor.assign_heuristic_function(test1)

start country lookup
start state lookup
Starting fuzzy match on cities...
start city lookup
dayFirst True
dayFirst True date 28.2%
Unknown string format: 28.2%
dayFirst True date 28.2%
Unknown string format: 28.2%
dayFirst True date 28.0%
Unknown string format: 28.0%
dayFirst True date 130.5%
Unknown string format: 130.5%
dayFirst True date 36.8%
Unknown string format: 36.8%
dayFirst True date 32.5%
Unknown string format: 32.5%
dayFirst True date 30.1%
Unknown string format: 30.1%
dayFirst True date 23.0%
Unknown string format: 23.0%
dayFirst True date 34.5%
Unknown string format: 34.5%
dayFirst True date 79.7%
Unknown string format: 79.7%
dayFirst True date 34.5%
Unknown string format: 34.5%
dayFirst True date 130.5%
Unknown string format: 130.5%
dayFirst True date 16.0%
Unknown string format: 16.0%
dayFirst True date 30.8%
Unknown string format: 30.8%
dayFirst True date 17.6%
Unknown string format: 17.6%
dayFirst True date 16.0%
Unknown string format: 16.0%
dayFirst True date 79.7%
U

In [432]:
# Fuzzy-match the column headers; the return value is displayed as the cell result and
# carries a 'fuzzyColumn' key on the columns that matched (see output below).
dft_tensor.fuzzymatchColumns(output)

Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Rank
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Restaurant
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
cc Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
Sales
S

[{'column': 'Rank', 'classifcation': [{'Category': 'None'}]},
 {'column': 'Restaurant', 'classifcation': ['Proper Noun']},
 {'column': 'Location', 'classifcation': ['None'], 'fuzzyColumn': 'Location'},
 {'column': 'Sales', 'classifcation': [{'Category': 'None'}]},
 {'column': 'YOY_Sales', 'classifcation': ['Unknown Date']},
 {'column': 'Units', 'classifcation': [{'Category': 'None'}]},
 {'column': 'YOY_Units', 'classifcation': ['Unknown Date']},
 {'column': 'Unit_Volume', 'classifcation': ['None']},
 {'column': 'Franchising', 'classifcation': ['None']}]

In [461]:
# Test 10: build a fresh helper, load the saved classifier and predict on Test_10_kaggle.csv.
dft_tensor=DF_To_Tensors()
# torch.load unpickles the whole model object, so LSTMClassifier must already be defined
# in the kernel. NOTE(review): unpickling executes code — only load trusted .pth files.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Evaluation mode: turns off the dropout layer for inference.
model2.eval()
# Result is a list with one dict per CSV column ('column', 'values', 'avg_predictions', ...).
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_10_kaggle.csv')
# Bare last expression so the notebook displays the predictions.
test1

[{'column': 'Date',
  'values': ['2009-05-30',
   '2010-07-11',
   '2010-09-25',
   '2014-10-21',
   '2010-11-16',
   '2011-08-30',
   '2014-09-04',
   '2016-12-12',
   '2016-06-05',
   '2013-06-09',
   '2013-05-28',
   '2011-06-12',
   '2011-01-07',
   '2010-07-02',
   '2009-05-10',
   '2010-04-08',
   '2014-07-15',
   '2013-05-04',
   '2013-03-07',
   '2014-02-04',
   '2010-01-08',
   '2012-06-25',
   '2012-04-24',
   '2010-06-11',
   '2012-11-19',
   '2015-04-29',
   '2017-04-20',
   '2011-07-14',
   '2016-04-05',
   '2015-04-30',
   '2013-05-20',
   '2016-03-10',
   '2015-05-10',
   '2015-06-05',
   '2009-01-20',
   '2014-11-21',
   '2010-11-30',
   '2015-08-04',
   '2015-12-23',
   '2010-05-08',
   '2009-03-12',
   '2012-08-02',
   '2016-05-09',
   '2010-08-01',
   '2017-01-21',
   '2014-01-17',
   '2013-10-26',
   '2016-06-17',
   '2017-01-27'],
  'avg_predictions': {'averaged_tensor': array([[-2.16635246e+01, -1.38794956e+01, -1.70828056e+01,
           -1.82350292e+01, -1.93468

In [462]:
# Run the heuristic step over the Test 10 predictions; it prints a dayFirst/dateUtil
# trace for each date value it parses (see output below).
output=dft_tensor.assign_heuristic_function(test1)

dayFirst False
dayFirst False date 2009-05-30
dateUtil 2009-05-30 00:00:00
dayFirst False date 2010-07-11
dateUtil 2010-07-11 00:00:00
dayFirst False date 2010-09-25
dateUtil 2010-09-25 00:00:00
dayFirst False date 2014-10-21
dateUtil 2014-10-21 00:00:00
dayFirst False date 2010-11-16
dateUtil 2010-11-16 00:00:00
dayFirst False date 2011-08-30
dateUtil 2011-08-30 00:00:00
dayFirst False date 2014-09-04
dateUtil 2014-09-04 00:00:00
dayFirst False date 2016-12-12
dateUtil 2016-12-12 00:00:00
dayFirst False date 2016-06-05
dateUtil 2016-06-05 00:00:00
dayFirst False date 2013-06-09
dateUtil 2013-06-09 00:00:00
dayFirst False date 2013-05-28
dateUtil 2013-05-28 00:00:00
dayFirst False date 2011-06-12
dateUtil 2011-06-12 00:00:00
dayFirst False date 2011-01-07
dateUtil 2011-01-07 00:00:00
dayFirst False date 2010-07-02
dateUtil 2010-07-02 00:00:00
dayFirst False date 2009-05-10
dateUtil 2009-05-10 00:00:00
dayFirst False date 2010-04-08
dateUtil 2010-04-08 00:00:00
dayFirst False date 2014-

In [463]:
# Fuzzy-match the column headers; the return value is displayed as the cell result and
# carries a 'fuzzyColumn' key on the columns that matched (see output below).
dft_tensor.fuzzymatchColumns(output)

Date
cc Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
cc Location
Location
cc location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxT

[{'column': 'Date',
  'classifcation': [{'Category': 'Date',
    'Format': 'y-MM-dd',
    'Parser': 'Util',
    'DayFirst': False}],
  'fuzzyColumn': 'Date'},
 {'column': 'Location',
  'classifcation': [{'Category': 'City Name'}],
  'fuzzyColumn': 'location'},
 {'column': 'MinTemp',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}]},
 {'column': 'MaxTemp',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}]},
 {'column': 'Rainfall', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'Evaporation', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'Sunshine', 'classifcation': ['None']},
 {'column': 'WindGustDir', 'classifcation': ['Unknown code']},
 {'column': 'WindGustSpeed', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'WindDir9am', 'classifcation': ['Unknown code']},
 {'column': 'WindDir3pm', 'classifcation': ['Unknown code']},
 {'column': 'WindSpeed9am', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'WindSpeed3pm'

In [451]:
# Test 10 again (same CSV as the In [461] cell, lower execution count, so run out of order).
# NOTE(review): the displayed 'values' differ between the two runs, so predictions()
# presumably samples rows at random — confirm, and seed it if reproducibility matters.
dft_tensor=DF_To_Tensors()
# torch.load unpickles the whole model object, so LSTMClassifier must already be defined
# in the kernel. NOTE(review): unpickling executes code — only load trusted .pth files.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Evaluation mode: turns off the dropout layer for inference.
model2.eval()
# Result is a list with one dict per CSV column ('column', 'values', 'avg_predictions', ...).
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/Test_10_kaggle.csv')
# Bare last expression so the notebook displays the predictions.
test1

[{'column': 'Date',
  'values': ['2014-06-05',
   '2014-11-14',
   '2017-03-22',
   '2016-08-13',
   '2009-07-24',
   '2014-01-24',
   '2009-04-30',
   '2015-04-19',
   '2011-06-06',
   '2017-05-24',
   '2014-01-14',
   '2015-04-01',
   '2015-03-16',
   '2010-04-20',
   '2014-05-22',
   '2016-09-20',
   '2009-08-17',
   '2012-03-06',
   '2016-06-07',
   '2014-12-01',
   '2013-07-17',
   '2012-07-26',
   '2014-09-12',
   '2017-01-30',
   '2012-05-22',
   '2009-01-30',
   '2015-03-06',
   '2010-12-02',
   '2012-03-01',
   '2011-08-28',
   '2017-04-20',
   '2017-01-01',
   '2016-01-09',
   '2011-12-24',
   '2014-10-23',
   '2012-05-24',
   '2015-08-16',
   '2012-03-26',
   '2012-02-24',
   '2010-11-19',
   '2009-10-01',
   '2016-01-10',
   '2009-07-27',
   '2013-05-06',
   '2010-12-07',
   '2016-02-05',
   '2017-02-05',
   '2015-10-03',
   '2017-05-20'],
  'avg_predictions': {'averaged_tensor': array([[-2.19471359e+01, -1.40542002e+01, -1.70478020e+01,
           -1.83769054e+01, -1.92865

In [452]:
# Run the heuristic step over the Test 10 predictions; it prints a dayFirst/dateUtil
# trace for each date value it parses (see output below).
output=dft_tensor.assign_heuristic_function(test1)

dayFirst False
dayFirst False date 2014-06-05
dateUtil 2014-06-05 00:00:00
dayFirst False date 2014-11-14
dateUtil 2014-11-14 00:00:00
dayFirst False date 2017-03-22
dateUtil 2017-03-22 00:00:00
dayFirst False date 2016-08-13
dateUtil 2016-08-13 00:00:00
dayFirst False date 2009-07-24
dateUtil 2009-07-24 00:00:00
dayFirst False date 2014-01-24
dateUtil 2014-01-24 00:00:00
dayFirst False date 2009-04-30
dateUtil 2009-04-30 00:00:00
dayFirst False date 2015-04-19
dateUtil 2015-04-19 00:00:00
dayFirst False date 2011-06-06
dateUtil 2011-06-06 00:00:00
dayFirst False date 2017-05-24
dateUtil 2017-05-24 00:00:00
dayFirst False date 2014-01-14
dateUtil 2014-01-14 00:00:00
dayFirst False date 2015-04-01
dateUtil 2015-04-01 00:00:00
dayFirst False date 2015-03-16
dateUtil 2015-03-16 00:00:00
dayFirst False date 2010-04-20
dateUtil 2010-04-20 00:00:00
dayFirst False date 2014-05-22
dateUtil 2014-05-22 00:00:00
dayFirst False date 2016-09-20
dateUtil 2016-09-20 00:00:00
dayFirst False date 2009-

In [458]:
# Fuzzy-match the column headers; the return value is displayed as the cell result and
# carries a 'fuzzyColumn' key on the columns that matched (see output below).
dft_tensor.fuzzymatchColumns(output)

Date
cc Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Date
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
cc Location
Location
cc location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
Location
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MinTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxTemp
MaxT

[{'column': 'Date',
  'classifcation': [{'Category': 'Date',
    'Format': 'y-MM-dd',
    'Parser': 'Util',
    'DayFirst': False}],
  'fuzzyColumn': 'Date'},
 {'column': 'Location',
  'classifcation': ['Proper Noun'],
  'fuzzyColumn': 'location'},
 {'column': 'MinTemp',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}]},
 {'column': 'MaxTemp', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'Rainfall',
  'classifcation': [{'Category': 'Geo', 'type': 'Latitude (number)'}]},
 {'column': 'Evaporation', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'Sunshine', 'classifcation': ['None']},
 {'column': 'WindGustDir', 'classifcation': ['Unknown code']},
 {'column': 'WindGustSpeed', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'WindDir9am', 'classifcation': ['Unknown code']},
 {'column': 'WindDir3pm', 'classifcation': ['Unknown code']},
 {'column': 'WindSpeed9am', 'classifcation': [{'Category': 'Number'}]},
 {'column': 'WindSpeed3pm',
  'classif

In [1064]:
# Small smoke test: build a fresh helper, load the saved classifier and predict on small_test.csv.
dft_tensor=DF_To_Tensors()
# torch.load unpickles the whole model object, so LSTMClassifier must already be defined
# in the kernel. NOTE(review): unpickling executes code — only load trusted .pth files.
model2 = torch.load('./models/LSTM_RNN_Geotime_Classify_v_0.06.pth')
# Evaluation mode: turns off the dropout layer for inference.
model2.eval()
# In this run predictions() also prints debug lines (date_dataset / topcat / minv) per column.
test1=dft_tensor.predictions(model=model2, path_to_csv='datasets/small_test.csv')
# Bare last expression so the notebook displays the predictions.
test1

date_dataset ['Bani Yas City', 'Dibba Al-Hisn', 'Sar-e Pul', 'Al Ain City', 'Aibak', 'Sharjah', 'Ajman City', 'Sang-e Charak', 'Khawr Fakkan', 'Dubai', 'Dibba Al-Fujairah', 'Zaranj', 'Al Ain City', 'les Escaldes', 'Shindand', 'Ar Ruways', 'Adh Dhayd', 'les Escaldes', 'Pul-e Khumri', 'Pul-e Khumri', 'Dibba Al-Hisn', 'Al Ain City', 'Bani Yas City', 'Adh Dhayd', 'Ajman City', 'Shindand', 'Al Shamkhah City', 'Sang-e Charak', 'Shahrak', 'Shindand', 'Aibak', 'Paghman', 'Taloqan', 'Maymana', 'Aibak', 'Nahrin', 'Musaffah', 'Khalifah A City', 'Qarqin', 'Dibba Al-Hisn', 'Sharjah', 'Abu Dhabi', 'Dibba Al-Fujairah', 'Nahrin', 'Ras Al Khaimah City', 'Lashkar Gah', 'Ras Al Khaimah City', 'Ras Al Khaimah City', 'Abu Dhabi']
topcat country_name
minv -2.2330832
date_dataset ['14', '35', '34', '12', '49', '51', '19', '16', '13', '30', '41', '13', '47', '14', '49', '34', '21', '30', '22', '22', '27', '12', '44', '29', '17', '31', '16', '27', '47', '42', '22', '31', '39', '37', '26', '18', '22', '42', '25

[{'column': 'city',
  'values': ['Bani Yas City',
   'Dibba Al-Hisn',
   'Sar-e Pul',
   'Al Ain City',
   'Aibak',
   'Sharjah',
   'Ajman City',
   'Sang-e Charak',
   'Khawr Fakkan',
   'Dubai',
   'Dibba Al-Fujairah',
   'Zaranj',
   'Al Ain City',
   'les Escaldes',
   'Shindand',
   'Ar Ruways',
   'Adh Dhayd',
   'les Escaldes',
   'Pul-e Khumri',
   'Pul-e Khumri',
   'Dibba Al-Hisn',
   'Al Ain City',
   'Bani Yas City',
   'Adh Dhayd',
   'Ajman City',
   'Shindand',
   'Al Shamkhah City',
   'Sang-e Charak',
   'Shahrak',
   'Shindand',
   'Aibak',
   'Paghman',
   'Taloqan',
   'Maymana',
   'Aibak',
   'Nahrin',
   'Musaffah',
   'Khalifah A City',
   'Qarqin',
   'Dibba Al-Hisn',
   'Sharjah',
   'Abu Dhabi',
   'Dibba Al-Fujairah',
   'Nahrin',
   'Ras Al Khaimah City',
   'Lashkar Gah',
   'Ras Al Khaimah City',
   'Ras Al Khaimah City',
   'Abu Dhabi'],
  'avg_predictions': {'averaged_tensor': array([[ -4.877091 ,  -2.9607747, -19.05976  , -15.662936 , -22.085571 ,
   

In [1065]:
# Run the heuristic step over the small-test predictions and display the returned
# per-column classifications. This run prints a line for every value/state comparison,
# which floods the output. NOTE(review): consider silencing these debug prints.
dft_tensor.assign_heuristic_function(test1)

pred
['Bani Yas City', 'Dibba Al-Hisn', 'Sar-e Pul', 'Al Ain City', 'Aibak', 'Sharjah', 'Ajman City', 'Sang-e Charak', 'Khawr Fakkan', 'Dubai', 'Dibba Al-Fujairah', 'Zaranj', 'Al Ain City', 'les Escaldes', 'Shindand', 'Ar Ruways', 'Adh Dhayd', 'les Escaldes', 'Pul-e Khumri', 'Pul-e Khumri', 'Dibba Al-Hisn', 'Al Ain City', 'Bani Yas City', 'Adh Dhayd', 'Ajman City', 'Shindand', 'Al Shamkhah City', 'Sang-e Charak', 'Shahrak', 'Shindand', 'Aibak', 'Paghman', 'Taloqan', 'Maymana', 'Aibak', 'Nahrin', 'Musaffah', 'Khalifah A City', 'Qarqin', 'Dibba Al-Hisn', 'Sharjah', 'Abu Dhabi', 'Dibba Al-Fujairah', 'Nahrin', 'Ras Al Khaimah City', 'Lashkar Gah', 'Ras Al Khaimah City', 'Ras Al Khaimah City', 'Abu Dhabi']
<class 'numpy.ndarray'>
['Afghanistan' 'Åland Islands' 'Albania' 'Algeria' 'American Samoa'
 'Andorra' 'Angola' 'Anguilla' 'Antigua and Barbuda' 'Argentina' 'Armenia'
 'Aruba' 'Australia' 'Austria' 'Azerbaijan' 'Bahamas' 'Bahrain'
 'Bangladesh' 'Barbados' 'Belarus' 'Belgium' 'Belize' 'Ben

value Al Ain City state Rhode Island
value Al Ain City state South Carolina
value Al Ain City state South Dakota
value Al Ain City state Tennessee
value Al Ain City state Texas
value Al Ain City state Utah
value Al Ain City state Virginia
value Al Ain City state Virgin Islands
value Al Ain City state Vermont
value Al Ain City state Washington
value Al Ain City state Wisconsin
value Al Ain City state West Virginia
value Al Ain City state Wyoming
value Al Ain City state  Alberta
value Al Ain City state  British Columbia
value Al Ain City state Manitoba
value Al Ain City state New Brunswick
value Al Ain City state Newfoundland
value Al Ain City state Northwest Territories
value Al Ain City state Nova Scotia
value Al Ain City state Nunavut
value Al Ain City state Ontario
value Al Ain City state Prince Edward Island
value Al Ain City state Quebec
value Al Ain City state Saskatchewan
value Al Ain City state Yukon
value les Escaldes state Alabama
value les Escaldes state Alaska
value les Esca

value Al Shamkhah City state North Dakota
value Al Shamkhah City state Northern Mariana Islands
value Al Shamkhah City state Ohio
value Al Shamkhah City state Oklahoma
value Al Shamkhah City state Oregon
value Al Shamkhah City state Pennsylvania
value Al Shamkhah City state Puerto Rico
value Al Shamkhah City state Palau
value Al Shamkhah City state Rhode Island
value Al Shamkhah City state South Carolina
value Al Shamkhah City state South Dakota
value Al Shamkhah City state Tennessee
value Al Shamkhah City state Texas
value Al Shamkhah City state Utah
value Al Shamkhah City state Virginia
value Al Shamkhah City state Virgin Islands
value Al Shamkhah City state Vermont
value Al Shamkhah City state Washington
value Al Shamkhah City state Wisconsin
value Al Shamkhah City state West Virginia
value Al Shamkhah City state Wyoming
value Al Shamkhah City state  Alberta
value Al Shamkhah City state  British Columbia
value Al Shamkhah City state Manitoba
value Al Shamkhah City state New Brunswic

value Sharjah state Montana
value Sharjah state Nebraska
value Sharjah state Nevada
value Sharjah state New Hampshire
value Sharjah state New Jersey
value Sharjah state New Mexico
value Sharjah state New York
value Sharjah state North Carolina
value Sharjah state North Dakota
value Sharjah state Northern Mariana Islands
value Sharjah state Ohio
value Sharjah state Oklahoma
value Sharjah state Oregon
value Sharjah state Pennsylvania
value Sharjah state Puerto Rico
value Sharjah state Palau
value Sharjah state Rhode Island
value Sharjah state South Carolina
value Sharjah state South Dakota
value Sharjah state Tennessee
value Sharjah state Texas
value Sharjah state Utah
value Sharjah state Virginia
value Sharjah state Virgin Islands
value Sharjah state Vermont
value Sharjah state Washington
value Sharjah state Wisconsin
value Sharjah state West Virginia
value Sharjah state Wyoming
value Sharjah state  Alberta
value Sharjah state  British Columbia
value Sharjah state Manitoba
value Sharjah

Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fitttttttttttttttttttttt
Great Fi

value Vermont state Marshall Islands
value Vermont state Maryland
value Vermont state Massachusetts
value Vermont state Michigan
value Vermont state Minnesota
value Vermont state Mississippi
value Vermont state Missouri
value Vermont state Montana
value Vermont state Nebraska
value Vermont state Nevada
value Vermont state New Hampshire
value Vermont state New Jersey
value Vermont state New Mexico
value Vermont state New York
value Vermont state North Carolina
value Vermont state North Dakota
value Vermont state Northern Mariana Islands
value Vermont state Ohio
value Vermont state Oklahoma
value Vermont state Oregon
value Vermont state Pennsylvania
value Vermont state Puerto Rico
value Vermont state Palau
value Vermont state Rhode Island
value Vermont state South Carolina
value Vermont state South Dakota
value Vermont state Tennessee
value Vermont state Texas
value Vermont state Utah
value Vermont state Virginia
value Vermont state Virgin Islands
value Vermont state Vermont
Great Fitttt

value Hawaii state Prince Edward Island
value Hawaii state Quebec
value Hawaii state Saskatchewan
value Hawaii state Yukon
value Oaklahoma state Alabama
value Oaklahoma state Alaska
value Oaklahoma state America Samoa
value Oaklahoma state Arizona
value Oaklahoma state Arkansas
value Oaklahoma state California
value Oaklahoma state Colorado
value Oaklahoma state Connecticut
value Oaklahoma state Delaware
value Oaklahoma state District of Columbia
value Oaklahoma state Micronesia
value Oaklahoma state Florida
value Oaklahoma state Georgia
value Oaklahoma state Guam
value Oaklahoma state Hawaii
value Oaklahoma state Idaho
value Oaklahoma state Illinois
value Oaklahoma state Indiana
value Oaklahoma state Iowa
value Oaklahoma state Kansas
value Oaklahoma state Kentucky
value Oaklahoma state Louisiana
value Oaklahoma state Maine
value Oaklahoma state Marshall Islands
value Oaklahoma state Maryland
value Oaklahoma state Massachusetts
value Oaklahoma state Michigan
value Oaklahoma state Minne

value Oregon state Minnesota
value Oregon state Mississippi
value Oregon state Missouri
value Oregon state Montana
value Oregon state Nebraska
value Oregon state Nevada
value Oregon state New Hampshire
value Oregon state New Jersey
value Oregon state New Mexico
value Oregon state New York
value Oregon state North Carolina
value Oregon state North Dakota
value Oregon state Northern Mariana Islands
value Oregon state Ohio
value Oregon state Oklahoma
value Oregon state Oregon
Great Fitttttttttttttttttttttt
value Oregon state Pennsylvania
value Oregon state Puerto Rico
value Oregon state Palau
value Oregon state Rhode Island
value Oregon state South Carolina
value Oregon state South Dakota
value Oregon state Tennessee
value Oregon state Texas
value Oregon state Utah
value Oregon state Virginia
value Oregon state Virgin Islands
value Oregon state Vermont
value Oregon state Washington
value Oregon state Wisconsin
value Oregon state West Virginia
value Oregon state Wyoming
value Oregon state 

value alabama state New Jersey
value alabama state New Mexico
value alabama state New York
value alabama state North Carolina
value alabama state North Dakota
value alabama state Northern Mariana Islands
value alabama state Ohio
value alabama state Oklahoma
value alabama state Oregon
value alabama state Pennsylvania
value alabama state Puerto Rico
value alabama state Palau
value alabama state Rhode Island
value alabama state South Carolina
value alabama state South Dakota
value alabama state Tennessee
value alabama state Texas
value alabama state Utah
value alabama state Virginia
value alabama state Virgin Islands
value alabama state Vermont
value alabama state Washington
value alabama state Wisconsin
value alabama state West Virginia
value alabama state Wyoming
value alabama state  Alberta
value alabama state  British Columbia
value alabama state Manitoba
value alabama state New Brunswick
value alabama state Newfoundland
value alabama state Northwest Territories
value alabama state No

[{'column': 'city', 'classifcation': 'City Name'},
 {'column': 'Test', 'classifcation': 'None'},
 {'column': 'states', 'classifcation': 'State Name'}]

In [41]:

# for later use
def randomChoice(self, values):
    """Return one element of ``values`` chosen at a random index.

    Args:
        self: accepted for method-style use; not read by this function.
        values: an indexable sequence that supports ``len()``.

    Returns:
        The element found at the randomly drawn index.

    Raises:
        ValueError: raised by ``random.randint`` when ``values`` is empty.
    """
    last_index = len(values) - 1
    # randint includes both end points, so the upper bound is the last valid index.
    picked_index = random.randint(0, last_index)
    return values[picked_index]

def getRandomSet(self):
    """Draw a random category and a random value belonging to it.

    Reads ``self.all_categories`` to pick the category, then looks up
    ``self.category_values[category]['obj']``, materialises it as a list and
    picks one entry. Both picks go through ``self.randomChoice``.

    Returns:
        tuple: ``(line, category)`` — the sampled value followed by its category.
    """
    chosen_category = self.randomChoice(self.all_categories)
    # list(...) gives randomChoice something it can index into.
    candidates = list(self.category_values[chosen_category]['obj'])
    sampled_line = self.randomChoice(candidates)
    return (sampled_line, chosen_category)