In [1]:
from typing import List, Optional
from pydantic import BaseModel,Field
import pandas as pd
import json
from geotime_classify import geotime_classify as gc

## geotime_classify schema
geotime_classify returns classification data in a logical and reliable way. For each file there is a `Classifications` model, which holds a list containing one `Classification` per column. The `Classification` model describes the classification of a single column of the dataframe. The `fuzzyColumn` model is only defined when the column header matches one of the words of interest that geotime_classify looks for.

In [3]:
# geotime_classify pydantic models
from enum import Enum, IntEnum
from pydantic import BaseModel, constr, Field
from typing import List, Optional, Literal

class fuzzyCategory(str, Enum):
    """
    Categories that can be captured by fuzzy matching on a column header.

    Every member is a ``str`` whose value is identical to its name, so members
    can be looked up by value (``fuzzyCategory("Latitude")``) and compared
    directly with plain strings.
    """
    # Time related headers
    Date = "Date"
    Datetime = "Datetime"
    Timestamp = "Timestamp"
    Epoch = "Epoch"
    Time = "Time"
    Year = "Year"
    Month = "Month"
    # Coordinate / location related headers
    Latitude = "Latitude"
    Longitude = "Longitude"
    Geo = "Geo"
    Coordinates = "Coordinates"
    Location = "Location"
    West = "West"
    South = "South"
    East = "East"
    North = "North"
    # Administrative area related headers
    Country = "Country"
    CountryName = "CountryName"
    CC = "CC"
    CountryCode = "CountryCode"
    State = "State"
    City = "City"
    Town = "Town"
    Region = "Region"
    Province = "Province"
    Territory = "Territory"
    Address = "Address"
    # ISO2 must be assigned with "=": a bare annotation (`ISO2: "ISO2"`) does
    # not create an enum member.
    ISO2 = "ISO2"
    ISO3 = "ISO3"
    ISO_code = "ISO_code"
    Results = "Results"

class category(str, Enum):
    """
    General (coarse) classification assigned to a column.

    Members are plain strings whose value equals the member name.
    """
    geo = "geo"
    time = "time"
    boolean = "boolean"
    unknown_date = "unknown_date"

class subcategory(str, Enum):
    """
    Finer-grained classification of a column than ``category`` provides.

    Members are plain strings whose value equals the member name.
    """
    city_name = "city_name"
    state_name = "state_name"
    country_name = "country_name"
    ISO3 = "ISO3"
    ISO2 = "ISO2"
    continent = "continent"
    longitude = "longitude"
    latitude = "latitude"
    date = "date"

class fuzzyColumn(BaseModel):
    """
    Result of fuzzy-matching a column header against the lookup words.

    Only defined when a column header matches a word we are looking for; the
    matched word is then mapped to a fuzzyCategory used to classify the column.

    Attributes:
        matchedKey: lookup word that matched the column header (None if unset).
        fuzzyCategory: category the matched word maps to.
        ratio: fuzzy match ratio; 100 for an exact match (None if unset).
    """
    # Defaults are None, so the annotations must be Optional to agree with them.
    matchedKey: Optional[str] = Field(default=None, description='This is the word that was matched with the column header. If a column header was Lat, it would match with the the matchedKey of Lat, since it is one of the lookup words. In this case the fuzzyCategory would be returned as "Latitude".')
    # NOTE: this field has the same name as the fuzzyCategory enum. Do not give it
    # a class-level value (`= None` / `= Field(...)`): Python binds the value to
    # the name before evaluating the annotation, which would then no longer refer
    # to the enum.
    fuzzyCategory: Optional[fuzzyCategory]
    ratio: Optional[int] = Field(default=None, description='Ratio of the fuzzy match. If it was an exact match it would be 100')

class Parser(str, Enum):
    """
    Python library a date was parsed with: dateutil or arrow.

    Members are plain strings whose value equals the member name.
    """
    Util = "Util"
    arrow = "arrow"

class Classification(BaseModel):
    """
    Classification information for one column.

    Attributes:
        column: column name (None if unset).
        category: general classification of the column.
        subcategory: finer-grained classification of the column.
        format: the date represented in strftime format (None if unset).
        match_type: how the column was matched; each entry is "LSTM" or "fuzzy".
        Parser: library the date was parsed with.
        DayFirst: whether the day comes first in the date format (None if unset).
        fuzzyColumn: fuzzy header match details, when the header matched.
    """
    # Defaults are None, so the annotations must be Optional to agree with them.
    column: Optional[str] = Field(default=None, description='column name')
    # NOTE: category, subcategory, Parser and fuzzyColumn share their names with
    # the types they are annotated with. Do not give them a class-level value
    # (`= None` / `= Field(...)`): Python binds the value to the name before
    # evaluating the annotation, which would then no longer refer to the type.
    category: Optional[category]
    subcategory: Optional[subcategory]
    format: Optional[str] = Field(default=None, description='the date represented in strftime format')
    match_type: List[Literal["LSTM", "fuzzy"]]
    Parser: Optional[Parser]
    DayFirst: Optional[bool] = Field(default=None, description='Boolean: if day is first in date format' )
    fuzzyColumn: Optional[fuzzyColumn]


class Classifications(BaseModel):
    """
    Container for a list of Classification objects.

    This is what is returned from geotime_classify.
    """
    classifications: List[Classification]




In [4]:
# The CSV file used for this example (first rows only)
pd.read_csv('example_4.csv').head()

Unnamed: 0,country,value,Latitude,Longitude,Y/d/m,month_name,month_number,year,Day_name,Day Number,bool
0,Cote d'Ivoire,1,45.499494,12.039393,2018/15/1,January,1,1888,Wednesday,1,T
1,Ghana,2,6.944658,39.240347,2018/15/2,February,2,1987,Monday,2,T
2,Liberia,3,7.944658,40.240347,2018/15/1,January,3,2003,Tuesday,3,F
3,Morocco,4,8.944658,41.240347,2018/15/1,February,4,2004,Thursday,4,T
4,Burkina Faso,5,9.944658,42.240347,2018/15/1,March,5,2005,Friday,5,T


In [9]:
# Instantiate the GeoTimeClassify class and run the model over a CSV file.
# NOTE(review): 20 is passed positionally to the constructor; presumably the
# number of rows sampled per column — confirm against the geotime_classify docs.
gc_instantiated=gc.GeoTimeClassify(20)
# NOTE(review): hardcoded absolute local path, and not the example_4.csv that is
# previewed in the previous cell — confirm which file this example should use.
preds=gc_instantiated.columns_classified('/home/kyle/Desktop/blank.csv')


Start LSTM predictions ...
Start boolean validation ...
Start month validation ...
Start day validation ...


In [10]:
# Display the raw classification output (one entry per column)
preds


[{'column': 'one', 'classification': [{'Category': 'Day Number'}]},
 {'column': 'two', 'classification': [{'Category': 'Day Number'}]},
 {'column': 'blank', 'classification': [{'Category': 'None'}]},
 {'column': 'five', 'classification': [{'Category': 'None'}]},
 {'column': 'bank3', 'classification': [{'Category': 'None'}]},
 {'column': 'NA', 'classification': [{'Category': 'None'}]},
 {'column': 'Null', 'classification': [{'Category': 'None'}]},
 {'column': 'nan', 'classification': [{'Category': 'None'}]},
 {'column': 'Latitude',
  'classification': [{'Category': 'Day Number'}],
  'fuzzyColumn': 'Latitude'}]

In [8]:
# Pretty-print the predictions as indented JSON.
# NOTE(review): the AttributeError recorded below this cell ('list' object has no
# attribute 'dict') cannot come from this line, which never accesses `.dict`;
# the saved output is stale — re-run the cell to refresh it.
print(json.dumps(preds, indent=2))

AttributeError: 'list' object has no attribute 'dict'