In [4]:
import pandas as pd
from config import api_config

## Data Types source 1: metadata downloaded from site (.xlsx)

In [5]:
# Load the data dictionary spreadsheet into a DataFrame.
# NOTE(review): the section heading says .xlsx but the path read here is an
# .ods file — confirm which format is intended.
data_dict = pd.read_excel("../Files/FIR-0001_DataDictionary_fire-incidents__OK.ods")

In [6]:
# Preview the first three rows of the data dictionary.
data_dict.head(3)

Unnamed: 0,Field Name,Data Type,Definition,Notes (optional)
0,Incident Number,Text,A unique 8-digit number assigned by DEM to thi...,
1,Exposure Number,Numeric,A sequence number indicates the order in which...,
2,Address,Text,Address of the incident,


In [7]:
# Field names and the distinct declared data types from the data dictionary.
datadict_cols = [*data_dict["Field Name"]]
datadict_datatypes = [*data_dict["Data Type"].unique()]
print(
    f"Number of cols: {len(datadict_cols)}",
    f"Data Types: {datadict_datatypes}",
    sep="\n",
)

Number of cols: 63
Data Types: ['Text', 'Numeric', 'Date & Time', 'Coordinates']


## Data Types source 2: Socrata metadata API

In [8]:
import requests

In [9]:
# The app token is read from the imported api_config mapping.
APP_TOKEN = api_config["app_token"]

# Headers passed along with the Socrata request.
headers = dict([
    ("X-App-Token", APP_TOKEN),
    ("Content-Type", "application/json"),
])

In [10]:
DATASET_ID = "wr8u-xric"
METADATA_QUERY = f"?ids={DATASET_ID}"
# Use HTTPS so the X-App-Token header is not transmitted in clear text.
METADATA_URL = "https://api.us.socrata.com/api/catalog/v1"

# source 2: Socrata metadata request
metadata_req = requests.get(
    f"{METADATA_URL}{METADATA_QUERY}",
    headers=headers,
    timeout=30,  # do not let the cell hang forever on a stalled connection
)
# Surface HTTP errors here rather than as a confusing JSON/KeyError later on.
metadata_req.raise_for_status()

metadata = metadata_req.json()

In [11]:
# The first catalog result carries the per-column name and datatype lists.
resource_metadata = metadata["results"][0]["resource"]
metadata_cols = resource_metadata["columns_name"]
metadata_datatypes = resource_metadata["columns_datatype"]
print(
    f"Number of cols: {len(metadata_cols)}",
    f"Data Types: {set(metadata_datatypes)}",
    sep="\n",
)

Number of cols: 64
Data Types: {'Text', 'Number', 'Point'}


In [12]:
# Two-way comparison of column names between the API metadata and the
# data dictionary (exact string match).
data_dict_missing = [
    name for name in metadata_cols
    if name not in datadict_cols
]
metadata_missing = [
    name for name in datadict_cols
    if name not in metadata_cols
]
print(
    f"Cols not in datadict: {data_dict_missing}",
    f"Cols not in metadata: {metadata_missing}",
    sep="\n",
)

Cols not in datadict: ['ID', 'neighborhood_district', 'point', 'zipcode']
Cols not in metadata: ['Zipcode', 'Location', 'Neighborhood District']


In [13]:
# Look up the datatype reported at the same position as the 'point' column.
point_index = metadata_cols.index("point")
point_datatype = metadata_datatypes[point_index]
print(f"Data Type for 'point' column: {point_datatype}")

Data Type for 'point' column: Point


## Data Types source 3: actual data

In [14]:
import json

In [16]:
# previously downloaded json
# Pin the encoding so parsing does not depend on the platform's default
# locale encoding (JSON interchange files are UTF-8).
with open('../Data/data.json', encoding='utf-8') as json_file:
    api_data = json.load(json_file)

In [17]:
# check 1: every record in the json should only use keys that
# appear in metadata_cols (exact string match).
# cols_diff collects (record index, unmatched keys) for each record that
# has at least one unmatched key.
cols_diff = []
for idx, record in enumerate(api_data):
    unmatched = [key for key in set(record.keys()) if key not in metadata_cols]
    if not unmatched:
        continue
    cols_diff.append((idx, unmatched))

In [18]:
# First five field names as written in the data dictionary.
datadict_cols[:5]

['Incident Number',
 'Exposure Number',
 'Address',
 'Incident Date',
 'Call Number']

In [19]:
# First five unmatched keys of the first record flagged by check 1.
cols_diff[0][1][:5]

['incident_number', 'id', 'address', 'action_taken_primary', 'other_units']

## JSON Metadata (manual download)

In [20]:
# this json was manually downloaded from the site
# Pin the encoding so parsing does not depend on the platform's default
# locale encoding (JSON interchange files are UTF-8).
with open("../Data/sf-fires-download.json", encoding="utf-8") as json_file:
    download = json.load(json_file)

In [21]:
# Top-level container type of the parsed download.
type(download)

dict

In [22]:
# Top-level keys of the parsed download.
download.keys()

dict_keys(['meta', 'data'])

### Meta exploration

In [23]:
# Container type of the "meta" section.
type(download["meta"])

dict

In [24]:
# Keys available directly under "meta".
download["meta"].keys()

dict_keys(['view'])

In [25]:
# Keys available under "meta" -> "view".
download["meta"]["view"].keys()

dict_keys(['id', 'name', 'assetType', 'averageRating', 'category', 'createdAt', 'description', 'displayType', 'downloadCount', 'hideFromCatalog', 'hideFromDataJson', 'licenseId', 'newBackend', 'numberOfComments', 'oid', 'provenance', 'publicationAppendEnabled', 'publicationDate', 'publicationGroup', 'publicationStage', 'rowIdentifierColumnId', 'rowsUpdatedAt', 'rowsUpdatedBy', 'tableId', 'totalTimesRated', 'viewCount', 'viewLastModified', 'viewType', 'approvals', 'columns', 'disabledFeatureFlags', 'grants', 'license', 'metadata', 'owner', 'query', 'rights', 'tableAuthor', 'flags'])

In [26]:
# Container type of the "columns" entry of the view.
type(download["meta"]["view"]["columns"])

list

In [27]:
# Number of column descriptors listed in the view.
len(download["meta"]["view"]["columns"])

72

In [28]:
# Keep a short name for the list of column descriptors, then show the last five.
download_meta_cols = download["meta"]["view"]["columns"]
download_meta_cols[-5:]

[{'id': 496028758,
  'name': 'Automatic Extinguishing Sytem Failure Reason',
  'dataTypeName': 'text',
  'description': '',
  'fieldName': 'automatic_extinguishing_sytem_failure_reason',
  'position': 60,
  'renderTypeName': 'text',
  'tableColumnId': 33383966,
  'width': 628,
  'cachedContents': {'non_null': '9624',
   'largest': 'U -Undetermined',
   'null': '564266',
   'top': [{'item': '-', 'count': '9563'},
    {'item': 'Fire not in area protected by the system', 'count': '24'},
    {'item': '5 -Fire not in area protected by the syste', 'count': '9'},
    {'item': 'System shut off', 'count': '6'},
    {'item': 'U -Undetermined', 'count': '5'},
    {'item': 'Undetermined', 'count': '4'},
    {'item': '0 -Reason system not effective, other', 'count': '3'},
    {'item': '1 -System shut off', 'count': '2'},
    {'item': 'Not enough agent discharged to control the fire', 'count': '2'},
    {'item': 'Reason system not effective, other', 'count': '2'},
    {'item': '3 -Agent discharged, 

### "Data" key exploration

In [29]:
# First element of the "data" section.
download["data"][0]

['row-6jzi-i6d4~vfd8',
 '00000000-0000-0000-09C7-D3EBD32C3454',
 0,
 1638102821,
 None,
 1638102821,
 None,
 '{ }',
 '8028304',
 '0',
 '080283040',
 '150 Elsie St.',
 '2008-04-01T00:00:00',
 '080920257',
 '2008-04-01T18:06:37',
 '2008-04-01T18:15:19',
 '2008-04-01T18:21:48',
 'SF',
 '94110',
 'B06',
 '11',
 None,
 '1',
 '4',
 '0',
 '0',
 '0',
 '0',
 'E11',
 None,
 None,
 '0',
 '0',
 '0',
 '0',
 '1',
 '412 - Gas leak (natural gas or LPG)',
 'None',
 '86 - Investigate',
 '-',
 '-',
 '-',
 '962 - Residential street, road or residential dr',
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 '9',
 'Bernal Heights',
 'POINT (-122.41837339 37.74208979)']

### Download vs API data

In [30]:
# Every key that appears in at least one API record.
api_field_names = {key for record in api_data for key in record.keys()}

In [31]:
# Pull the display name, field name and datatype name out of every
# column descriptor of the downloaded file.
download_field_names = [col["fieldName"] for col in download_meta_cols]
download_names = [col["name"] for col in download_meta_cols]
download_datatypes = {col["dataTypeName"] for col in download_meta_cols}

In [32]:
# check against api data: field names present on one side only
download_missing = [
    field for field in api_field_names
    if field not in download_field_names
]
api_missing = [
    field for field in download_field_names
    if field not in api_field_names
]

In [33]:
# Report the field names found on only one side of the comparison.
print(f"Columns not in downloaded file: {download_missing}")
print(f"Columns not in API call: {api_missing}")

Columns not in downloaded file: []
Columns not in API call: [':sid', ':id', ':position', ':created_at', ':created_meta', ':updated_at', ':updated_meta', ':meta']


In [34]:
# Distinct dataTypeName values collected from the column descriptors.
download_datatypes

{'meta_data', 'number', 'point', 'text'}

### Datatypes: json download vs metadata

In [35]:
# Display names of the columns the download metadata types as "number".
download_numeric_cols = [
    col["name"]
    for col in download_meta_cols
    if col["dataTypeName"] == "number"
]

# Field names the data dictionary declares as "Numeric".
is_numeric_entry = data_dict["Data Type"] == "Numeric"
data_dict_numeric = data_dict.loc[is_numeric_entry, "Field Name"].unique()

In [36]:
# Data-dictionary numeric fields that the download metadata does not type
# as "number" (one direction only).
numeric_diff = [
    name for name in data_dict_numeric
    if name not in download_numeric_cols
]
numeric_diff

[]

In [37]:
# Columns typed as "number" in the download metadata.
download_numeric_cols

['Incident Number',
 'Exposure Number',
 'Suppression Units',
 'Suppression Personnel',
 'EMS Units',
 'EMS Personnel',
 'Other Units',
 'Other Personnel',
 'Estimated Property Loss',
 'Estimated Contents Loss',
 'Fire Fatalities',
 'Fire Injuries',
 'Civilian Fatalities',
 'Civilian Injuries',
 'Number of Alarms',
 'Floor of Fire Origin',
 'Number of floors with minimum damage',
 'Number of floors with significant damage',
 'Number of floors with heavy damage',
 'Number of floors with extreme damage',
 'Number of Sprinkler Heads Operating']

### pandas data conversion

In [38]:
# Build a DataFrame from the rows stored under the "data" key.
df = pd.DataFrame.from_records(download["data"])

In [39]:
# Label the columns with the field names taken from the column descriptors.
# assumes the descriptors are listed in the same order as the row values — TODO confirm
df.columns = download_field_names
df.head(3)

Unnamed: 0,:sid,:id,:position,:created_at,:created_meta,:updated_at,:updated_meta,:meta,incident_number,exposure_number,...,detector_effectiveness,detector_failure_reason,automatic_extinguishing_system_present,automatic_extinguishing_sytem_type,automatic_extinguishing_sytem_perfomance,automatic_extinguishing_sytem_failure_reason,number_of_sprinkler_heads_operating,supervisor_district,neighborhood_district,point
0,row-6jzi-i6d4~vfd8,00000000-0000-0000-09C7-D3EBD32C3454,0,1638102821,,1638102821,,{ },8028304,0,...,,,,,,,,9.0,Bernal Heights,POINT (-122.41837339 37.74208979)
1,row-ufeh~vmy3-3wxn,00000000-0000-0000-6556-4158A5282F17,0,1638102821,,1638102821,,{ },8028303,0,...,,,,,,,,10.0,Potrero Hill,POINT (-122.39489 37.756291)
2,row-g8xs.26dr_zxem,00000000-0000-0000-841C-DB78FD72E2E6,0,1638102821,,1638102821,,{ },8028309,0,...,,,,,,,,,South of Market,POINT (-122.407468 37.78008)


In [40]:
import numpy as np

In [41]:
# Convert every column that the metadata types as "number" to a numeric dtype.
for col in download_meta_cols:
    col_field_name = col["fieldName"]
    if col["dataTypeName"] != "number":
        continue
    df[col_field_name] = pd.to_numeric(df[col_field_name])

In [42]:
# Field names (not display names) of the columns typed as "number".
numeric_cols_field = [
    col["fieldName"]
    for col in download_meta_cols
    if col["dataTypeName"] == "number"
]

In [43]:
# Max and min of each numeric column, transposed to one row per column.
df[numeric_cols_field].describe().loc[["max", "min"]].T

Unnamed: 0,max,min
incident_number,21145805.0,3000001.0
exposure_number,5.0,0.0
suppression_units,3333.0,0.0
suppression_personnel,5960.0,0.0
ems_units,150.0,0.0
ems_personnel,312.0,0.0
other_units,1000.0,0.0
other_personnel,1000.0,0.0
estimated_property_loss,100000000.0,-25000.0
estimated_contents_loss,100000000.0,-5000.0


In [44]:
# Non-null counts and dtypes of the numeric columns after conversion.
df[numeric_cols_field].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 573890 entries, 0 to 573889
Data columns (total 21 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   incident_number                           573890 non-null  int64  
 1   exposure_number                           573890 non-null  int64  
 2   suppression_units                         573890 non-null  int64  
 3   suppression_personnel                     573890 non-null  int64  
 4   ems_units                                 573890 non-null  int64  
 5   ems_personnel                             573890 non-null  int64  
 6   other_units                               573890 non-null  int64  
 7   other_personnel                           573890 non-null  int64  
 8   estimated_property_loss                   130955 non-null  float64
 9   estimated_contents_loss                   137942 non-null  float64
 10  fire_fatalities     

In [45]:
# Field names of the numeric columns.
numeric_cols_field

['incident_number',
 'exposure_number',
 'suppression_units',
 'suppression_personnel',
 'ems_units',
 'ems_personnel',
 'other_units',
 'other_personnel',
 'estimated_property_loss',
 'estimated_contents_loss',
 'fire_fatalities',
 'fire_injuries',
 'civilian_fatalities',
 'civilian_injuries',
 'number_of_alarms',
 'floor_of_fire_origin',
 'number_of_floors_with_minimum_damage',
 'number_of_floors_with_significant_damage',
 'number_of_floors_with_heavy_damage',
 'number_of_floors_with_extreme_damage',
 'number_of_sprinkler_heads_operating']

In [46]:
# Confirm that a dtype can be compared against its string name.
df["incident_number"].dtype == "int64"

True

### MySQL data types

In [47]:
mysql_types = {}
# Upper bounds of MySQL's signed SMALLINT and INT types. Picking the type
# from these limits (instead of an arbitrary 1,000,000 cut-off) keeps values
# between 32,768 and 1,000,000 from overflowing a SMALLINT column.
MYSQL_SMALLINT_MAX = 32767
MYSQL_INT_MAX = 2147483647

# data types for numeric columns
for col in numeric_cols_field:
    # Largest magnitude in the column; using the absolute value also covers
    # negative entries.
    abs_max = np.max(np.abs(df[col]))
    data_type = df[col].dtype
    if data_type == "float64":
        # NOTE(review): MySQL FLOAT is single precision; confirm it is precise
        # enough for the float columns, or switch to DOUBLE/DECIMAL.
        mysql_types[col] = "FLOAT"
    elif abs_max > MYSQL_INT_MAX:
        mysql_types[col] = "BIGINT"
    elif abs_max > MYSQL_SMALLINT_MAX:
        mysql_types[col] = "INT"
    else:
        mysql_types[col] = "SMALLINT"

In [48]:
# data types for timestamp columns: the data dictionary decides which
# fields are date/time, matched on the display name.
time_cols = data_dict.loc[data_dict["Data Type"] == "Date & Time", "Field Name"].unique()
mysql_types.update(
    (col["fieldName"], "TIMESTAMP")
    for col in download_meta_cols
    if col["name"] in time_cols
)

In [49]:
# Remaining text columns (those not listed as date/time) become VARCHAR(255).
mysql_types.update(
    (col["fieldName"], "VARCHAR(255)")
    for col in download_meta_cols
    if col["dataTypeName"] == "text" and col["name"] not in time_cols
)

In [50]:
# Field names that have not been given a MySQL type yet.
missing_keys = [
    field for field in download_field_names
    if field not in mysql_types
]
missing_keys

[':sid',
 ':id',
 ':position',
 ':created_at',
 ':created_meta',
 ':updated_at',
 ':updated_meta',
 ':meta',
 'point']

In [51]:
# Register types for separate latitude / longitude entries.
mysql_types.update(latitude="FLOAT", longitude="FLOAT")

In [52]:
# Distinct MySQL types assigned so far.
set(mysql_types.values())

{'FLOAT', 'INT', 'SMALLINT', 'TIMESTAMP', 'VARCHAR(255)'}

In [53]:
import pickle

# Write the field-name -> MySQL type mapping to disk as a pickle.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open('../Files/mysql_data_types.pickle', 'wb') as handle:
    pickle.dump(mysql_types, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Redshift data types

In [54]:
rs_types = {}
# data types for numeric columns: chosen from the column dtype and from
# whether its largest magnitude exceeds 10000.
for col in numeric_cols_field:
    abs_max = np.max(np.abs(df[col]))
    data_type = df[col].dtype
    is_float = data_type == "float64"
    is_large = abs_max > 10000
    if is_large and is_float:
        rs_types[col] = "DECIMAL(12,3)"
    elif is_large:
        rs_types[col] = "INTEGER"
    elif is_float:
        rs_types[col] = "DECIMAL(7,2)"
    else:
        rs_types[col] = "SMALLINT"

In [55]:
# data types for timestamp columns: the data dictionary decides which
# fields are date/time, matched on the display name.
time_cols = data_dict.loc[data_dict["Data Type"] == "Date & Time", "Field Name"].unique()
rs_types.update(
    (col["fieldName"], "TIMESTAMP")
    for col in download_meta_cols
    if col["name"] in time_cols
)

In [56]:
# Remaining text columns (those not listed as date/time) become VARCHAR.
rs_types.update(
    (col["fieldName"], "VARCHAR")
    for col in download_meta_cols
    if col["dataTypeName"] == "text" and col["name"] not in time_cols
)

In [57]:
# Field names that have not been given a Redshift type yet.
missing_keys = [
    field for field in download_field_names
    if field not in rs_types
]
missing_keys

[':sid',
 ':id',
 ':position',
 ':created_at',
 ':created_meta',
 ':updated_at',
 ':updated_meta',
 ':meta',
 'point']

In [64]:
# Register types for separate latitude / longitude entries.
rs_types.update(latitude="DECIMAL(9,6)", longitude="DECIMAL(9,6)")

In [65]:
# Distinct Redshift types assigned so far.
set(rs_types.values())

{'DECIMAL(12,3)',
 'DECIMAL(7,2)',
 'DECIMAL(9,6)',
 'INTEGER',
 'SMALLINT',
 'TIMESTAMP',
 'VARCHAR'}

In [66]:
import pickle

# Write the field-name -> Redshift type mapping to disk as a pickle.
# NOTE(review): pickle files should only ever be loaded from trusted sources.
with open('../Files/rs_data_types.pickle', 'wb') as handle:
    pickle.dump(rs_types, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Checking out POINT datatype

In [61]:
# Shape of the "point" value on the first API record.
api_data[0]["point"]

{'type': 'Point', 'coordinates': [-122.41837339, 37.74208979]}

In [62]:
# Every record that carries a "point" value must have type "Point".
# Records without a "point" (missing or empty) are skipped.
for i, item in enumerate(api_data):
    item_point = item.get("point")
    if item_point:
        point_type = item_point.get("type")
        # Assert the condition itself so that a failure names the offending
        # record and the type that was found.
        assert point_type == "Point", (
            f"record {i}: unexpected point type {point_type!r}"
        )

In [63]:
# First three values of the incident date column as currently stored.
df["incident_date"].head(3)

0    2008-04-01T00:00:00
1    2008-04-01T00:00:00
2    2008-04-01T00:00:00
Name: incident_date, dtype: object