# Parsing raw NWS text files in Python

Author: 

* Kate Weinberger (kate.weinberger@ubc.ca)
* Ivan Gu (zifan_gu@hms.harvard.edu)

Date last updated: August 12, 2022

What this script does:
- Imports the raw text records of NWS non-precipitation watches, advisories, and warnings, 2006-2018
- Extracts key information for each watch, advisory, and warning (date, location, type of weather phenomenon, etc.)
- Keeps only information for watches, advisories, and warnings related to heat
- Exports a csv file that can be further processed in R (script name: "Convert_parsed_warnings_to_time_series_2006_2018.R") to construct a daily record of whether a heat watch, advisory, or warning was in effect for each forecast zone that issued at least one heat alert between 2006 and 2018

Notes:
- This notebook is heavily inspired by Kate Weinberger, while Ivan Gu added advisory local time of issue mapping and GEOID mapping 
- File paths will need to be changed if you want to run this on your own computer
- This script can be altered to extract alerts for phenomena other than extreme heat
- The R script described above ("Convert_parsed_warnings_to_time_series_2006_2018.R") will create columns to keep track of how an alert was first issued versus how it was last issued. In other words, these columns can help you keep track of whether an alert was upgraded (i.e., watch to advisory) or downgraded (warning to advisory) in the days leading up to the heat event. In order for these columns to populate correctly, the raw text records have to be read in to this Python script in chronological order. NB: There is certainly a better way to preserve order but I have not had the time to figure it out.

# (1)  Do some set-up

### Identify current working directory

In [3]:
import os
os.getcwd()

'/n/home_fasse/zgu/heat_advisory'

In [4]:
import numpy as np
import pandas as pd
import re
import datetime
import altair as alt

from vega_datasets import data

### Import packages, create a function for identifying if a string is entirely alphabetical characters, create an object class for time zones 

In [5]:


def containsLetters(line):                
    for ch in line:
        if ch.isalpha():
            return True
    return False

class TZone(datetime.tzinfo):
    def __init__(self,offset,isdst,name):
        self.offset = offset
        self.isdst = isdst
        self.name = name
    def utcoffset(self, dt):
        return datetime.timedelta(hours=self.offset) + self.dst(dt)
    def dst(self, dt):
        return datetime.timedelta(hours=1) if self.isdst else datetime.timedelta(0)
    def tzname(self,dt):
         return self.name

GMT = TZone(0,False,'GMT')
EST = TZone(-5,False,'EST')
CST = TZone(-6,False,'CST')
MST = TZone(-7,False,'MST')
PST = TZone(-8,False,'PST')

EDT = TZone(-5,True,'EDT')
CDT = TZone(-6,True,'CDT')
MDT = TZone(-7,True,'MDT')
PDT = TZone(-8,True,'PDT')

t_zone_dict = {"GMT": TZone(0,False,'GMT'),
"EST" : TZone(-5,False,'EST'),
"CST" : TZone(-6,False,'CST'),
"MST" : TZone(-7,False,'MST'),
"PST" : TZone(-8,False,'PST'),
               
"EDT" : TZone(-5,True,'EDT'),
"CDT" : TZone(-6,True,'CDT'),
"MDT" : TZone(-7,True,'MDT'),
"PDT" : TZone(-8,True,'PDT')}


### Create object classes for different pieces of a text record: message, segment, zone, and vtec header:

In [6]:
class Message:
    def __init__(self, raw_text):
        self.raw_text = raw_text
        self.add_id()
        self.add_office()
        self.add_issue_local_datetime()
        self.add_local_timezone()
        self.add_raw_segments()
        self.add_segments()
        #self.get_tables()
    def add_id(self):
        issue_year = re.findall('[0-9]+\s[AP]M\s[A-Z][A-Z][A-Z]\s[A-Z][A-Za-z][A-Za-z]\s[A-Z][A-Za-z][A-Za-z]\s[0-9]+\s([0-9][0-9][0-9][0-9])', self.raw_text)[0]
        issue_day = re.findall('[0-9]+\s[AP]M\s[A-Z][A-Z][A-Z]\s[A-Z][A-Za-z][A-Za-z]\s[A-Z][A-Za-z][A-Za-z]\s([0-9]+)\s[0-9][0-9][0-9][0-9]', self.raw_text)[0]
        issue_day = issue_day.zfill(2)
        issue_month = re.findall('[0-9]+\s[AP]M\s[A-Z][A-Z][A-Z]\s[A-Z][A-Za-z][A-Za-z]\s([A-Z][A-Za-z][A-Za-z])\s[0-9]+\s[0-9][0-9][0-9][0-9]', self.raw_text)[0]
        issue_month = datetime.datetime.strptime(issue_month, '%b')
        issue_month = str(issue_month.month)
        issue_month = issue_month.zfill(2)
        self.id = issue_year + issue_month + issue_day + '_' + self.raw_text.split("\n")[0][-11:-7] + '_' + self.raw_text.split("\n")[0][-6:]
    def add_office(self):
        self.office = self.raw_text.split("\n")[0][-11:-7]
    def add_issue_local_datetime(self):
        local_time = re.findall('([0-9]+\s[AP]M)\s[A-Z][A-Z][A-Z]\s[A-Z][A-Za-z][A-Za-z]\s[A-Z][A-Za-z][A-Za-z]\s[0-9]+\s[0-9][0-9][0-9][0-9]', self.raw_text)[0]
        if len(local_time) == 6:
            local_time = local_time.zfill(7)
        local_date = re.findall('[0-9]+\s[AP]M\s[A-Z][A-Z][A-Z]\s[A-Z][A-Za-z][A-Za-z]\s([A-Z][A-Za-z][A-Za-z]\s[0-9]+\s[0-9][0-9][0-9][0-9])', self.raw_text)[0]
        both = local_time + ' ' + local_date
        self.issue_local_datetime = datetime.datetime.strptime(both,'%I%M %p %b %d %Y')        
    def add_local_timezone(self):
        self.local_timezone = re.findall('\n[0-9]+\s[AP]M\s([A-Z][A-Z][A-Z])\s[A-Z][A-Za-z][A-Za-z]\s', self.raw_text)[0]
    def add_raw_segments(self):
        self.raw_segments = re.findall('\n([A-Z][A-Z]Z.*?)[&$][&$]', self.raw_text, re.DOTALL)
    def add_segments(self):
        self.segments = []
        for i,s in enumerate(self.raw_segments):
            self.segments.append(Segment(raw_segment=s, id=i, message=self))
    def get_tables(self):
        # first get a row just for this message
        print('Messages Table Entry')
        print('\nMessage ID:\t{id}\nForecast Office:\t{off}\nIssue Date/Time:\t{idt}\nTimezone:\t{tz}\nMessage Text:\t{txt}'.format(id=self.id, off=self.office, idt=self.issue_local_datetime, tz=self.local_timezone,txt=self.raw_text[0:9]))
 

In [7]:
class Segment:
    def __init__(self, raw_segment, id, message):
        self.id = id
        self.raw_segment = raw_segment
        self.parent_message = message
        self.add_raw_zones()
        self.add_zones()
        self.add_raw_vtecs()
        self.add_vtecs()
    def add_raw_zones(self):
        seg_strip = self.raw_segment.replace("\n", "")
        zone_string = re.findall('([A-Z][A-Z]Z[0-9].+)-[0-9][0-9][0-9][0-9][0-9][0-9]-', seg_strip)[0]
        #zone_string = re.findall('([A-Z][A-Z]Z[0-9].+)[0-9][0-9][0-9][0-9][0-9][0-9]-', self.raw_segment, re.DOTALL)[0]
        zone_noline = zone_string.replace("\n","") # strip out existing line breaks
        zone_addbreak = re.sub(r'(-)([A-Z][A-Z][A-Z])', r'\n\2', zone_noline) # add new breaks before state names
        split_zones = zone_addbreak.split("\n") # split based on a new breaks 
        listofzones=[]
        for i in split_zones:
            state = i[0:3] # get the name of the state
            nodash = i.split("-") # split on hyphen
            for j in nodash: 
                if len(j) == 0: # deal with final hyphen in string
                    continue
                elif containsLetters(j) == True: # zones starting with state name
                    num=j[3:]
                    if len(num.split(">")) == 1:
                        listofzones.append(state+num)
                    elif len(num.split(">")) == 2:
                        spl = num.split(">")
                        spl_list = [k for k in range(int(spl[0]), int(spl[1])+1)]
                        for m in spl_list:
                            st = str(m)
                            stz = st.zfill(3)
                            listofzones.append(state+stz)
                elif containsLetters(j) == False: # zones not starting with state name
                    num = j
                    if len(num.split(">")) == 1:
                        listofzones.append(state+num)
                    elif len(num.split(">")) == 2:
                        spl = num.split(">")
                        spl_list = [k for k in range(int(spl[0]), int(spl[1])+1)]
                        for m in spl_list:
                            st = str(m)
                            stz = st.zfill(3)
                            listofzones.append(state+stz)
        self.raw_zones = listofzones        
    def add_zones(self):
        self.zones = []
        for z in self.raw_zones:
            self.zones.append(Zone(raw_zone=z, segment=self))
    def add_raw_vtecs(self):    
        lines = self.raw_segment.split("\n")
        listofvtec = []
        for line in lines:
            if line.startswith("/O."):
                listofvtec.append(line)
        self.raw_vtecs = listofvtec
    def add_vtecs(self):
        self.vtecs = []
        for v in self.raw_vtecs:
            self.vtecs.append(Vtec(raw_vtec=v, segment=self))


In [8]:
class Zone:
    def __init__(self, raw_zone, segment):
        self.raw_zone = raw_zone
        self.parent_seg = segment
        self.add_zonestate()
        self.add_zonenum()
    def add_zonestate(self):
        self.zonestate = self.raw_zone[0:2]
    def add_zonenum(self):
        self.zonenum = self.raw_zone[3:6]
    def get_tables(self):
        # first get a row just for this message
        print('Zones Table Entry')
        print('\nMessage ID:\t{id}\nSegment ID:\t{sid}\nZone:\t{rz}\nState:\t{zst}\nNumber:\t{znm}'.format(id=self.parent_seg.parent_message.id, sid=self.parent_seg.id, rz=self.raw_zone, zst=self.zonestate, znm=self.zonenum))
        

In [9]:
class Vtec:
    def __init__(self, raw_vtec, segment):
        self.raw_vtec = raw_vtec
        self.parent_seg = segment
        self.add_product()
        self.add_action()
        self.add_phenom()
        self.add_sig()
        self.add_eventid()
        self.add_start_UTC()
        self.add_end_UTC()
        self.add_start_LTZ()
        self.add_end_LTZ()
        self.add_start_date()
        self.add_end_date()
        self.add_dates_covered()
    def add_product(self):
        self.product = self.raw_vtec[1]
    def add_action(self):
        self.action = self.raw_vtec[3:6]
    def add_phenom(self):
        self.phenom = self.raw_vtec[12:14]
    def add_sig(self):
        self.sig = self.raw_vtec[15]
    def add_eventid(self):
        self.eventid = self.raw_vtec[17:21]
    def add_start_UTC(self):
        format = "%y%m%dT%H%MZ"
        if self.raw_vtec[22:34] == "000000T0000Z":
            self.start_UTC = "NaN"
        else:
            self.start_UTC = datetime.datetime.strptime(self.raw_vtec[22:34],format).replace(tzinfo=GMT)
    def add_end_UTC(self):
        format = "%y%m%dT%H%MZ"
        if self.raw_vtec[35:47] == "000000T0000Z":
            self.end_UTC = "NaN"
        else:
            self.end_UTC = datetime.datetime.strptime(self.raw_vtec[35:47],format).replace(tzinfo=GMT)
    def add_start_LTZ(self):
        format = "%y%m%dT%H%MZ"
        if self.raw_vtec[22:34] == "000000T0000Z":
            self.start_LTZ = self.parent_seg.parent_message.issue_local_datetime
        else:
            start_U = datetime.datetime.strptime(self.raw_vtec[22:34],format)
            start_U = start_U.replace(tzinfo=GMT)
            if self.parent_seg.parent_message.local_timezone == "EST":
                self.start_LTZ = start_U.astimezone(EST)
            elif self.parent_seg.parent_message.local_timezone == "CST":
                self.start_LTZ = start_U.astimezone(CST) 
            elif self.parent_seg.parent_message.local_timezone == "MST":
                self.start_LTZ = start_U.astimezone(MST)
            elif self.parent_seg.parent_message.local_timezone == "PST":
                self.start_LTZ = start_U.astimezone(PST)
            elif self.parent_seg.parent_message.local_timezone == "EDT":
                self.start_LTZ = start_U.astimezone(EDT)
            elif self.parent_seg.parent_message.local_timezone == "CDT":
                self.start_LTZ = start_U.astimezone(CDT) 
            elif self.parent_seg.parent_message.local_timezone == "MDT":
                self.start_LTZ = start_U.astimezone(MDT)
            elif self.parent_seg.parent_message.local_timezone == "PDT":
                self.start_LTZ = start_U.astimezone(PDT)
    def add_end_LTZ(self):
        format = "%y%m%dT%H%MZ"
        if self.raw_vtec[35:47] == "000000T0000Z":
            self.end_LTZ = "NaN"
            self.end_date = "NaN"
        else:
            end_U = datetime.datetime.strptime(self.raw_vtec[35:47],format)
            end_U = end_U.replace(tzinfo=GMT)
            if self.parent_seg.parent_message.local_timezone == "EST":
                self.end_LTZ = end_U.astimezone(EST)
            elif self.parent_seg.parent_message.local_timezone == "CST":
                self.end_LTZ = end_U.astimezone(CST) 
            elif self.parent_seg.parent_message.local_timezone == "MST":
                self.end_LTZ = end_U.astimezone(MST)
            elif self.parent_seg.parent_message.local_timezone == "PST":
                self.end_LTZ = end_U.astimezone(PST)
            elif self.parent_seg.parent_message.local_timezone == "EDT":
                self.end_LTZ = end_U.astimezone(EDT)
            elif self.parent_seg.parent_message.local_timezone == "CDT":
                self.end_LTZ = end_U.astimezone(CDT) 
            elif self.parent_seg.parent_message.local_timezone == "MDT":
                self.end_LTZ = end_U.astimezone(MDT)
            elif self.parent_seg.parent_message.local_timezone == "PDT":
                self.end_LTZ = end_U.astimezone(PDT)
    def add_start_date(self):
        format = "%y%m%dT%H%MZ"
        if self.raw_vtec[22:34] == "000000T0000Z":
            self.start_date = self.parent_seg.parent_message.issue_local_datetime.date()
        else:
            start_U = datetime.datetime.strptime(self.raw_vtec[22:34],format)
            start_U = start_U.replace(tzinfo=GMT)
            if self.parent_seg.parent_message.local_timezone == "EST":
                self.start_date = start_U.astimezone(EST).date()
            elif self.parent_seg.parent_message.local_timezone == "CST":
                self.start_date = start_U.astimezone(CST).date()
            elif self.parent_seg.parent_message.local_timezone == "MST":
                self.start_date = start_U.astimezone(MST).date()
            elif self.parent_seg.parent_message.local_timezone == "PST":
                self.start_date = start_U.astimezone(PST).date()
            elif self.parent_seg.parent_message.local_timezone == "EDT":
                self.start_date = start_U.astimezone(EDT).date()
            elif self.parent_seg.parent_message.local_timezone == "CDT":
                self.start_date = start_U.astimezone(CDT).date()
            elif self.parent_seg.parent_message.local_timezone == "MDT":
                self.start_date = start_U.astimezone(MDT).date()
            elif self.parent_seg.parent_message.local_timezone == "PDT":
                self.start_date = start_U.astimezone(PDT).date()
    def add_end_date(self):
        format = "%y%m%dT%H%MZ"
        if self.raw_vtec[35:47] == "000000T0000Z":
            self.end_date = "NaN"
        else:
            end_U = datetime.datetime.strptime(self.raw_vtec[35:47],format)
            end_U = end_U.replace(tzinfo=GMT)
            if self.parent_seg.parent_message.local_timezone == "EST":
                self.end_date = end_U.astimezone(EST).date()
            elif self.parent_seg.parent_message.local_timezone == "CST":
                self.end_date = end_U.astimezone(CST).date()
            elif self.parent_seg.parent_message.local_timezone == "MST":
                self.end_date = end_U.astimezone(MST).date()
            elif self.parent_seg.parent_message.local_timezone == "PST":
                self.end_date = end_U.astimezone(PST).date()
            elif self.parent_seg.parent_message.local_timezone == "EDT":
                self.end_date = end_U.astimezone(EDT).date()
            elif self.parent_seg.parent_message.local_timezone == "CDT":
                self.end_date = end_U.astimezone(CDT).date()
            elif self.parent_seg.parent_message.local_timezone == "MDT":
                self.end_date = end_U.astimezone(MDT).date()
            elif self.parent_seg.parent_message.local_timezone == "PDT":
                self.end_date = end_U.astimezone(PDT).date()
    def add_dates_covered(self):
        d1 = self.start_date
        d2 = self.end_date
        delta = d2 - d1
        all_dates = []
        for i in range(delta.days + 1):
            all_dates.append(d1 + datetime.timedelta(days=i))
        self.dates_covered = all_dates    
    def get_tables(self):
        # first get a row just for this message
        print('Vtecs Table Entry')
        print('\nMessage ID:\t{id}\nSegment ID:\t{sid}\nRaw Vtec:\t{rvt}\nProduct:\t{vpr}\nAction:\t{vac}\nPhenomenon:\t{vph}\nSignificance:\t{vsg}\nEvent ID:\t{vid}\nStart in UTC:\t{vst}\nEnd in UTC\t{ven}'.format(id=self.parent_seg.parent_message.id, sid=self.parent_seg.id, rvt=self.raw_vtec, vpr=self.product, vac=self.action, vph=self.phenom, vsg=self.sig, vid=self.eventid, vst=self.start_UTC, ven=self.end_UTC))
 

# (2) Import and parse each text file containing NWS alert records

### Import each text file, loop through records, remove duplicate records, make a table where each message is a row

In [10]:
all_msgs = []
raw_advisory_path = 'All text records 2006 to 2018 cleaned ordered'

for i in sorted(os.listdir(raw_advisory_path)):
    if i.endswith('.txt'):
        fhand = open(os.path.join(raw_advisory_path, i))
        data = fhand.read()
        chunks = data.split('\n\n')
        for chunk in chunks:
            all_msgs.append(chunk)
                
# Remove duplicates
seen = set()
all_msgs_no_duplicates = []
for msg in all_msgs:
    if msg not in seen:
        seen.add(msg)
        all_msgs_no_duplicates.append(msg)
        
# Make a table where each message = one row
message_table = []

for msg in all_msgs_no_duplicates:
    m = Message(msg)
    m_entry = {'m_id': m.id, 'office': m.office, 'issue_datetime': m.issue_local_datetime, 'issue_timezone': m.local_timezone, 'raw_text': m.raw_text}
    message_table.append(m_entry)

# Convert to pandas dataframe
message_table_df = pd.DataFrame(message_table)
message_table_df = message_table_df[['m_id', 'office', 'issue_datetime', 'issue_timezone', 'raw_text']]

# First five records
message_table_df.head()



Unnamed: 0,m_id,office,issue_datetime,issue_timezone,raw_text
0,20060331_KFSD_010004,KFSD,2006-03-31 18:04:00,CST,WWUS73 KFSD 010004\nNPWFSD\nURGENT - WEATHER M...
1,20060331_KLOT_010008,KLOT,2006-03-31 18:08:00,CST,WWUS73 KLOT 010008\nNPWLOT\nURGENT - WEATHER M...
2,20060331_KIWX_010054,KIWX,2006-03-31 19:54:00,EST,WWUS73 KIWX 010054\nNPWIWX\nURGENT - WEATHER M...
3,20060331_KILX_010110,KILX,2006-03-31 19:10:00,CST,WWUS73 KILX 010110\nNPWILX\nURGENT - WEATHER M...
4,20060401_KGSP_010900,KGSP,2006-04-01 04:00:00,EST,WWUS72 KGSP 010900\nNPWGSP\nURGENT - WEATHER M...


### How many messages are there?

In [11]:
len(message_table_df)

122762

### Remove duplicate m_id

In [12]:
dup_index = message_table_df.duplicated(subset=['m_id'])
unique_message = message_table_df[-dup_index]
unique_message

Unnamed: 0,m_id,office,issue_datetime,issue_timezone,raw_text
0,20060331_KFSD_010004,KFSD,2006-03-31 18:04:00,CST,WWUS73 KFSD 010004\nNPWFSD\nURGENT - WEATHER M...
1,20060331_KLOT_010008,KLOT,2006-03-31 18:08:00,CST,WWUS73 KLOT 010008\nNPWLOT\nURGENT - WEATHER M...
2,20060331_KIWX_010054,KIWX,2006-03-31 19:54:00,EST,WWUS73 KIWX 010054\nNPWIWX\nURGENT - WEATHER M...
3,20060331_KILX_010110,KILX,2006-03-31 19:10:00,CST,WWUS73 KILX 010110\nNPWILX\nURGENT - WEATHER M...
4,20060401_KGSP_010900,KGSP,2006-04-01 04:00:00,EST,WWUS72 KGSP 010900\nNPWGSP\nURGENT - WEATHER M...
...,...,...,...,...,...
122757,20181031_KGGW_312018,KGGW,2018-10-31 14:18:00,MDT,WWUS75 KGGW 312018\nNPWGGW\nURGENT - WEATHER M...
122758,20181031_KLOX_312025,KLOX,2018-10-31 13:25:00,PDT,WWUS76 KLOX 312025\nNPWLOX\nURGENT - WEATHER M...
122759,20181031_KUNR_312031,KUNR,2018-10-31 14:31:00,MDT,WWUS73 KUNR 312031\nNPWUNR\nURGENT - WEATHER M...
122760,20181031_KTWC_312052,KTWC,2018-10-31 13:52:00,MST,WWUS75 KTWC 312052\nNPWTWC\nURGENT - WEATHER M...


In [13]:
# make sure they are unique
assert len(set(unique_message['m_id'])) == len(unique_message['m_id'])

In [14]:
# set with copy warning comes from series.tz_localize()
# discard because the value is truly set with .apply(). Compare unique_message with above cell to check
unique_message.loc[:, 'issue_datetime_LTZ'] = unique_message.apply(lambda x: x['issue_datetime'].tz_localize(t_zone_dict[x['issue_timezone']]), axis=1)
unique_message

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_message.loc[:, 'issue_datetime_LTZ'] = unique_message.apply(lambda x: x['issue_datetime'].tz_localize(t_zone_dict[x['issue_timezone']]), axis=1)


Unnamed: 0,m_id,office,issue_datetime,issue_timezone,raw_text,issue_datetime_LTZ
0,20060331_KFSD_010004,KFSD,2006-03-31 18:04:00,CST,WWUS73 KFSD 010004\nNPWFSD\nURGENT - WEATHER M...,2006-03-31 18:04:00-06:00
1,20060331_KLOT_010008,KLOT,2006-03-31 18:08:00,CST,WWUS73 KLOT 010008\nNPWLOT\nURGENT - WEATHER M...,2006-03-31 18:08:00-06:00
2,20060331_KIWX_010054,KIWX,2006-03-31 19:54:00,EST,WWUS73 KIWX 010054\nNPWIWX\nURGENT - WEATHER M...,2006-03-31 19:54:00-05:00
3,20060331_KILX_010110,KILX,2006-03-31 19:10:00,CST,WWUS73 KILX 010110\nNPWILX\nURGENT - WEATHER M...,2006-03-31 19:10:00-06:00
4,20060401_KGSP_010900,KGSP,2006-04-01 04:00:00,EST,WWUS72 KGSP 010900\nNPWGSP\nURGENT - WEATHER M...,2006-04-01 04:00:00-05:00
...,...,...,...,...,...,...
122757,20181031_KGGW_312018,KGGW,2018-10-31 14:18:00,MDT,WWUS75 KGGW 312018\nNPWGGW\nURGENT - WEATHER M...,2018-10-31 14:18:00-06:00
122758,20181031_KLOX_312025,KLOX,2018-10-31 13:25:00,PDT,WWUS76 KLOX 312025\nNPWLOX\nURGENT - WEATHER M...,2018-10-31 13:25:00-07:00
122759,20181031_KUNR_312031,KUNR,2018-10-31 14:31:00,MDT,WWUS73 KUNR 312031\nNPWUNR\nURGENT - WEATHER M...,2018-10-31 14:31:00-06:00
122760,20181031_KTWC_312052,KTWC,2018-10-31 13:52:00,MST,WWUS75 KTWC 312052\nNPWTWC\nURGENT - WEATHER M...,2018-10-31 13:52:00-07:00


### Make a table where each unique zone for each segment of a message is a row

In [15]:
# Make a table where each zone = one row
zone_table = []

for msg in all_msgs_no_duplicates:
    m = Message(msg)
    for i in range(len(m.raw_segments)):
        for j in range(len(m.segments[i].raw_zones)):
            z_entry = {'m_id': m.segments[i].zones[j].parent_seg.parent_message.id, 's_id': m.segments[i].zones[j].parent_seg.id, 'zone': m.segments[i].zones[j].raw_zone, 'zone_state': m.segments[i].zones[j].zonestate, 'zone_num': m.segments[i].zones[j].zonenum}
            zone_table.append(z_entry)           
    
zone_table_df = pd.DataFrame(zone_table)
zone_table_df=zone_table_df[['m_id', 's_id', 'zone', 'zone_num', 'zone_state']]
zone_table_df.head()

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state
0,20060331_KFSD_010004,0,IAZ002,2,IA
1,20060331_KFSD_010004,0,IAZ003,3,IA
2,20060331_KFSD_010004,0,IAZ013,13,IA
3,20060331_KFSD_010004,0,IAZ014,14,IA
4,20060331_KFSD_010004,0,IAZ022,22,IA


In [16]:
zone_table_df.head(10)

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state
0,20060331_KFSD_010004,0,IAZ002,2,IA
1,20060331_KFSD_010004,0,IAZ003,3,IA
2,20060331_KFSD_010004,0,IAZ013,13,IA
3,20060331_KFSD_010004,0,IAZ014,14,IA
4,20060331_KFSD_010004,0,IAZ022,22,IA
5,20060331_KFSD_010004,0,MNZ071,71,MN
6,20060331_KFSD_010004,0,MNZ072,72,MN
7,20060331_KFSD_010004,0,MNZ080,80,MN
8,20060331_KFSD_010004,0,MNZ081,81,MN
9,20060331_KFSD_010004,0,MNZ089,89,MN


In [17]:
len(zone_table_df)

2075478

### Make a table where each unique VTEC header in each segment of a message is a row

In [18]:
# Make a table where each vtec header = one row
vtec_table = []

for msg in all_msgs_no_duplicates:
    m = Message(msg)
    for i in range(len(m.raw_segments)):
        for j in range(len(m.segments[i].raw_vtecs)):            
            v_entry = {'m_id': m.segments[i].vtecs[j].parent_seg.parent_message.id, 's_id': m.segments[i].vtecs[j].parent_seg.id, 'raw_vtec': m.segments[i].vtecs[j].raw_vtec, 'product': m.segments[i].vtecs[j].product, 'action': m.segments[i].vtecs[j].action, 'phenom': m.segments[i].vtecs[j].phenom, 'sig': m.segments[i].vtecs[j].sig, 'event_id': m.segments[i].vtecs[j].eventid, 'start_UTC': m.segments[i].vtecs[j].start_UTC, 'end_UTC': m.segments[i].vtecs[j].end_UTC, 'start_LTZ': m.segments[i].vtecs[j].start_LTZ, 'start_date': m.segments[i].vtecs[j].start_date, 'end_LTZ': m.segments[i].vtecs[j].end_LTZ, 'end_date': m.segments[i].vtecs[j].end_date, 'dates_covered': m.segments[i].vtecs[j].dates_covered}
            vtec_table.append(v_entry)
vtec_table_df = pd.DataFrame(vtec_table)
vtec_table_df=vtec_table_df[['m_id', 's_id', 'product', 'action', 'phenom', 'sig', 'event_id', 'start_UTC', 'end_UTC', 'start_LTZ', 'end_LTZ', 'raw_vtec', 'start_date', 'end_date', 'dates_covered']]

vtec_table_df.head()

Unnamed: 0,m_id,s_id,product,action,phenom,sig,event_id,start_UTC,end_UTC,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered
0,20060331_KFSD_010004,0,O,EXP,WI,Y,5,,2006-04-01 00:00:00+00:00,2006-03-31 18:04:00,2006-03-31 18:00:00-06:00,/O.EXP.KFSD.WI.Y.0005.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
1,20060331_KLOT_010008,0,O,EXP,HW,W,1,,2006-04-01 00:00:00+00:00,2006-03-31 18:08:00,2006-03-31 18:00:00-06:00,/O.EXP.KLOT.HW.W.0001.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
2,20060331_KLOT_010008,1,O,EXP,WI,Y,3,,2006-04-01 00:00:00+00:00,2006-03-31 18:08:00,2006-03-31 18:00:00-06:00,/O.EXP.KLOT.WI.Y.0003.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
3,20060331_KIWX_010054,0,O,EXP,HW,W,2,,2006-04-01 01:00:00+00:00,2006-03-31 19:54:00,2006-03-31 20:00:00-05:00,/O.EXP.KIWX.HW.W.0002.000000T0000Z-060401T0100Z/,2006-03-31,2006-03-31,[2006-03-31]
4,20060331_KILX_010110,0,O,EXP,WI,Y,4,,2006-04-01 01:00:00+00:00,2006-03-31 19:10:00,2006-03-31 19:00:00-06:00,/O.EXP.KILX.WI.Y.0004.000000T0000Z-060401T0100Z/,2006-03-31,2006-03-31,[2006-03-31]


In [19]:
vtec_table_df.head(10)

Unnamed: 0,m_id,s_id,product,action,phenom,sig,event_id,start_UTC,end_UTC,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered
0,20060331_KFSD_010004,0,O,EXP,WI,Y,5,,2006-04-01 00:00:00+00:00,2006-03-31 18:04:00,2006-03-31 18:00:00-06:00,/O.EXP.KFSD.WI.Y.0005.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
1,20060331_KLOT_010008,0,O,EXP,HW,W,1,,2006-04-01 00:00:00+00:00,2006-03-31 18:08:00,2006-03-31 18:00:00-06:00,/O.EXP.KLOT.HW.W.0001.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
2,20060331_KLOT_010008,1,O,EXP,WI,Y,3,,2006-04-01 00:00:00+00:00,2006-03-31 18:08:00,2006-03-31 18:00:00-06:00,/O.EXP.KLOT.WI.Y.0003.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
3,20060331_KIWX_010054,0,O,EXP,HW,W,2,,2006-04-01 01:00:00+00:00,2006-03-31 19:54:00,2006-03-31 20:00:00-05:00,/O.EXP.KIWX.HW.W.0002.000000T0000Z-060401T0100Z/,2006-03-31,2006-03-31,[2006-03-31]
4,20060331_KILX_010110,0,O,EXP,WI,Y,4,,2006-04-01 01:00:00+00:00,2006-03-31 19:10:00,2006-03-31 19:00:00-06:00,/O.EXP.KILX.WI.Y.0004.000000T0000Z-060401T0100Z/,2006-03-31,2006-03-31,[2006-03-31]
5,20060401_KGSP_010900,0,O,NEW,LW,Y,27,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01]
6,20060401_KLZK_011002,0,O,NEW,FG,Y,5,2006-04-01 10:02:00+00:00,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01]
7,20060401_KMEG_011150,0,O,NEW,FG,Y,3,2006-04-01 11:50:00+00:00,2006-04-01 15:00:00+00:00,2006-04-01 05:50:00-06:00,2006-04-01 09:00:00-06:00,/O.NEW.KMEG.FG.Y.0003.060401T1150Z-060401T1500Z/,2006-04-01,2006-04-01,[2006-04-01]
8,20060401_KMEG_011251,0,O,EXA,FG,Y,3,,2006-04-01 15:00:00+00:00,2006-04-01 06:51:00,2006-04-01 09:00:00-06:00,/O.EXA.KMEG.FG.Y.0003.000000T0000Z-060401T1500Z/,2006-04-01,2006-04-01,[2006-04-01]
9,20060401_KMEG_011251,1,O,CON,FG,Y,3,,2006-04-01 15:00:00+00:00,2006-04-01 06:51:00,2006-04-01 09:00:00-06:00,/O.CON.KMEG.FG.Y.0003.000000T0000Z-060401T1500Z/,2006-04-01,2006-04-01,[2006-04-01]


In [20]:
len(vtec_table_df)

222122

### Merge zone and VTEC tables

In [21]:
combined = pd.merge(zone_table_df, vtec_table_df, on=['m_id', 's_id'])

In [22]:
combined.head()

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,start_UTC,end_UTC,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered
0,20060331_KFSD_010004,0,IAZ002,2,IA,O,EXP,WI,Y,5,,2006-04-01 00:00:00+00:00,2006-03-31 18:04:00,2006-03-31 18:00:00-06:00,/O.EXP.KFSD.WI.Y.0005.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
1,20060331_KFSD_010004,0,IAZ003,3,IA,O,EXP,WI,Y,5,,2006-04-01 00:00:00+00:00,2006-03-31 18:04:00,2006-03-31 18:00:00-06:00,/O.EXP.KFSD.WI.Y.0005.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
2,20060331_KFSD_010004,0,IAZ013,13,IA,O,EXP,WI,Y,5,,2006-04-01 00:00:00+00:00,2006-03-31 18:04:00,2006-03-31 18:00:00-06:00,/O.EXP.KFSD.WI.Y.0005.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
3,20060331_KFSD_010004,0,IAZ014,14,IA,O,EXP,WI,Y,5,,2006-04-01 00:00:00+00:00,2006-03-31 18:04:00,2006-03-31 18:00:00-06:00,/O.EXP.KFSD.WI.Y.0005.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]
4,20060331_KFSD_010004,0,IAZ022,22,IA,O,EXP,WI,Y,5,,2006-04-01 00:00:00+00:00,2006-03-31 18:04:00,2006-03-31 18:00:00-06:00,/O.EXP.KFSD.WI.Y.0005.000000T0000Z-060401T0000Z/,2006-03-31,2006-03-31,[2006-03-31]


In [23]:
len(combined)

2342437

### Keep all types of messaging except expiration notices

In [24]:
# Get all vtec events for zone KSZ102 that are not expiration notices
combined = combined.loc[combined['action'] != 'EXP']
combined.head()

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,start_UTC,end_UTC,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered
69,20060401_KGSP_010900,0,NCZ033,33,NC,O,NEW,LW,Y,27,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01]
70,20060401_KGSP_010900,0,NCZ049,49,NC,O,NEW,LW,Y,27,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01]
71,20060401_KGSP_010900,0,NCZ050,50,NC,O,NEW,LW,Y,27,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01]
72,20060401_KLZK_011002,0,ARZ021,21,AR,O,NEW,FG,Y,5,2006-04-01 10:02:00+00:00,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01]
73,20060401_KLZK_011002,0,ARZ022,22,AR,O,NEW,FG,Y,5,2006-04-01 10:02:00+00:00,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01]


In [25]:
len(combined)

2138742

### Add a row ID just in case any duplicates slipped through

In [26]:
combined['row_id'] = range(len(combined))

### Count how many days are in each row (tells you how many days the alert in each row was in effect)

In [27]:
combined['num_days'] = combined['dates_covered'].apply(lambda x: len(x))
combined

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,start_UTC,end_UTC,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered,row_id,num_days
69,20060401_KGSP_010900,0,NCZ033,033,NC,O,NEW,LW,Y,0027,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],0,1
70,20060401_KGSP_010900,0,NCZ049,049,NC,O,NEW,LW,Y,0027,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],1,1
71,20060401_KGSP_010900,0,NCZ050,050,NC,O,NEW,LW,Y,0027,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],2,1
72,20060401_KLZK_011002,0,ARZ021,021,AR,O,NEW,FG,Y,0005,2006-04-01 10:02:00+00:00,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],3,1
73,20060401_KLZK_011002,0,ARZ022,022,AR,O,NEW,FG,Y,0005,2006-04-01 10:02:00+00:00,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2342432,20181031_KUNR_312031,0,SDZ001,001,SD,O,NEW,WI,Y,0017,2018-11-01 18:00:00+00:00,2018-11-02 01:00:00+00:00,2018-11-01 12:00:00-06:00,2018-11-01 19:00:00-06:00,/O.NEW.KUNR.WI.Y.0017.181101T1800Z-181102T0100Z/,2018-11-01,2018-11-01,[2018-11-01],2138737,1
2342433,20181031_KUNR_312031,0,SDZ012,012,SD,O,NEW,WI,Y,0017,2018-11-01 18:00:00+00:00,2018-11-02 01:00:00+00:00,2018-11-01 12:00:00-06:00,2018-11-01 19:00:00-06:00,/O.NEW.KUNR.WI.Y.0017.181101T1800Z-181102T0100Z/,2018-11-01,2018-11-01,[2018-11-01],2138738,1
2342434,20181031_KTWC_312052,0,AZZ508,508,AZ,O,CON,FZ,W,0004,2018-11-01 09:00:00+00:00,2018-11-01 15:00:00+00:00,2018-11-01 02:00:00-07:00,2018-11-01 08:00:00-07:00,/O.CON.KTWC.FZ.W.0004.181101T0900Z-181101T1500Z/,2018-11-01,2018-11-01,[2018-11-01],2138739,1
2342435,20181031_KTWC_312052,0,AZZ509,509,AZ,O,CON,FZ,W,0004,2018-11-01 09:00:00+00:00,2018-11-01 15:00:00+00:00,2018-11-01 02:00:00-07:00,2018-11-01 08:00:00-07:00,/O.CON.KTWC.FZ.W.0004.181101T0900Z-181101T1500Z/,2018-11-01,2018-11-01,[2018-11-01],2138740,1


### Use reindex to make that many copies of each row (so, an alert for a particular zone that was in effect on 6/17/18 and 6/18/18 would result in two rows, one for each day)

In [28]:
combined = combined.reindex(np.repeat(combined.index.values, combined['num_days']), method='ffill').reset_index(drop=True)
combined

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,start_UTC,end_UTC,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered,row_id,num_days
0,20060401_KGSP_010900,0,NCZ033,033,NC,O,NEW,LW,Y,0027,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],0,1
1,20060401_KGSP_010900,0,NCZ049,049,NC,O,NEW,LW,Y,0027,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],1,1
2,20060401_KGSP_010900,0,NCZ050,050,NC,O,NEW,LW,Y,0027,2006-04-01 14:00:00+00:00,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],2,1
3,20060401_KLZK_011002,0,ARZ021,021,AR,O,NEW,FG,Y,0005,2006-04-01 10:02:00+00:00,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],3,1
4,20060401_KLZK_011002,0,ARZ022,022,AR,O,NEW,FG,Y,0005,2006-04-01 10:02:00+00:00,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2972524,20181031_KUNR_312031,0,SDZ001,001,SD,O,NEW,WI,Y,0017,2018-11-01 18:00:00+00:00,2018-11-02 01:00:00+00:00,2018-11-01 12:00:00-06:00,2018-11-01 19:00:00-06:00,/O.NEW.KUNR.WI.Y.0017.181101T1800Z-181102T0100Z/,2018-11-01,2018-11-01,[2018-11-01],2138737,1
2972525,20181031_KUNR_312031,0,SDZ012,012,SD,O,NEW,WI,Y,0017,2018-11-01 18:00:00+00:00,2018-11-02 01:00:00+00:00,2018-11-01 12:00:00-06:00,2018-11-01 19:00:00-06:00,/O.NEW.KUNR.WI.Y.0017.181101T1800Z-181102T0100Z/,2018-11-01,2018-11-01,[2018-11-01],2138738,1
2972526,20181031_KTWC_312052,0,AZZ508,508,AZ,O,CON,FZ,W,0004,2018-11-01 09:00:00+00:00,2018-11-01 15:00:00+00:00,2018-11-01 02:00:00-07:00,2018-11-01 08:00:00-07:00,/O.CON.KTWC.FZ.W.0004.181101T0900Z-181101T1500Z/,2018-11-01,2018-11-01,[2018-11-01],2138739,1
2972527,20181031_KTWC_312052,0,AZZ509,509,AZ,O,CON,FZ,W,0004,2018-11-01 09:00:00+00:00,2018-11-01 15:00:00+00:00,2018-11-01 02:00:00-07:00,2018-11-01 08:00:00-07:00,/O.CON.KTWC.FZ.W.0004.181101T0900Z-181101T1500Z/,2018-11-01,2018-11-01,[2018-11-01],2138740,1


### Use groupby to find a running count of how many days are in each id

In [29]:
combined['day_index'] = combined.groupby(['m_id', 's_id', 'row_id']).cumcount()
combined

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,...,end_UTC,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered,row_id,num_days,day_index
0,20060401_KGSP_010900,0,NCZ033,033,NC,O,NEW,LW,Y,0027,...,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],0,1,0
1,20060401_KGSP_010900,0,NCZ049,049,NC,O,NEW,LW,Y,0027,...,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],1,1,0
2,20060401_KGSP_010900,0,NCZ050,050,NC,O,NEW,LW,Y,0027,...,2006-04-01 23:00:00+00:00,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],2,1,0
3,20060401_KLZK_011002,0,ARZ021,021,AR,O,NEW,FG,Y,0005,...,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],3,1,0
4,20060401_KLZK_011002,0,ARZ022,022,AR,O,NEW,FG,Y,0005,...,2006-04-01 16:00:00+00:00,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2972524,20181031_KUNR_312031,0,SDZ001,001,SD,O,NEW,WI,Y,0017,...,2018-11-02 01:00:00+00:00,2018-11-01 12:00:00-06:00,2018-11-01 19:00:00-06:00,/O.NEW.KUNR.WI.Y.0017.181101T1800Z-181102T0100Z/,2018-11-01,2018-11-01,[2018-11-01],2138737,1,0
2972525,20181031_KUNR_312031,0,SDZ012,012,SD,O,NEW,WI,Y,0017,...,2018-11-02 01:00:00+00:00,2018-11-01 12:00:00-06:00,2018-11-01 19:00:00-06:00,/O.NEW.KUNR.WI.Y.0017.181101T1800Z-181102T0100Z/,2018-11-01,2018-11-01,[2018-11-01],2138738,1,0
2972526,20181031_KTWC_312052,0,AZZ508,508,AZ,O,CON,FZ,W,0004,...,2018-11-01 15:00:00+00:00,2018-11-01 02:00:00-07:00,2018-11-01 08:00:00-07:00,/O.CON.KTWC.FZ.W.0004.181101T0900Z-181101T1500Z/,2018-11-01,2018-11-01,[2018-11-01],2138739,1,0
2972527,20181031_KTWC_312052,0,AZZ509,509,AZ,O,CON,FZ,W,0004,...,2018-11-01 15:00:00+00:00,2018-11-01 02:00:00-07:00,2018-11-01 08:00:00-07:00,/O.CON.KTWC.FZ.W.0004.181101T0900Z-181101T1500Z/,2018-11-01,2018-11-01,[2018-11-01],2138740,1,0


### Then use apply to look up the corresponding day

In [30]:
combined['day'] = combined.apply(lambda x: x['dates_covered'][x['day_index']], axis=1)
combined.head()

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,...,start_LTZ,end_LTZ,raw_vtec,start_date,end_date,dates_covered,row_id,num_days,day_index,day
0,20060401_KGSP_010900,0,NCZ033,33,NC,O,NEW,LW,Y,27,...,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],0,1,0,2006-04-01
1,20060401_KGSP_010900,0,NCZ049,49,NC,O,NEW,LW,Y,27,...,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],1,1,0,2006-04-01
2,20060401_KGSP_010900,0,NCZ050,50,NC,O,NEW,LW,Y,27,...,2006-04-01 09:00:00-05:00,2006-04-01 18:00:00-05:00,/O.NEW.KGSP.LW.Y.0027.060401T1400Z-060401T2300Z/,2006-04-01,2006-04-01,[2006-04-01],2,1,0,2006-04-01
3,20060401_KLZK_011002,0,ARZ021,21,AR,O,NEW,FG,Y,5,...,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],3,1,0,2006-04-01
4,20060401_KLZK_011002,0,ARZ022,22,AR,O,NEW,FG,Y,5,...,2006-04-01 04:02:00-06:00,2006-04-01 10:00:00-06:00,/O.NEW.KLZK.FG.Y.0005.060401T1002Z-060401T1600Z/,2006-04-01,2006-04-01,[2006-04-01],4,1,0,2006-04-01


In [31]:
len(combined)

2972529

In [32]:
combined.columns

Index(['m_id', 's_id', 'zone', 'zone_num', 'zone_state', 'product', 'action',
       'phenom', 'sig', 'event_id', 'start_UTC', 'end_UTC', 'start_LTZ',
       'end_LTZ', 'raw_vtec', 'start_date', 'end_date', 'dates_covered',
       'row_id', 'num_days', 'day_index', 'day'],
      dtype='object')

### Restrict to heat-related messaging only

In [33]:
combined = combined.loc[(combined['phenom'] == 'HT') | (combined['phenom'] == 'EH')]

# filter for only significance W (warning) and Y (advisory)
combined_heat_wy = combined.loc[(combined['sig'] == 'Y') | (combined['sig'] == 'W')]

# filter for new events only
combined_heat_wy_new = combined_heat_wy.loc[(combined_heat_wy['action'] == 'NEW')]

In [34]:
combined_heat_wy_new.sig.unique()

array(['Y', 'W'], dtype=object)

In [35]:
len(combined_heat_wy_new)

145127

In [36]:
unique_message.head()

Unnamed: 0,m_id,office,issue_datetime,issue_timezone,raw_text,issue_datetime_LTZ
0,20060331_KFSD_010004,KFSD,2006-03-31 18:04:00,CST,WWUS73 KFSD 010004\nNPWFSD\nURGENT - WEATHER M...,2006-03-31 18:04:00-06:00
1,20060331_KLOT_010008,KLOT,2006-03-31 18:08:00,CST,WWUS73 KLOT 010008\nNPWLOT\nURGENT - WEATHER M...,2006-03-31 18:08:00-06:00
2,20060331_KIWX_010054,KIWX,2006-03-31 19:54:00,EST,WWUS73 KIWX 010054\nNPWIWX\nURGENT - WEATHER M...,2006-03-31 19:54:00-05:00
3,20060331_KILX_010110,KILX,2006-03-31 19:10:00,CST,WWUS73 KILX 010110\nNPWILX\nURGENT - WEATHER M...,2006-03-31 19:10:00-06:00
4,20060401_KGSP_010900,KGSP,2006-04-01 04:00:00,EST,WWUS72 KGSP 010900\nNPWGSP\nURGENT - WEATHER M...,2006-04-01 04:00:00-05:00


In [37]:
# how many HT messages before the TZ merge
unique_ht_message = len(combined_heat_wy_new['m_id'].unique())
unique_ht_message

5599

In [38]:
combined_with_LTZ = combined_heat_wy_new.merge(unique_message[['m_id', 'issue_datetime_LTZ']], how='left', on='m_id')
combined_with_LTZ

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,...,end_LTZ,raw_vtec,start_date,end_date,dates_covered,row_id,num_days,day_index,day,issue_datetime_LTZ
0,20060510_KCRP_101915,0,TXZ229,229,TX,O,NEW,HT,Y,0001,...,2006-05-10 19:00:00-05:00,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23160,1,0,2006-05-10,2006-05-10 14:15:00-05:00
1,20060510_KCRP_101915,0,TXZ230,230,TX,O,NEW,HT,Y,0001,...,2006-05-10 19:00:00-05:00,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23161,1,0,2006-05-10,2006-05-10 14:15:00-05:00
2,20060510_KCRP_101915,0,TXZ231,231,TX,O,NEW,HT,Y,0001,...,2006-05-10 19:00:00-05:00,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23162,1,0,2006-05-10,2006-05-10 14:15:00-05:00
3,20060510_KCRP_101915,0,TXZ232,232,TX,O,NEW,HT,Y,0001,...,2006-05-10 19:00:00-05:00,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23163,1,0,2006-05-10,2006-05-10 14:15:00-05:00
4,20060510_KCRP_101915,0,TXZ233,233,TX,O,NEW,HT,Y,0001,...,2006-05-10 19:00:00-05:00,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23164,1,0,2006-05-10,2006-05-10 14:15:00-05:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145122,20180919_KMEG_191702,0,ARZ048,048,AR,O,NEW,HT,Y,0023,...,2018-09-19 18:00:00-05:00,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107140,1,0,2018-09-19,2018-09-19 12:02:00-05:00
145123,20180919_KMEG_191702,0,ARZ049,049,AR,O,NEW,HT,Y,0023,...,2018-09-19 18:00:00-05:00,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107141,1,0,2018-09-19,2018-09-19 12:02:00-05:00
145124,20180919_KMEG_191702,0,ARZ058,058,AR,O,NEW,HT,Y,0023,...,2018-09-19 18:00:00-05:00,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107142,1,0,2018-09-19,2018-09-19 12:02:00-05:00
145125,20180919_KMEG_191702,0,MOZ113,113,MO,O,NEW,HT,Y,0023,...,2018-09-19 18:00:00-05:00,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107143,1,0,2018-09-19,2018-09-19 12:02:00-05:00


In [39]:
# check if any duplicates have been accidentally added
len(combined_with_LTZ['m_id'].unique())

5599

In [40]:
# imoprtant conceptially. Explains the NaN in start_UTC below
combined_with_LTZ['action'].unique()

array(['NEW'], dtype=object)

In [41]:
# finding records with NaN starting time
na_utc = combined_with_LTZ['start_UTC'] == 'NaN'

time_difference = combined_with_LTZ.loc[-na_utc, 'start_UTC'] - combined_with_LTZ.loc[-na_utc, 'issue_datetime_LTZ']
combined_with_LTZ.loc[-na_utc, 'time_delta_issue_start'] = time_difference

# if start time is NaN, meaning it's not a new advisory.
combined_with_LTZ.loc[na_utc, 'time_delta_issue_start'] = np.nan


In [42]:
# the following actions do not have a effective start time, because they are not new advisories
df_nan_time = combined_with_LTZ[na_utc]
df_nan_time['action'].unique()

array([], dtype=object)

In [43]:
# checking the end product
combined_with_LTZ

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,...,raw_vtec,start_date,end_date,dates_covered,row_id,num_days,day_index,day,issue_datetime_LTZ,time_delta_issue_start
0,20060510_KCRP_101915,0,TXZ229,229,TX,O,NEW,HT,Y,0001,...,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23160,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days
1,20060510_KCRP_101915,0,TXZ230,230,TX,O,NEW,HT,Y,0001,...,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23161,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days
2,20060510_KCRP_101915,0,TXZ231,231,TX,O,NEW,HT,Y,0001,...,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23162,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days
3,20060510_KCRP_101915,0,TXZ232,232,TX,O,NEW,HT,Y,0001,...,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23163,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days
4,20060510_KCRP_101915,0,TXZ233,233,TX,O,NEW,HT,Y,0001,...,/O.NEW.KCRP.HT.Y.0001.060510T1915Z-060511T0000Z/,2006-05-10,2006-05-10,[2006-05-10],23164,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145122,20180919_KMEG_191702,0,ARZ048,048,AR,O,NEW,HT,Y,0023,...,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107140,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days
145123,20180919_KMEG_191702,0,ARZ049,049,AR,O,NEW,HT,Y,0023,...,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107141,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days
145124,20180919_KMEG_191702,0,ARZ058,058,AR,O,NEW,HT,Y,0023,...,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107142,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days
145125,20180919_KMEG_191702,0,MOZ113,113,MO,O,NEW,HT,Y,0023,...,/O.NEW.KMEG.HT.Y.0023.180919T1702Z-180919T2300Z/,2018-09-19,2018-09-19,[2018-09-19],2107143,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days


In [44]:
# making sure the timedelta calculation is actually working.
# checking records with advisories not issued as the same time as start time

not_zero_day_index = combined_with_LTZ['time_delta_issue_start'] != combined_with_LTZ['time_delta_issue_start'][0]
ltz_df_quick_check = combined_with_LTZ[['m_id','zone', 'zone_num', 'zone_state', 'action', 'start_UTC', 'end_UTC', 'start_LTZ',
       'end_LTZ','issue_datetime_LTZ', 'time_delta_issue_start']]
ltz_df_quick_check[not_zero_day_index]

Unnamed: 0,m_id,zone,zone_num,zone_state,action,start_UTC,end_UTC,start_LTZ,end_LTZ,issue_datetime_LTZ,time_delta_issue_start
15,20060601_KPSR_012102,AZZ022,022,AZ,NEW,2006-06-02 17:00:00+00:00,2006-06-03 03:00:00+00:00,2006-06-02 10:00:00-07:00,2006-06-02 20:00:00-07:00,2006-06-01 14:02:00-07:00,0 days 19:58:00
16,20060601_KPSR_012102,AZZ023,023,AZ,NEW,2006-06-02 17:00:00+00:00,2006-06-03 03:00:00+00:00,2006-06-02 10:00:00-07:00,2006-06-02 20:00:00-07:00,2006-06-01 14:02:00-07:00,0 days 19:58:00
17,20060601_KPSR_012102,AZZ027,027,AZ,NEW,2006-06-02 17:00:00+00:00,2006-06-03 03:00:00+00:00,2006-06-02 10:00:00-07:00,2006-06-02 20:00:00-07:00,2006-06-01 14:02:00-07:00,0 days 19:58:00
18,20060601_KPSR_012102,AZZ028,028,AZ,NEW,2006-06-02 17:00:00+00:00,2006-06-03 03:00:00+00:00,2006-06-02 10:00:00-07:00,2006-06-02 20:00:00-07:00,2006-06-01 14:02:00-07:00,0 days 19:58:00
19,20060601_KPSR_012215,AZZ022,022,AZ,NEW,2006-06-02 17:00:00+00:00,2006-06-03 03:00:00+00:00,2006-06-02 10:00:00-07:00,2006-06-02 20:00:00-07:00,2006-06-01 15:15:00-07:00,0 days 18:45:00
...,...,...,...,...,...,...,...,...,...,...,...
145108,20180917_KMOB_170813,ALZ266,266,AL,NEW,2018-09-17 15:00:00+00:00,2018-09-17 23:00:00+00:00,2018-09-17 10:00:00-05:00,2018-09-17 18:00:00-05:00,2018-09-17 03:13:00-05:00,0 days 06:47:00
145109,20180917_KMOB_170813,FLZ201,201,FL,NEW,2018-09-17 15:00:00+00:00,2018-09-17 23:00:00+00:00,2018-09-17 10:00:00-05:00,2018-09-17 18:00:00-05:00,2018-09-17 03:13:00-05:00,0 days 06:47:00
145110,20180917_KMOB_170813,FLZ202,202,FL,NEW,2018-09-17 15:00:00+00:00,2018-09-17 23:00:00+00:00,2018-09-17 10:00:00-05:00,2018-09-17 18:00:00-05:00,2018-09-17 03:13:00-05:00,0 days 06:47:00
145111,20180917_KMOB_170813,FLZ203,203,FL,NEW,2018-09-17 15:00:00+00:00,2018-09-17 23:00:00+00:00,2018-09-17 10:00:00-05:00,2018-09-17 18:00:00-05:00,2018-09-17 03:13:00-05:00,0 days 06:47:00


## Matching zones to GEOIDs

In [45]:
zone_to_geoid_filename = "20190705_final_cnty_forecast_zones/cnty10_forecast_zones.csv"
zone_geoid_df = pd.read_csv(zone_to_geoid_filename)
zone_geoid_df.head()

Unnamed: 0,geoid10,cnty,state,zone_id,zone,zone_type,zone_fname,zone_date,cnty_pop10,cnty_pop10_prop,cnty_area_prop
0,1001,Autauga,AL,AL-041,41,zone,z_16mr06,2006-03-16,54571,53179,0.9745
1,1003,Baldwin,AL,AL-062,62,zone,z_16mr06,2006-03-16,182265,76989,0.4224
2,1005,Barbour,AL,AL-050,50,zone,z_16mr06,2006-03-16,27457,26691,0.9721
3,1007,Bibb,AL,AL-034,34,zone,z_16mr06,2006-03-16,22915,22420,0.9784
4,1009,Blount,AL,AL-017,17,zone,z_16mr06,2006-03-16,57322,56382,0.9836


In [46]:
zone_geoid_df[zone_geoid_df['zone_id'] == 'AZ-022']
zone_geoid_df[(zone_geoid_df['zone'] == 22) & (zone_geoid_df['state'] == 'AZ')]

Unnamed: 0,geoid10,cnty,state,zone_id,zone,zone_type,zone_fname,zone_date,cnty_pop10,cnty_pop10_prop,cnty_area_prop


In [47]:
zone_geoid_df[zone_geoid_df['geoid10'] == 1003]

Unnamed: 0,geoid10,cnty,state,zone_id,zone,zone_type,zone_fname,zone_date,cnty_pop10,cnty_pop10_prop,cnty_area_prop
1,1003,Baldwin,AL,AL-062,62,zone,z_16mr06,2006-03-16,182265,76989,0.4224
3222,1003,Baldwin,AL,AL-062,62,zone,z_11mr07,2007-03-11,182265,76989,0.4224
6443,1003,Baldwin,AL,AL-062,62,zone,z_31my07,2007-05-31,182265,76989,0.4224
9664,1003,Baldwin,AL,AL-062,62,zone,z_01au07,2007-08-01,182265,76989,0.4224
12885,1003,Baldwin,AL,AL-062,62,zone,z_5sep07,2007-09-05,182265,76989,0.4224
16106,1003,Baldwin,AL,AL-062,62,zone,z_01ap08,2008-04-01,182265,76989,0.4224
19327,1003,Baldwin,AL,AL-062,62,zone,z_09se08,2008-09-09,182265,76989,0.4224
22548,1003,Baldwin,AL,AL-062,62,zone,z_03oc08,2008-10-03,182265,76989,0.4224
25769,1003,Baldwin,AL,AL-062,62,zone,z_07my09,2009-05-07,182265,76989,0.4224
28990,1003,Baldwin,AL,AL-062,62,zone,z_15jl09,2009-07-15,182265,76989,0.4224


In [48]:
zone_geoid_df = zone_geoid_df[-zone_geoid_df.duplicated(subset=['state', 'zone'])]
zone_geoid_df

Unnamed: 0,geoid10,cnty,state,zone_id,zone,zone_type,zone_fname,zone_date,cnty_pop10,cnty_pop10_prop,cnty_area_prop
0,1001,Autauga,AL,AL-041,41,zone,z_16mr06,2006-03-16,54571,53179,0.9745
1,1003,Baldwin,AL,AL-062,62,zone,z_16mr06,2006-03-16,182265,76989,0.4224
2,1005,Barbour,AL,AL-050,50,zone,z_16mr06,2006-03-16,27457,26691,0.9721
3,1007,Bibb,AL,AL-034,34,zone,z_16mr06,2006-03-16,22915,22420,0.9784
4,1009,Blount,AL,AL-017,17,zone,z_16mr06,2006-03-16,57322,56382,0.9836
...,...,...,...,...,...,...,...,...,...,...,...
135864,16063,Lincoln,ID,ID-051,51,zone,z_10jl18,2018-07-10,5208,5207,0.9999
135865,16065,Madison,ID,ID-053,53,zone,z_10jl18,2018-07-10,37536,25070,0.6679
135866,16067,Minidoka,ID,ID-055,55,zone,z_10jl18,2018-07-10,20069,11415,0.5688
135873,16081,Teton,ID,ID-065,65,zone,z_10jl18,2018-07-10,10170,5553,0.5460


In [49]:
# zone_num was in object dtype previously
combined_with_LTZ['zone_num'] = combined_with_LTZ['zone_num'].astype('int64')


In [50]:
# merge GEOID and conbined_with_LTZ
combined_with_LTZ_geoid = combined_with_LTZ.merge(zone_geoid_df[['geoid10', 'state', 'zone']], how='left', left_on=['zone_state', 'zone_num'], right_on=['state', 'zone'])
combined_with_LTZ_geoid


Unnamed: 0,m_id,s_id,zone_x,zone_num,zone_state,product,action,phenom,sig,event_id,...,dates_covered,row_id,num_days,day_index,day,issue_datetime_LTZ,time_delta_issue_start,geoid10,state,zone_y
0,20060510_KCRP_101915,0,TXZ229,229,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23160,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48283.0,TX,229.0
1,20060510_KCRP_101915,0,TXZ230,230,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23161,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48311.0,TX,230.0
2,20060510_KCRP_101915,0,TXZ231,231,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23162,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48297.0,TX,231.0
3,20060510_KCRP_101915,0,TXZ232,232,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23163,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48025.0,TX,232.0
4,20060510_KCRP_101915,0,TXZ233,233,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23164,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48175.0,TX,233.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145122,20180919_KMEG_191702,0,ARZ048,48,AR,O,NEW,HT,Y,0023,...,[2018-09-19],2107140,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,5123.0,AR,48.0
145123,20180919_KMEG_191702,0,ARZ049,49,AR,O,NEW,HT,Y,0023,...,[2018-09-19],2107141,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,5077.0,AR,49.0
145124,20180919_KMEG_191702,0,ARZ058,58,AR,O,NEW,HT,Y,0023,...,[2018-09-19],2107142,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,5107.0,AR,58.0
145125,20180919_KMEG_191702,0,MOZ113,113,MO,O,NEW,HT,Y,0023,...,[2018-09-19],2107143,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,29069.0,MO,113.0


In [51]:
# zones that cannot be mapped to GEOIDs
missed_matching_zones = combined_with_LTZ_geoid[combined_with_LTZ_geoid['geoid10'].isna()]
missed_matching_zones.head()


Unnamed: 0,m_id,s_id,zone_x,zone_num,zone_state,product,action,phenom,sig,event_id,...,dates_covered,row_id,num_days,day_index,day,issue_datetime_LTZ,time_delta_issue_start,geoid10,state,zone_y
15,20060601_KPSR_012102,0,AZZ022,22,AZ,O,NEW,HT,Y,1,...,[2006-06-02],31425,1,0,2006-06-02,2006-06-01 14:02:00-07:00,0 days 19:58:00,,,
16,20060601_KPSR_012102,0,AZZ023,23,AZ,O,NEW,HT,Y,1,...,[2006-06-02],31426,1,0,2006-06-02,2006-06-01 14:02:00-07:00,0 days 19:58:00,,,
19,20060601_KPSR_012215,0,AZZ022,22,AZ,O,NEW,HT,Y,1,...,[2006-06-02],31429,1,0,2006-06-02,2006-06-01 15:15:00-07:00,0 days 18:45:00,,,
20,20060601_KPSR_012215,0,AZZ023,23,AZ,O,NEW,HT,Y,1,...,[2006-06-02],31430,1,0,2006-06-02,2006-06-01 15:15:00-07:00,0 days 18:45:00,,,
23,20060602_KPSR_021149,0,AZZ020,20,AZ,O,NEW,HT,Y,2,...,[2006-06-03],31433,1,0,2006-06-03,2006-06-02 04:49:00-07:00,1 days 05:11:00,,,


In [52]:
df_missed_zone = pd.DataFrame(columns = ['missed_zone'])
df_missed_zone['missed_zone'] = missed_matching_zones.zone_x.unique()
df_missed_zone.to_csv('zones_without_geoid.csv', index=False)

**prettify the combined df**

In [53]:
combined_with_LTZ_geoid.columns

Index(['m_id', 's_id', 'zone_x', 'zone_num', 'zone_state', 'product', 'action',
       'phenom', 'sig', 'event_id', 'start_UTC', 'end_UTC', 'start_LTZ',
       'end_LTZ', 'raw_vtec', 'start_date', 'end_date', 'dates_covered',
       'row_id', 'num_days', 'day_index', 'day', 'issue_datetime_LTZ',
       'time_delta_issue_start', 'geoid10', 'state', 'zone_y'],
      dtype='object')

In [90]:
combined_with_LTZ_geoid_final = combined_with_LTZ_geoid.loc[:, ~combined_with_LTZ_geoid.columns.isin(['zone_y'])]
combined_with_LTZ_geoid_final = combined_with_LTZ_geoid_final.rename(columns = {'zone_x':'zone'})

# find na values
na_geoid = combined_with_LTZ_geoid_final['geoid10'].isna()

# fill in string version of geoid10
combined_with_LTZ_geoid_final['geoid10_str'] = np.nan

# int to remove .0, str to perform str operation
padded_geoid = combined_with_LTZ_geoid_final.loc[~na_geoid, 'geoid10'].astype(int).astype(str).str.pad(width=5, side='left', fillchar='0')

combined_with_LTZ_geoid_final.loc[~na_geoid, 'geoid10_str'] = padded_geoid
combined_with_LTZ_geoid_final

combined_with_LTZ_geoid_final

Unnamed: 0,m_id,s_id,zone,zone_num,zone_state,product,action,phenom,sig,event_id,...,dates_covered,row_id,num_days,day_index,day,issue_datetime_LTZ,time_delta_issue_start,geoid10,state,geoid10_str
0,20060510_KCRP_101915,0,TXZ229,229,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23160,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48283.0,TX,48283
1,20060510_KCRP_101915,0,TXZ230,230,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23161,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48311.0,TX,48311
2,20060510_KCRP_101915,0,TXZ231,231,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23162,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48297.0,TX,48297
3,20060510_KCRP_101915,0,TXZ232,232,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23163,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48025.0,TX,48025
4,20060510_KCRP_101915,0,TXZ233,233,TX,O,NEW,HT,Y,0001,...,[2006-05-10],23164,1,0,2006-05-10,2006-05-10 14:15:00-05:00,0 days,48175.0,TX,48175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145122,20180919_KMEG_191702,0,ARZ048,48,AR,O,NEW,HT,Y,0023,...,[2018-09-19],2107140,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,5123.0,AR,05123
145123,20180919_KMEG_191702,0,ARZ049,49,AR,O,NEW,HT,Y,0023,...,[2018-09-19],2107141,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,5077.0,AR,05077
145124,20180919_KMEG_191702,0,ARZ058,58,AR,O,NEW,HT,Y,0023,...,[2018-09-19],2107142,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,5107.0,AR,05107
145125,20180919_KMEG_191702,0,MOZ113,113,MO,O,NEW,HT,Y,0023,...,[2018-09-19],2107143,1,0,2018-09-19,2018-09-19 12:02:00-05:00,0 days,29069.0,MO,29069


## Visualize missed zones

In [37]:
# get id of each state for Altair visualization
# per https://gist.github.com/mbostock/4090848#gistcomment-2102151
ansi = pd.read_csv('https://www2.census.gov/geo/docs/reference/state.txt', sep='|')
ansi.columns = ['id', 'abbr', 'state', 'statens']
ansi = ansi[['id', 'abbr', 'state']]
ansi.head()

Unnamed: 0,id,abbr,state
0,1,AL,Alabama
1,2,AK,Alaska
2,4,AZ,Arizona
3,5,AR,Arkansas
4,6,CA,California


In [38]:
df_missed_zone = pd.read_csv('zones_without_geoid.csv')
df_missed_zone.head()

Unnamed: 0,missed_zone
0,AZZ022
1,AZZ023
2,AZZ020
3,AZZ025
4,CAZ031


In [39]:
df_missed_state = [el[0:2] for el in df_missed_zone['missed_zone'].tolist()]

In [45]:
missed_state, missed_counts = np.unique(df_missed_state, return_counts=True)

In [52]:
df_missed_state_count = pd.DataFrame(columns=['State', 'Counts'])
df_missed_state_count['State'] = missed_state
df_missed_state_count['Counts'] = missed_counts


In [53]:
df_missed_state_count.tail()


Unnamed: 0,State,Counts
32,VA,5
33,VT,5
34,WA,20
35,WI,2
36,WV,2


In [56]:
state_missed_w_id = df_missed_state_count.merge(ansi[['id', 'abbr']], how='left', left_on='State', right_on='abbr')
state_missed_w_id.head()

Unnamed: 0,State,Counts,id,abbr
0,AL,6,1,AL
1,AZ,53,4,AZ
2,CA,78,6,CA
3,CO,4,8,CO
4,CT,5,9,CT


In [65]:
states = alt.topo_feature(data.us_10m.url, feature='states')

background = alt.Chart(states).mark_geoshape(
    fill='lightgray',
    stroke='white'
)

chart = alt.Chart(states).mark_geoshape(stroke='black').encode(
    color='Counts:Q',
    tooltip=['State:N', 'Counts:Q']
).transform_lookup(
    lookup='id',
    from_=alt.LookupData(state_missed_w_id, 'id', list(state_missed_w_id.columns))
).project('albersUsa').properties(
    width=850,
    height=600
)

viz = background + chart
viz

In [66]:
# viz.save('chart.png', scale_factor=2.0)

ValueError: Saving charts in 'png' format requires the altair_saver package: see http://github.com/altair-viz/altair_saver/

In [74]:
# zone_tx_df[['geoid10', 'state', 'zone']]

Unnamed: 0,geoid10,state,zone
2610,48175,TX,233
2664,48283,TX,229
2671,48297,TX,231
2678,48311,TX,230


In [None]:
# combined_with_LTZ_geoid = combined_with_LTZ.merge(zone_geoid_df[['geoid10', 'state', 'zone']], how='left', left_on=['zone_state', 'zone_num'], right_on=['state', 'zone'])

In [90]:
# combined_with_LTZ

In [91]:
# zone_geoid_df['zone']

# (3) Extract parsed data by state (NB: this is clunky but it's how I originally wrote and tested the code)

### Test out in New England (6 states)

In [None]:
# Connecticut
ct = combined.loc[combined['zone_state'] == 'CT']

In [None]:
len(ct)

In [None]:
ct.to_csv('ct_2006_2018.csv', index=False)

In [None]:
# Massachusetts
ma = combined.loc[combined['zone_state'] == 'MA']

In [None]:
len(ma)

In [None]:
ma.to_csv('ma_2006_2018.csv', index=False)

In [None]:
# Maine
me = combined.loc[combined['zone_state'] == 'ME']

In [None]:
len(me)

In [None]:
me.to_csv('me_2006_2018.csv', index=False)

In [None]:
# New Hampshire
nh = combined.loc[combined['zone_state'] == 'NH']

In [None]:
len(nh)

In [None]:
nh.to_csv('nh_2006_2018.csv', index=False)

In [None]:
# Rhode Island
ri = combined.loc[combined['zone_state'] == 'RI']

In [None]:
len(ri)

In [None]:
ri.to_csv('ri_2006_2018.csv', index=False)

In [None]:
# Vermont
vt = combined.loc[combined['zone_state'] == "VT"]

In [None]:
len(vt)

In [None]:
vt.to_csv('vt_2006_2018.csv', index=False)

### Now do the other 44 states and District of Columbia

In [None]:
# (1) Alabama
al = combined.loc[combined['zone_state'] == 'AL']
len(al)

In [None]:
# (2) Alaska 
ak = combined.loc[combined['zone_state'] == 'AK']
len(ak)

In [None]:
# (3) Arizona
az = combined.loc[combined['zone_state'] == 'AZ']
len(az)

In [None]:
# (4) Arkansas
ar = combined.loc[combined['zone_state'] == 'AR']
len(ar)

In [None]:
# (5) California
ca = combined.loc[combined['zone_state'] == 'CA']
len(ca)

In [None]:
# (6) Colorado
co = combined.loc[combined['zone_state'] == 'CO']
len(co)

In [None]:
# (7) Connecticut, done above

In [None]:
# (8) Delaware
de = combined.loc[combined['zone_state'] == 'DE']
len(de)

In [None]:
# (9) DC
dc = combined.loc[combined['zone_state'] == 'DC']
len(dc)

In [None]:
# (10) Florida
fl = combined.loc[combined['zone_state'] == 'FL']
len(fl)

In [None]:
# (11) Georgia
ga = combined.loc[combined['zone_state'] == 'GA']
len(ga)

In [None]:
# (12) Hawaii
hi = combined.loc[combined['zone_state'] == 'HI']
len(hi)

In [None]:
# (13) Idaho
ida = combined.loc[combined['zone_state'] == 'ID']
len(ida)

In [None]:
# (14) Illinois
il = combined.loc[combined['zone_state'] == 'IL']
len(il)

In [None]:
# (15) Indiana
ind = combined.loc[combined['zone_state'] == 'IN']
len(ind)

In [None]:
# (16) Iowa
ia = combined.loc[combined['zone_state'] == 'IA']
len(ia)

In [None]:
# (17) Kansas
ks = combined.loc[combined['zone_state'] == 'KS']
len(ks)

In [None]:
# (18) Kentucky
ky = combined.loc[combined['zone_state'] == 'KY']
len(ky)

In [None]:
# (19) Louisiana
la = combined.loc[combined['zone_state'] == 'LA']
len(la)

In [None]:
# (20) Maine, done above

In [None]:
# (21) Maryland
md = combined.loc[combined['zone_state'] == 'MD']
len(md)

In [None]:
# (22) Massachusetts, done above

In [None]:
# (23) Michigan
mi = combined.loc[combined['zone_state'] == 'MI']
len(mi)

In [None]:
# (24) Minnesota
mn = combined.loc[combined['zone_state'] == 'MN']
len(mn)

In [None]:
# (25) Mississippi
ms = combined.loc[combined['zone_state'] == 'MS']
len(ms)

In [None]:
# (26) Missouri
mo = combined.loc[combined['zone_state'] == 'MO']
len(mo)

In [None]:
# (27) Montana
mt = combined.loc[combined['zone_state'] == 'MT']
len(mt)

In [None]:
# (28) Nebraska
ne = combined.loc[combined['zone_state'] == 'NE']
len(ne)

In [None]:
# (29) Nevada
nv = combined.loc[combined['zone_state'] == 'NV']
len(nv)

In [None]:
# (30) New Hampshire, done above

In [None]:
# (31) New Jersey
nj = combined.loc[combined['zone_state'] == 'NJ']
len(nj)

In [None]:
# (32) New Mexico
nm = combined.loc[combined['zone_state'] == 'NM']
len(nm)

In [None]:
# (33) New York
ny = combined.loc[combined['zone_state'] == 'NY']
len(ny)

In [None]:
# (34) North Carolina
nc = combined.loc[combined['zone_state'] == 'NC']
len(nc)

In [None]:
# (35) North Dakota
nd = combined.loc[combined['zone_state'] == 'ND']
len(nd)

In [None]:
# (36) Ohio
oh = combined.loc[combined['zone_state'] == 'OH']
len(oh)

In [None]:
# (37) Oklahoma
ok = combined.loc[combined['zone_state'] == 'OK']
len(ok)

In [None]:
# (38) Oregon
ore = combined.loc[combined['zone_state'] == 'OR']
len(ore)

In [None]:
# (39) Pennsylvania
pa = combined.loc[combined['zone_state'] == 'PA'] 
len(pa)

In [None]:
# (40) Rhode Island, done above

In [None]:
# (41) South Carolina
sc = combined.loc[combined['zone_state'] == 'SC']
len(sc)

In [None]:
# (42) South Dakota
sd = combined.loc[combined['zone_state'] == 'SD']
len(sd)

In [None]:
# (43) Tennessee
tn = combined.loc[combined['zone_state'] == 'TN']
len(tn)

In [None]:
# (44) Texas
tx = combined.loc[combined['zone_state'] == 'TX']
len(tx)

In [None]:
# (45) Utah
ut = combined.loc[combined['zone_state'] == 'UT']
len(ut)

In [None]:
# (46) Vermont, done above

In [None]:
# (47) Virginia
va = combined.loc[combined['zone_state'] == 'VA']
len(va)

In [None]:
# (48) Washington
wa = combined.loc[combined['zone_state'] == 'WA']
len(wa)

In [None]:
# (49) West Virginia
wv = combined.loc[combined['zone_state'] == 'WV']
len(wv)

In [None]:
# (50) Wisconsin
wi = combined.loc[combined['zone_state'] == 'WI']
len(wi)

In [None]:
# (51) Wyoming
wy = combined.loc[combined['zone_state'] == 'WY']
len(wy)

### Check to make sure I'm not missing any alerts when extracting by state:

In [None]:
remain = combined.loc[(combined['zone_state'] != 'AL') &
                      (combined['zone_state'] != 'AK') &
                      (combined['zone_state'] != 'AZ') &
                      (combined['zone_state'] != 'AR') &
                      (combined['zone_state'] != 'CA') &
                      (combined['zone_state'] != 'CO') &
                      (combined['zone_state'] != 'CT') &
                      (combined['zone_state'] != 'DE') &
                      (combined['zone_state'] != 'DC') &
                      (combined['zone_state'] != 'FL') &
                      (combined['zone_state'] != 'GA') &
                      (combined['zone_state'] != 'HI') &
                      (combined['zone_state'] != 'ID') &
                      (combined['zone_state'] != 'IL') &
                      (combined['zone_state'] != 'IN') &
                      (combined['zone_state'] != 'IA') &
                      (combined['zone_state'] != 'KS') &
                      (combined['zone_state'] != 'KY') &
                      (combined['zone_state'] != 'LA') &
                      (combined['zone_state'] != 'ME') &
                      (combined['zone_state'] != 'MD') &
                      (combined['zone_state'] != 'MA') &
                      (combined['zone_state'] != 'MI') &
                      (combined['zone_state'] != 'MN') &
                      (combined['zone_state'] != 'MS') &
                      (combined['zone_state'] != 'MO') &
                      (combined['zone_state'] != 'MT') &
                      (combined['zone_state'] != 'NE') &
                      (combined['zone_state'] != 'NV') &
                      (combined['zone_state'] != 'NH') &
                      (combined['zone_state'] != 'NJ') &
                      (combined['zone_state'] != 'NM') &
                      (combined['zone_state'] != 'NY') &
                      (combined['zone_state'] != 'NC') &
                      (combined['zone_state'] != 'ND') &
                      (combined['zone_state'] != 'OH') &
                      (combined['zone_state'] != 'OK') &
                      (combined['zone_state'] != 'OR') &
                      (combined['zone_state'] != 'PA') &
                      (combined['zone_state'] != 'RI') &
                      (combined['zone_state'] != 'SC') &
                      (combined['zone_state'] != 'SD') &
                      (combined['zone_state'] != 'TN') &
                      (combined['zone_state'] != 'TX') &
                      (combined['zone_state'] != 'UT') &
                      (combined['zone_state'] != 'VT') &
                      (combined['zone_state'] != 'VA') &
                      (combined['zone_state'] != 'WA') &
                      (combined['zone_state'] != 'WI') &
                      (combined['zone_state'] != 'WV') &
                      (combined['zone_state'] != 'WY')]
                      
len(remain)

### Now export all the state-level files to csv for further processing in R:

In [None]:
al.to_csv('al_2006_2018.csv', index=False)
ak.to_csv('ak_2006_2018.csv', index=False)
az.to_csv('az_2006_2018.csv', index=False)
ar.to_csv('ar_2006_2018.csv', index=False)
ca.to_csv('ca_2006_2018.csv', index=False)
co.to_csv('co_2006_2018.csv', index=False)
ct.to_csv('ct_2006_2018.csv', index=False)
de.to_csv('de_2006_2018.csv', index=False)
dc.to_csv('dc_2006_2018.csv', index=False)
fl.to_csv('fl_2006_2018.csv', index=False)
ga.to_csv('ga_2006_2018.csv', index=False)
hi.to_csv('hi_2006_2018.csv', index=False)
ida.to_csv('ida_2006_2018.csv', index=False)
il.to_csv('il_2006_2018.csv', index=False)
ind.to_csv('ind_2006_2018.csv', index=False)
ia.to_csv('ia_2006_2018.csv', index=False)
ks.to_csv('ks_2006_2018.csv', index=False)
ky.to_csv('ky_2006_2018.csv', index=False)
la.to_csv('la_2006_2018.csv', index=False)
me.to_csv('me_2006_2018.csv', index=False)
md.to_csv('md_2006_2018.csv', index=False)
ma.to_csv('ma_2006_2018.csv', index=False)
mi.to_csv('mi_2006_2018.csv', index=False)
mn.to_csv('mn_2006_2018.csv', index=False)
ms.to_csv('ms_2006_2018.csv', index=False)
mo.to_csv('mo_2006_2018.csv', index=False)
mt.to_csv('mt_2006_2018.csv', index=False)
ne.to_csv('ne_2006_2018.csv', index=False)
nv.to_csv('nv_2006_2018.csv', index=False)
nh.to_csv('nh_2006_2018.csv', index=False)
nj.to_csv('nj_2006_2018.csv', index=False)
nm.to_csv('nm_2006_2018.csv', index=False)
ny.to_csv('ny_2006_2018.csv', index=False)
nc.to_csv('nc_2006_2018.csv', index=False)
nd.to_csv('nd_2006_2018.csv', index=False)
oh.to_csv('oh_2006_2018.csv', index=False)
ok.to_csv('ok_2006_2018.csv', index=False)
ore.to_csv('ore_2006_2018.csv', index=False)
pa.to_csv('pa_2006_2018.csv', index=False)
ri.to_csv('ri_2006_2018.csv', index=False)
sc.to_csv('sc_2006_2018.csv', index=False)
sd.to_csv('sd_2006_2018.csv', index=False)
tn.to_csv('tn_2006_2018.csv', index=False)
tx.to_csv('tx_2006_2018.csv', index=False)
ut.to_csv('ut_2006_2018.csv', index=False)
vt.to_csv('vt_2006_2018.csv', index=False)
va.to_csv('va_2006_2018.csv', index=False)
wa.to_csv('wa_2006_2018.csv', index=False)
wv.to_csv('wv_2006_2018.csv', index=False)
wi.to_csv('wi_2006_2018.csv', index=False)
wy.to_csv('wy_2006_2018.csv', index=False)

# Alternate code to extract a single csv file containing all states:
# combined.to_csv('all_states_2006_2018.csv', index=False)