In [1]:
import datetime
import glob
import gzip
import os
import pathlib

import numpy as np
import pandas as pd
import pygrib as pg
import pyproj
from scipy import stats

# NOTE(review): `dc` is used below for make_continuous, TornadoDistributions,
# ImpactGrids, simulate, etc., none of which exist in the stdlib `dataclasses`
# module -- presumably a project-local module of the same name; confirm.
import dataclasses as dc

Initial Variables

In [2]:
# --- Simulation settings ---
ndfd_area = 25
nsims = 10000
tornado_direction_distribution = stats.norm(loc=50, scale=15)
coolseason = [1, 2, 3, 4, 11, 12]  # month numbers compared against the outlook's month

# --- Impacts data: environment variable & resolved root directory ---
os.environ['IMPACTSDATA'] = "../impacts-data"
impacts_data_root = (
    pathlib.Path(os.environ['IMPACTSDATA']) / "pas-input-data"
).expanduser().resolve()

# --- PP forecast file list ---
pp_reg_files = glob.glob('./pp_forecasts/regProbs/*')

Function to read pp file and format it properly

In [35]:
# Gather every finished simulation file, ordered by modification time (oldest first).
filepath = '/Volumes/Backup Plus/pas_output/simulationFiles/*'
paths = glob.glob(filepath)
paths.sort(key=os.path.getmtime)

# One past the position of the 195001031630 output; raises ValueError if that file is absent.
lastIdx = 1 + paths.index('/Volumes/Backup Plus/pas_output/simulationFiles/195001031630_pp.psv.gz')
#toRun = np.arange(0,lastIdx)

In [53]:
# Last entry of the mtime-sorted list, i.e. the most recently modified simulation file.
paths[-1]

'/Volumes/Backup Plus/pas_output/simulationFiles/199812051630_pp.psv.gz'

In [13]:
# Sanity check: lastIdx - 1 should point back at the file that lastIdx was derived from.
paths[lastIdx-1]

'/Volumes/Backup Plus/pas_output/simulationFiles/195001031630_pp.psv.gz'

In [11]:
def read_pp_file(file):
    """Read a PP forecast archive and format it properly for the PAS.

    Parameters
    ----------
    file : str, path-like or binary file object
        Passed straight to ``numpy.load``. The archive is expected to be an
        ``.npz`` file containing an array stored under the key ``fcst``.

    Returns
    -------
    numpy.ndarray
        ``fcst`` multiplied by 100, with every entry that is exactly zero
        replaced by ``-1.``.

    Raises
    ------
    KeyError
        If the archive has no ``fcst`` entry.
    """
    # Open the archive as a context manager so its file handle is released
    # immediately instead of lingering while thousands of files are processed.
    with np.load(file) as data:
        probs = data['fcst'] * 100

    return np.where(probs == 0, -1., probs)

Output directory

In [12]:
# Absolute directory that the simulation loop writes its gzipped PSV files into.
outdir = pathlib.Path("/","Volumes","Backup Plus","pas_output","simulationFiles").resolve()
# Directory creation is disabled here, so the folder has to exist before anything is written.
#outdir.mkdir(exist_ok=True)

## Loop through PP files and write out simulation files

In [64]:
# Updated to only run sim on older dates that need re-running
# (iterate over `paths` instead of the slice to process every file).
for i, string in enumerate(paths[3043:3045]):

    if i % 100 == 0:
        print(f'Running file #{i}')

    ### Get actual file by crosschecking the already finished simulations
    # Everything before "1630" in the finished file's name is the outlook date.
    newDate = string.split('/')[-1].split('1630')[0]
    file = glob.glob(f'./pp_forecasts/regProbs/{newDate}*')[0]

    ### Setup Simulation ###
    # Determine Date/Time of outlook from filename
    date_in_name = file.split("/")[-1].split(".")[0]
    dt = datetime.datetime.strptime(date_in_name, "%Y%m%d_%H%M")
    outfile = outdir.joinpath(f"{dt.strftime('%Y%m%d%H%M')}_pp_new.psv.gz")

    # Read file
    torn = read_pp_file(file)

    # Make continuous probs
    try:
        continuous_torn = dc.make_continuous(torn)
    except ValueError:
        # read_pp_file replaces zero probabilities with -1, so a forecast with
        # no probabilities at all has max() == -1, never 0. Treat any
        # non-positive maximum as "nothing to simulate" and write an empty file.
        if torn.max() <= 0:
            with gzip.GzipFile(outfile, "w") as OUT:
                OUT.write("".encode())
        else:
            print("There was an uncaught error converting to continuous probabilities. Skipping file...")
        # Move on to the next file either way; exiting here would abandon the
        # rest of the batch.
        continue

    sig_file = f'./pp_forecasts/sigProbs/{date_in_name}.npz'
    if glob.glob(sig_file):

        sigtorn = read_pp_file(sig_file).astype(int)

        # Need this line to format sig torn file to work properly with logic (-1s replaced with 0s)
        sigtorn[sigtorn == -1] = 0

    else:

        # If there is no PP sig file, then create a 2-d array of 0s.
        # np.zeros is required here: np.ndarray(shape=...) only allocates
        # memory and leaves arbitrary values in it.
        sigtorn = np.zeros(np.shape(torn), dtype=int)

    # Where there are 10 sig tor probs, set to 1
    sigtorn[sigtorn > 0] = 1

    # Make double sig (if necessary)
    if (torn.max() >= 30) and (sigtorn.max() > 0):
        sigtorn[torn >= 15] += 1

    # usesig changes the distribution used for tornado sampling
    sigtorn_1d = sigtorn.ravel()
    usesig = bool((dt.month in coolseason) or (sigtorn.max() > 0))

    ### Run Tornado Count Simulation ###
    tornado_dists = dc.TornadoDistributions()
    counts = np.zeros((5, nsims), dtype=int)
    counts[0, :] = (tornado_dists.f02.rvs(nsims) * ndfd_area * (torn == 2).sum()).astype(int)
    counts[1, :] = (tornado_dists.f05.rvs(nsims) * ndfd_area * (torn == 5).sum()).astype(int)
    counts[2, :] = (tornado_dists.f10.rvs(nsims) * ndfd_area * (torn == 10).sum()).astype(int)
    counts[3, :] = (tornado_dists.f15.rvs(nsims) * ndfd_area * (torn == 15).sum()).astype(int)
    counts[4, :] = (tornado_dists.f30.rvs(nsims) * ndfd_area * (torn >= 30).sum()).astype(int)

    ### Setup Impact Simulation ###
    # NOTE(review): rebuilt on every iteration; if ImpactGrids holds no
    # per-file state it could be constructed once before the loop -- confirm.
    igrids = dc.ImpactGrids(impacts_data_root)

    # Determine the indices of tornadoes for each prob level
    scounts = counts.sum(axis=1)
    inds02 = dc.weighted_choice(prob=2, probs=torn, cprobs=continuous_torn, size=scounts[0])
    inds05 = dc.weighted_choice(prob=5, probs=torn, cprobs=continuous_torn, size=scounts[1])
    inds10 = dc.weighted_choice(prob=10, probs=torn, cprobs=continuous_torn, size=scounts[2])
    inds15 = dc.weighted_choice(prob=15, probs=torn, cprobs=continuous_torn, size=scounts[3])
    inds30 = dc.weighted_choice(prob=30, probs=torn, cprobs=continuous_torn, size=scounts[4])
    inds = dc.flatten_list([inds02, inds05, inds10, inds15, inds30])

    # Boolean masks over the sampled tornadoes, keyed by the sig level at each location
    non_sig_inds = sigtorn_1d[inds] == 0
    single_sig_inds = sigtorn_1d[inds] == 1
    double_sig_inds = sigtorn_1d[inds] == 2

    if usesig:
        # Promote every non-sig tornado to the single-sig group
        single_sig_inds += non_sig_inds
        non_sig_inds[:] = False

    # Handle Locations
    non_sig_loc_inds = inds[non_sig_inds]
    single_sig_loc_inds = inds[single_sig_inds]
    double_sig_loc_inds = inds[double_sig_inds]

    # Handle Ratings
    _mags = [0, 1, 2, 3, 4, 5]
    non_sig_ratings = np.random.choice(_mags, size=non_sig_inds.sum(),
                                       replace=True, p=tornado_dists.r_nonsig)
    single_sig_ratings = np.random.choice(_mags, size=single_sig_inds.sum(),
                                          replace=True, p=tornado_dists.r_singlesig)
    double_sig_ratings = np.random.choice(_mags, size=double_sig_inds.sum(),
                                          replace=True, p=tornado_dists.r_doublesig)

    # Handle Distances
    non_sig_distances = dc.get_distances(non_sig_ratings, tornado_dists)
    single_sig_distances = dc.get_distances(single_sig_ratings, tornado_dists)
    double_sig_distances = dc.get_distances(double_sig_ratings, tornado_dists)

    # Run the impact simulation separately for each sig group
    non_sig = dc.simulate(non_sig_loc_inds, non_sig_distances,
                          non_sig_ratings, tornado_direction_distribution, igrids)
    single_sig = dc.simulate(single_sig_loc_inds, single_sig_distances,
                             single_sig_ratings, tornado_direction_distribution, igrids)
    double_sig = dc.simulate(double_sig_loc_inds, double_sig_distances,
                             double_sig_ratings, tornado_direction_distribution, igrids)

    # Split the pooled, shuffled tornadoes back out into one group per simulation
    simulated_tornadoes = dc.flatten_list([non_sig, single_sig, double_sig])
    np.random.shuffle(simulated_tornadoes)
    _sims = np.split(simulated_tornadoes, counts.sum(axis=0).cumsum())[:-1]
    realizations = dc.Realizations([dc.SyntheticTornadoRealization(_sim, i+1) for i, _sim in enumerate(_sims)])

    # Write out gzipped PSV file
    with gzip.GzipFile(outfile, "w") as OUT:
        OUT.write(realizations.as_psv.encode())

Running file #0


Files with errors...

These errors come from arrays that contain no probabilities, which happens when the one tor file has a day with a tornado in PR or somewhere else.

In [56]:
# Position of the 199104091630 output within `paths`.
paths.index('/Volumes/Backup Plus/pas_output/simulationFiles/199104091630_pp.psv.gz')

3044

In [61]:
# Print the single path at index 3044 to confirm which file that slice selects.
for i, string in enumerate(paths[3044:3045]):
    print(string)

/Volumes/Backup Plus/pas_output/simulationFiles/199104091630_pp.psv.gz


## Getting bad files

## To do once sims are done
- Run through all files, read the data into a DataFrame, and inspect the `counties` column. If any tornado row lists more than ~4 counties, add the file name to a list for further inspection
- Re-run sims on 19830607
- Re-run sims on 19990601
- Re-run sims on 20110427

In [14]:
import pandas as pd

In [30]:
filepath = '/Volumes/Backup Plus/pas_output/simulationFiles/*'
paths = glob.glob(filepath)

# Re-check the previously flagged files when a list already exists in the
# kernel; on a fresh kernel there is none yet, so scan every simulation file.
previous = globals().get('naughtyList')
to_check = paths if previous is None else list(previous)
naughtyList = []

for path in to_check:
    # Entries loaded back from a text file may still carry a line terminator
    path = path.strip()

    try:
        df = pd.read_csv(path, sep='|', nrows=50)
    except pd.errors.EmptyDataError:
        # Files with no content (such as the empty outputs written for
        # forecasts without probabilities) have nothing to inspect
        continue

    # Mean number of comma-separated entries in `counties` over the rows read
    cos_impacted = df['counties'].apply(lambda x: len(str(x).split(','))).mean()

    if cos_impacted > 2:
        naughtyList.append(path)

In [32]:
# Files whose mean county count exceeded the threshold in the scan above.
naughtyList

['/Volumes/Backup Plus/pas_output/simulationFiles/199812051630_pp.psv.gz']

In [28]:
# Persist the flagged paths to disk, one path per line.
with open('naughtylist.txt', 'w') as filehandle:
    filehandle.writelines(f'{listitem}\n' for listitem in naughtyList)

In [5]:
# Load the flagged paths back, one per line. splitlines() drops the line
# terminators that readlines() would leave on every entry, so each element is
# a usable file path again.
with open('naughtylist.txt') as f:
    naughtyList = f.read().splitlines()

In [9]:
import numpy as np

In [17]:
# Two toy arrays with gaps at positions 2 and 3: `a` hides them behind a mask,
# `b` marks them with NaN.
b = np.array([1.0, 2.0, np.nan, np.nan, 5.0])
a = np.ma.array(
    [1, 2, 3, 4, 5],
    mask=[False, False, True, True, False],
)

In [19]:
# Two-point average via convolution; every window that touches a NaN comes out as NaN.
np.convolve(b,[0.5,0.5],'valid')

array([1.5, nan, nan, nan])

In [25]:
# Pad `b` with its own ends: element 4 goes in front, element 0 goes at the back.
c = np.concatenate(([b[4]], b))
d = np.concatenate((c, [b[0]]))

In [33]:
# Three-wide moving average over `d` that skips NaNs: each window's sum
# (NaNs counted as 0) is divided by the number of non-NaN entries in it.
mov_avg = []
start, end = 0, 3

while end <= len(d):
    window = d[start:end]
    valid = np.count_nonzero(~np.isnan(window))
    mov_avg.append(np.sum(np.nan_to_num(window)) / valid)
    start, end = start + 1, end + 1

In [34]:
# One averaged value per three-wide window of `d`.
mov_avg

[2.6666666666666665, 1.5, 2.0, 5.0, 3.0]

In [30]:
# Number of entries in `d` that are not NaN.
np.count_nonzero(~np.isnan(d))

5

In [36]:
# Quick check of count_nonzero on a plain list: only the non-zero entries are counted.
np.count_nonzero([0,1,0,3])

2