This script grabs out T2M data from the ISD station data and puts it in a netcdf file.
The following data are chosen:
- dates from 1979-2014
- only stations with latitudes north of 30N
- only stations with more than 20 years of data between 1979 and 2014.

The 29th Feb is removed from leap years.
T2M is set to nan when there is no data.

In [1]:
import pandas as pd
import sys 
import csv
import xarray as xr
import numpy as np
from CASutils import calendar_utils as cal
from math import nan as nan

Specify start year and end year for analysis.  Generate time axis for output and remove Feb 29th. 

In [2]:
# Analysis period (both end years inclusive)
ystart = 1979
yend = 2014
nyears = yend - ystart + 1

# Daily output calendar from 1 Jan of the first year to 31 Dec of the last year
timeout = pd.date_range(start=f"{ystart}-01-01", end=f"{yend}-12-31")
# Drop every Feb 29th so that each year contributes exactly 365 days
is_leap_day = (timeout.month == 2) & (timeout.day == 29)
timeout = timeout[~is_leap_day]

Set up paths of data, inventory, isd-history file and output path

In [3]:
# Month abbreviations; used further down as column names when summing the
# monthly observation counts of the inventory table.
monstrings = [
    'JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
    'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC',
]

# Root of the per-year station files; files are read as <datpath><year>/<usaf+wban>.csv
datpath = "/project/mojave/observations/ISD/global-summary-of-the-day/archive/"
# Fixed-width station history listing (parsed by character position)
statfile = "/project/mojave/observations/ISD/global-summary-of-the-day/isd-history.txt"
# Station inventory table, read with pandas
inventfile = "/project/mojave/observations/ISD/global-summary-of-the-day/isd-inventory.csv"
# NOTE(review): fileout is not referenced anywhere else in this file; the final
# to_netcdf call writes to a different hard-coded path — confirm which is intended.
fileout = "/project/cas/islas/python/ISD/T2M_ISD_30Nto90N_1979_2014.nc"

Set up character locations of required columns from isd-history.txt.  Not sure there's a better way to parse this file that is space delimited but also with spaces in the titles and entries.

In [4]:
# [start, stop] character offsets of each field within a line of isd-history.txt.
# Each pair is used as a Python slice line[start:stop], so stop is exclusive.
# Listed here in left-to-right column order.
usafstr = [0, 6]
wbanstr = [7, 12]
latstr = [57, 64]
lonstr = [65, 73]
datestart = [82, 90]
dateend = [91, 99]

Open the isd-history file and grab out relevant station information

In [5]:
# Parse the fixed-width station history file into parallel lists of raw
# (unconverted) strings, one entry per station line.
usaf=[] ; wban=[] ; dates=[] ; datee=[] ; lon=[] ; lat=[]

# the context manager guarantees the file is closed even if parsing raises
with open(statfile,"r") as f:
    # skip the 22 header lines that precede the station table
    for _ in range(22):
        f.readline()

    for line in f:
        # slice each field out by its character offsets
        dates.append(line[datestart[0]:datestart[1]])
        datee.append(line[dateend[0]:dateend[1]])
        lon.append(line[lonstr[0]:lonstr[1]])
        lat.append(line[latstr[0]:latstr[1]])
        usaf.append(line[usafstr[0]:usafstr[1]])
        wban.append(line[wbanstr[0]:wbanstr[1]])

Make a list of dictionaries (one per station) containing the station information

In [6]:
# One dict per station, built by zipping the parallel lists read from
# isd-history.txt; all values are still the raw strings sliced from the file.
station_fields = ('wban', 'usaf', 'lat', 'lon', 'dates', 'datee')
dictstat = [dict(zip(station_fields, record))
            for record in zip(wban, usaf, lat, lon, dates, datee)]

In [7]:
# Station inventory table. Later cells filter it on its 'USAF', 'WBAN' and
# 'YEAR' columns and sum its monthly columns ('JAN' .. 'DEC').
# NOTE(review): the stray text printed after this cell looks like the tail of a
# pandas warning raised by this read (possibly a mixed-dtype warning) — TODO
# confirm; if so, an explicit dtype for the ID columns may be needed.
inventory = pd.read_csv(inventfile)

  interactivity=interactivity, compiler=compiler, result=result)


Select only stations whose record extends from 1979 to 2014, that have a latitude greater than or equal to 30°N, and that have more than 20 years' worth of data in that period.

In [8]:
# Select the stations to process. A station is kept when
#   (a) its latitude is at or north of 30N,
#   (b) its isd-history record begins before ystart and ends in yend or later, and
#   (c) the inventory lists more than 20 distinct years between ystart and yend
#       for this particular USAF/WBAN pair.
# Outputs are parallel lists with one entry per selected station.
statuse=[]
statname=[]
usaf=[]
wban=[]
latstation=[]
lonstation=[]
for key in dictstat:
    # blank / unparseable coordinates get a sentinel that fails the latitude cut
    try:
        latflt=float(key['lat'])
    except ValueError:
        latflt=-9999.

    try:
        lonflt=float(key['lon'])
    except ValueError:
        lonflt=-9999.

    # record begin/end dates as YYYYMMDD numbers
    datebegflt = float(key['dates'])
    dateendflt = float(key['datee'])

    # cut on latitude, year start and year end first: these checks are cheap, so
    # the inventory is only searched for stations that can still qualify
    if not ((latflt >= 30.) and (datebegflt < ystart*10000) and (dateendflt >= (yend*10000+1))):
        continue

    # IDs are compared as integers when they are purely numeric, otherwise as text
    try:
        usafval = int(key['usaf'])
    except ValueError:
        usafval = key['usaf']

    try:
        wbanval = int(key['wban'])
    except ValueError:
        wbanval = key['wban']

    # Match on USAF *and* WBAN: a station is identified by the pair (statname
    # below is usaf+wban), and one USAF can appear with several WBANs, which
    # would otherwise inflate the year count with other stations' rows.
    inventdat = inventory.loc[(inventory['USAF']==usafval) & (inventory['WBAN']==wbanval)]
    statyears = inventdat.loc[(inventdat['YEAR'] >= ystart) & (inventdat['YEAR'] <= yend), 'YEAR']

    # cut on the number of distinct years of data within the analysis period
    if statyears.nunique() > 20:
        statuse.append(key)
        statname.append(key['usaf']+key['wban'])
        usaf.append(key['usaf'])
        wban.append(key['wban'])
        latstation.append(latflt)
        lonstation.append(lonflt)

Generate the "MM-DD" date strings for a non-leap year

In [9]:
# Every calendar day of a non-leap year (1979), formatted as zero-padded "MM-DD".
# These 365 strings are later prefixed with a year to form full "YYYY-MM-DD" dates.
datevals = pd.date_range(start="1979-01-01", end="1979-12-31")
datestrings = list(datevals.strftime("%m-%d"))

In [None]:
# Build the (station, time) temperature array: one row per selected station and
# 365 columns per year (Feb 29th is never stored). Every element starts as NaN
# and is only overwritten where a station file provides a value.
t2m = np.full([len(statname),nyears*365], nan)

for istat in range(len(statname)):
    print(str(istat)+':'+statname[istat])

    usafval = int(usaf[istat])
    wbanval = int(wban[istat])
    inventdat = inventory.loc[inventory['USAF']==usafval]
    statyears = inventdat.loc[(inventdat['YEAR'] >= ystart) & (inventdat['YEAR'] <= yend)]
    statyears = statyears['YEAR']
    # build the set of inventory years once per station rather than once per year
    yearswithdata = set(statyears.astype(int).tolist())

    for iyear in range(ystart,yend+1,1):
        # years that are absent from the inventory keep their NaN fill
        if iyear not in yearswithdata:
            continue

        yearinvent = inventdat.loc[inventdat['YEAR'].astype(int) == iyear]
        yearinvent = yearinvent.loc[yearinvent['WBAN']==wbanval]

        # Total of the twelve monthly inventory counts for this year, reduced to a
        # scalar so the test below is well defined when the inventory holds zero
        # rows (sum is 0) or more than one row for this station and year.
        sumobs = yearinvent[monstrings].to_numpy().sum()

        # only using the file if there's more than 365 obs going into the year
        # (a bit arbitrary)
        if not (sumobs > 365):
            continue

        fname=datpath+str(iyear)+"/"+statname[istat]+'.csv'

        try:
            data = pd.read_csv(fname)

            # Position (0..364) of each file date within this year's no-leap
            # calendar; get_indexer returns -1 for dates that are not in it,
            # which covers Feb 29th and any unexpected date.
            alldates = pd.Index([str(iyear)+'-'+i for i in datestrings])
            dayinds = alldates.get_indexer(data['DATE'])
            found = dayinds >= 0

            temps = np.array(pd.to_numeric(data['TEMP'], errors='coerce'), dtype=float)
            # The GSOD format documentation gives 9999.9 as the missing-value
            # code for TEMP; store it as NaN rather than as a temperature.
            temps[temps >= 9999.] = nan

            # assemble the year separately so a failure never leaves a
            # half-written year in t2m
            yearvals = np.full(365, nan)
            yearvals[dayinds[found]] = temps[found]
        except (OSError, KeyError, ValueError):
            # missing/unreadable file, missing DATE/TEMP column or unparsable
            # contents: best effort, the year stays NaN
            continue

        t2m[istat,(iyear-ystart)*365:(iyear-ystart+1)*365] = yearvals

0:01001099999
1:01004099999
2:01008099999
3:01010099999
4:01017099999
5:01023099999
6:01025099999
7:01028099999
8:01033099999
9:01035099999
10:01041099999
11:01043099999
12:01045099999
13:01046099999
14:01047099999
15:01049099999
16:01052099999
17:01055099999
18:01057099999
19:01059099999
20:01062099999
21:01065099999
22:01074099999
23:01078099999
24:01083099999
25:01088099999
26:01089099999
27:01092099999
28:01098099999
29:01102099999
30:01112099999
31:01115099999


convert to xarray data array and merge into a dataset

In [38]:
# Wrap the temperature array and the station coordinates as labelled xarray
# objects that share the 'station' dimension, then combine them into one dataset.
station_coord = {'station': statname}
t2mxr = xr.DataArray(
    t2m,
    name='t2m',
    dims=['station', 'time'],
    coords={'station': statname, 'time': timeout},
)
lon = xr.DataArray(lonstation, name='lon', dims=['station'], coords=station_coord)
lat = xr.DataArray(latstation, name='lat', dims=['station'], coords=station_coord)
stationdat = xr.merge([t2mxr, lon, lat])

In [39]:
# Write the merged station dataset to disk as netCDF.
# NOTE(review): this path differs from the `fileout` variable defined near the
# top of the file, which is never used — confirm which destination is intended.
netcdf_path = "/project/cas/islas/savs/python/ISD/T2M_ISD_1979_2014.nc"
stationdat.to_netcdf(path=netcdf_path)