IN DEVELOPMENT

In [None]:
# import wget
from urllib.request import urlretrieve
import os
import tarfile
import glob
import pandas as pd

In [None]:
# where the data comes from and where it is stored locally

# remote files on the NOAA NCEI server
data_url='https://www.ncei.noaa.gov/pub/data/ghcn/v4/ghcnm.tavg.latest.qcf.tar.gz'  # compressed data archive
meta_url='https://www.ncei.noaa.gov/pub/data/ghcn/v4/readme.txt'  # description of the file formats

# local folder for the downloads; keep the trailing slash because
# file names are appended directly to this string in later cells
data_dir=r'C://Users/kerrie/Documents/02_LocalData/tutorials/GHCNm/'
# data_dir='/c/Users/kerrie/Documents/02_LocalData/tutorials/GHCNm/' # where to store data

# local file names = everything after the last '/' of each url
fn_data=data_url.rsplit('/',1)[-1]
fn_meta=meta_url.rsplit('/',1)[-1]

In [None]:
# download files

# urlretrieve does not create missing folders, so make sure the destination
# exists first (exist_ok=True makes this a no-op when it is already there)
os.makedirs(data_dir,exist_ok=True)

# fetch the readme (format description) and the compressed data archive
urlretrieve(meta_url,data_dir+fn_meta)
urlretrieve(data_url,data_dir+fn_data)

In [None]:
# unzip/untar the data file
# tarfile detects the gzip compression automatically when opening for reading
with tarfile.open(data_dir+fn_data) as archive:
    if hasattr(tarfile,'data_filter'):
        # Python versions with extraction filters: 'data' refuses members that
        # would be written outside data_dir (absolute paths, '..', bad links)
        # and avoids the DeprecationWarning raised when no filter is given
        archive.extractall(path=data_dir,filter='data')
    else:
        # older Python without extraction filters: plain extraction
        archive.extractall(path=data_dir)

In [None]:
# get the filenames of the unpacked data files
# glob returns matches in arbitrary order, so sort them to get a repeatable choice
datfiles=sorted(glob.glob(data_dir+'*/*.dat'))
invfiles=sorted(glob.glob(data_dir+'*/*.inv'))

# fail with a clear message instead of a bare IndexError when nothing was unpacked
if not datfiles or not invfiles:
    raise FileNotFoundError('No .dat/.inv files found under '+data_dir+' - download and extract the archive first')

# If the archive has been downloaded more than once there can be several extracted
# folders. Take the last one in sorted order for both files so they come from the
# same folder (presumably the newest release if the folder name embeds the date -
# verify against your download).
datfile=datfiles[-1]
invfile=invfiles[-1]

datfile,invfile

Open the readme.txt file that you've downloaded to see a description of how the data are arranged in the .dat and .inv data files.

The readme tells us that the station information is given in the .inv file where each line of the file is 68 characters long and contains the
station ID, lat, lon, elevation, and station name.

The readme also tells us that the .dat file contains the monthly data values for each station. A single line of the .dat file is 115 characters 
long and contains one year of data for a single station (station ID, year, element, 
Jan data value, Jan data flag1, Jan data flag2, Jan data flag3, Feb data value, Feb data flag1, ... , Dec data flag3).  

Because the data are stored as one continuous string on each line of the file, we will have to parse these strings so that we can work with the 
data as numerical values. You can imagine that it may be useful to parse these strings into a large table where each row is one line of the file (one station and year) and each 
column is a different field (station ID, year, data value, etc). The most useful Python package for this is pandas. Pandas will 
allow us to organize and easily query and manipulate tabular data in a data structure called a dataframe.  


In [None]:
# Layout of one line of the .dat file, needed to cut each text line into fields.
# The file has no header row and no separator (space/comma) between items, so both
# the column names and the character positions are taken from the readme.

# column names: three leading fields, then the same four fields repeated for
# each of the 12 months (VALUE1, DMFLAG1, QCFLAG1, DSFLAG1, VALUE2, ...)
colnames = ['ID','YEAR','ELEMENT'] + [
    f'{field}{month}'
    for month in range(1,13)
    for field in ('VALUE','DMFLAG','QCFLAG','DSFLAG')
]

# field boundaries, zero-based with an exclusive end (the readme counts from 1):
# field k spans line[splitcol[k]:splitcol[k+1]]
# ID starts at 0, YEAR at 11, ELEMENT at 15; the monthly groups start at 19 and are
# 8 characters wide: a 5-character value followed by three 1-character flags.
# The final 115 closes the last flag of December.
splitcol = [0,11,15] + [
    19 + 8*month + offset
    for month in range(12)
    for offset in (0,5,6,7)
] + [115]

# Peek at the raw text of the .dat file before parsing it.
# The file holds every station-year record and can be very large, so read only
# the first few lines rather than loading (and echoing) the whole file, which
# can use a lot of memory and freeze the notebook.
n_preview=5  # number of lines to show
with open(datfile) as f:
    # readline() returns '' at end of file, so a shorter file is handled too
    data=''.join(f.readline() for _ in range(n_preview))
data

# First we'll create an empty dataframe with all the appropriate columns (get the column names from the readme file)


# do an example of read one line to 51 col df
# do an example of read one line to 1 col of df then split



    # print(data)
    # exit


    # for line in f:
    #     data=[line[splitcol[i]:splitcol[i+1]] for i in range(len(splitcol)-1)]
    #     df = pd.concat([df,pd.DataFrame([data],columns=colnames)],ignore_index=True)




    # parse the string into the different data items (indexes from readme)
    # data=[line[0:11],int(line[11:15]),line[15:19],
    #     int(line[19:24]),line[24:25],line[25:26],line[26:27],
    #     int(line[27:32]),line[32:33],line[33:34],line[34:35],
    #     int(line[35:40]),line[40:41],line[41:42],line[42:43],
    #     int(line[43:48]),line[48:49],line[49:50],line[50:51],
    #     int(line[51:56]),line[56:57],line[57:58],line[58:59],
    #     int(line[59:64]),line[64:65],line[65:66],line[66:67],
    #     int(line[67:72]),line[72:73],line[73:74],line[74:75],
    #     int(line[75:80]),line[80:81],line[81:82],line[82:83],
    #     int(line[83:88]),line[88:89],line[89:90],line[90:91],
    #     int(line[91:96]),line[96:97],line[97:98],line[98:99],
    #     int(line[99:104]),line[104:105],line[105:106],line[106:107],
    #     int(line[107:112]),line[112:113],line[113:114],line[114:115]]
    # df=pd.DataFrame(data,columns=['line'])

# df                                                    

In [None]:
# print() writes the text held in `data` with its newline characters rendered as
# real line breaks, so each line read from the .dat file appears on its own row
print(data)

In [None]:
# this is advanced
# Cut every fixed-width line of the .dat file into its fields and load them all
# into a dataframe in a single step.

# one slice per field, made from each pair of neighbouring boundaries in splitcol
field_slices=[slice(start,stop) for start,stop in zip(splitcol,splitcol[1:])]

with open(datfile) as f:
    # `data` is a generator: it only describes how to split each line and does
    # no work (and holds no rows in memory) until something iterates over it
    data=([line[s] for s in field_slices] for line in f)

    # pandas consumes the generator here, while the file is still open; building
    # the dataframe this way avoids appending/concatenating one line at a time
    df=pd.DataFrame(data,columns=colnames)