In [4]:
import gzip
import os
import shutil
from zipfile import ZipFile

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [None]:
# One-time setup, currently disabled by wrapping it in a string literal:
# extract every member of data/noaa.zip into data/noaa/.
"""
with ZipFile("data/noaa.zip", 'r') as zObject:
  
    # Extracting all the members of the zip 
    # into a specific location.
    zObject.extractall(path="data/noaa/")
"""

In [29]:
# Verbosity flag. NOTE(review): nothing in the visible cells reads it -- confirm it is still needed.
verbose = False

# process a single .op.gz file
def process_opgz (opgz_path):
    """Parse one gzip-compressed station file into a DataFrame.

    The first line of the file (the header) is skipped. Every remaining line
    is cut into ten fields at fixed character positions. The file is read in
    binary mode, so each cell holds the raw ``bytes`` slice, padding included.

    Parameters
    ----------
    opgz_path : str
        Path of the ``.op.gz`` file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line with the columns STN, YEAR, MODA, TEMP, DEWP,
        SLP, VISIB, WDSP, PRCP and SNDP. A file without data lines yields an
        empty frame that still carries these columns.
    """
    # Column name paired with its fixed-width slice (0-based, end-exclusive).
    field_layout = [
        ('STN', slice(None, 6)),
        ('YEAR', slice(14, 18)),
        ('MODA', slice(18, 22)),
        ('TEMP', slice(24, 30)),
        ('DEWP', slice(35, 41)),
        ('SLP', slice(46, 52)),
        ('VISIB', slice(68, 73)),
        ('WDSP', slice(78, 83)),
        ('PRCP', slice(118, 123)),
        ('SNDP', slice(125, 130)),
    ]

    with gzip.open(opgz_path, 'rb') as gz_handle:
        # Drop the header line, keep everything after it.
        data_lines = gz_handle.readlines()[1:]

    rows = [[raw_line[span] for _, span in field_layout]
            for raw_line in data_lines]
    header = [name for name, _ in field_layout]
    return pd.DataFrame(rows, columns=header)

# process a single year, i.e., .tar file
import tarfile
from tqdm import tqdm
from IPython.display import clear_output

   
def process_tar (tar_path):
    """Extract one yearly tar archive and combine all of its station files.

    The archive is unpacked into ``./temp``, every extracted file is parsed
    with ``process_opgz``, the non-empty station frames are concatenated, and
    ``./temp`` is removed again -- also when processing fails or is
    interrupted.

    Parameters
    ----------
    tar_path : str
        Path of the yearly ``.tar`` archive holding the ``.op.gz`` files.

    Returns
    -------
    pandas.DataFrame
        All station rows of the archive with the columns STN, YEAR, MODA,
        TEMP, DEWP, SLP, VISIB, WDSP, PRCP and SNDP. When no station file
        contains data, an empty frame with these columns is returned.
    """
    columns = ['STN', 'YEAR', 'MODA',
               'TEMP', 'DEWP', 'SLP',
               'VISIB', 'WDSP', 'PRCP', 'SNDP']
    temp_dir = './temp'

    clear_output(wait=True)
    print("Processing year data from file %s.." % tar_path)

    # Start from a clean directory: files left behind by an earlier, aborted
    # run would otherwise be parsed as if they belonged to this archive.
    shutil.rmtree(temp_dir, ignore_errors=True)
    try:
        # extract the tarfile
        print(' - extracting tarfile.. ', end='', flush=True)
        with tarfile.open(tar_path) as tar:
            # NOTE(review): extractall trusts the member paths stored in the
            # archive; only use it on archives from a trusted source.
            tar.extractall(path=temp_dir)
        print('done.', flush=True)

        # process all the op.gz files
        print(" - processing .op.gz files.. ", flush=True)
        station_dfs = []
        for station_file in tqdm(sorted(os.listdir(temp_dir))):
            station_df = process_opgz(os.path.join(temp_dir, station_file))
            if station_df.shape[0] > 0:
                station_dfs.append(station_df)

        # pd.concat raises ValueError on an empty list, so fall back to an
        # empty frame that still has the expected columns.
        if station_dfs:
            year_df = pd.concat(station_dfs)
        else:
            year_df = pd.DataFrame(columns=columns)
        print('    done.', flush=True)
    finally:
        # Runs on success, on errors and on KeyboardInterrupt alike.
        print(" - removing temporary files.. ", end='', flush=True)
        shutil.rmtree(temp_dir, ignore_errors=True)
        print('done.', flush=True)
    return year_df

In [30]:
# Remove a leftover ./temp directory (e.g. after an interrupted run).
# Does nothing when the directory does not exist.
shutil.rmtree('./temp', ignore_errors=True)

In [36]:
# Yearly archives to process, one tar file per year from 2007 through 2012.
# Built from the year range so that no year can be listed twice or left out.
year_files = ["data/noaa/gsod_all_years/gsod_%d.tar" % year
              for year in range(2007, 2013)]

In [32]:
# Debug override: uncomment to process only the 2007 archive.
#year_files = ["data/noaa/gsod_all_years/gsod_2007.tar"]

In [37]:
from tqdm import tqdm
import warnings

# Suppress every warning from here on (the filter is installed globally).
warnings.filterwarnings('ignore')

# Parse each yearly archive in list order, then stack the per-year frames;
# row indices are kept exactly as process_tar returns them.
year_dfs = [process_tar(year_file) for year_file in year_files]
df_weather = pd.concat(year_dfs)


Processing year data from file data/noaa/gsod_all_years/gsod_2008.tar..
 - extracting tarfile.. done.
 - processing .op.gz files.. 



  0%|                                                                                                                                                             | 0/10725 [00:00<?, ?it/s][A
  1%|█▍                                                                                                                                               | 106/10725 [00:00<00:10, 1056.08it/s][A
  2%|███                                                                                                                                              | 224/10725 [00:00<00:09, 1126.11it/s][A
  3%|████▌                                                                                                                                            | 337/10725 [00:00<00:09, 1107.60it/s][A
  4%|██████                                                                                                                                           | 448/10725 [00:00<00:09, 1107.52it/s][A
  5%|███████▌                          

KeyboardInterrupt: 

In [None]:
import pickle as pkl

# Process one year at a time and pickle each result to its own file.
# Both the archive path and the output name are derived from the same year,
# so the data stored in a file always matches the year in its name.
for year in range(2007, 2013):
    df = process_tar("data/noaa/gsod_all_years/gsod_%d.tar" % year)
    with open('processed_data_%d.pkl' % year, 'wb') as f:
        pkl.dump(df, f)