In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import re
import os
import codecs
import ftplib
import urllib
import tarfile
import gzip
import shutil
import datetime

# Weather Import Pipeline

### Data Cleaning Process:

-  Fill `NaN` in `PRCP` with 0 (i.e. assume `NaN` means "no rain")

-  Divide all temperatures (`TMIN`, `TAVG`, `TMAX`) by 10 (they are originally recorded in tenths of a degree C; after this step they are in degrees C)

-  For `NaN` values in `TMIN` and `TMAX`:
    - Compute the mean difference between `TAVG` and `TMIN` (and between `TMAX` and `TAVG`) over the rows where both values are present
    - Fill a missing `TMIN` with `TAVG` minus its mean difference, and a missing `TMAX` with `TAVG` plus its mean difference
    - Example:
    
Before:

TMIN | TAVG | TMAX
--- | --- | ---
NaN | 10 | NaN


```
avg_min_diff = 5
avg_max_diff = 3
```

After:

TMIN | TAVG | TMAX
--- | --- | ---
5 | 10 | 13

        

In [48]:
def import_transform(stations,yr):
    """Download one year of GHCN-Daily observations and pivot them per station/day.

    Parameters
    ----------
    stations : pandas.DataFrame
        Station table; only downloaded rows whose station code appears in its
        ``ID`` column are kept.
    yr : str or int
        Year of the ``by_year`` archive to fetch, e.g. ``'2019'``.

    Returns
    -------
    pandas.DataFrame
        One row per (``ID``, ``Date``) pair with one column per element code
        present in the file. ``Date`` is not parsed into datetimes here.
        Repeated readings for the same station/day/element are averaged
        (the ``pivot_table`` default).

    Notes
    -----
    Writes ``new<yr>.csv.gz`` and ``new<yr>.csv`` into the working directory
    and leaves them there; the caller is responsible for deleting them.
    """
    # A bare `import urllib` does not guarantee the `request` submodule is loaded.
    import urllib.request

    yr = str(yr)  # accept 2019 as well as '2019'
    archive_path = 'new'+yr+'.csv.gz'
    csv_path = 'new'+yr+'.csv'

    print('retrieving file...')
    urllib.request.urlretrieve('ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/'+yr+'.csv.gz',archive_path)
    print('unzipping...')
    with gzip.open(archive_path, 'rb') as f_in, open(csv_path, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    print('making dataframe...')
    # The file has no header row; the flag/time columns are never used, so skip them at read time.
    to_add = pd.read_csv(
        csv_path,
        names=["Station",'Date',"Element","Value",'Flag1','Flag2','Flag3','Time'],
        usecols=["Station",'Date',"Element","Value"],
    )
    print('selecting relevant data...')
    # Filter on the `stations` argument rather than on a notebook-level global.
    to_add = to_add.loc[to_add['Station'].isin(stations['ID'].values),:]
    print('reformatting...')
    piv = to_add.pivot_table(values='Value',columns='Element',index=['Station','Date']).reset_index()
    piv.columns.name = None
    piv = piv.rename({'Station':'ID'},axis='columns')
    print('Done!')
    return piv

def merge_clean(df,stations,country_dict):
    """Attach station metadata to pivoted observations and clean the measurements.

    Parameters
    ----------
    df : pandas.DataFrame
        Observations with an ``ID`` column, a ``Date`` column in ``YYYYMMDD``
        form, and any of ``TAVG``/``TMAX``/``TMIN``/``PRCP``. Not modified.
    stations : pandas.DataFrame
        Station table joined on ``ID``; must supply ``LAT``, ``LON`` and ``ELV``.
    country_dict : dict
        Replacements applied to the first two characters of ``ID`` to build the
        ``Country`` column; prefixes without an entry are kept unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns ``Country, ID, Date, LAT, LON, ELV, TAVG, TMAX, TMIN, PRCP``,
        sorted by ``Date`` then ``Country`` with a fresh integer index.
        Missing ``PRCP`` becomes 0, temperatures are divided by 10, and missing
        ``TMAX``/``TMIN`` are estimated from ``TAVG`` using the mean offset
        observed on rows where both values exist.
    """
    temp_cols = ['TAVG','TMAX','TMIN']
    measure_cols = temp_cols + ['PRCP']
    out_cols = ['Country','ID','Date','LAT','LON','ELV'] + measure_cols

    # assign() builds a new frame, so the caller's `df` keeps its raw dates.
    merged = (
        df.assign(Date=pd.to_datetime(df['Date'],format="%Y%m%d"))
          .merge(stations,how='left',on='ID')
    )

    # An element that no station reported yields no pivot column; add it as all-NaN
    # instead of failing on the column selection below.
    for col in measure_cols:
        if col not in merged.columns:
            merged[col] = np.nan

    # Assign the replaced Series back; an in-place replace on `frame[col]` is
    # chained assignment and is not written through under copy-on-write.
    merged['Country'] = merged['ID'].str.slice(stop=2).replace(to_replace=country_dict)

    merged = merged[out_cols].sort_values(by=['Date','Country']).reset_index(drop=True)

    merged['PRCP'] = merged['PRCP'].fillna(0)
    merged[temp_cols] = merged[temp_cols]/10

    # Series.mean skips NaN, so each offset uses only rows where both values exist.
    avgdiff_maxavg = (merged['TMAX']-merged['TAVG']).mean()
    avgdiff_avgmin = (merged['TAVG']-merged['TMIN']).mean()
    merged['TMAX'] = merged['TMAX'].fillna(merged['TAVG']+avgdiff_maxavg)
    merged['TMIN'] = merged['TMIN'].fillna(merged['TAVG']-avgdiff_avgmin)
    return merged

In [None]:
# Station table; the first CSV column is used as the index.
inbounds = pd.read_csv('inbounds.csv', index_col=0)

# Weather dataset accumulated by earlier runs.
current = pd.read_csv('all_weather.csv')

# Fixed-width country table; it must expose `ID` and `Name` columns.
countries = pd.read_fwf('ghcnd-countries.txt')

# Lookup from country code to country name.
rpl = {code: name for code, name in zip(countries['ID'], countries['Name'])}

In [None]:
# Fetch one year of observations, append only the dates that are not already
# stored, and rewrite the combined dataset.
year = '2019'
allnew = merge_clean(import_transform(inbounds,year),inbounds,rpl)

# Earlier saves wrote the row index as an extra column, which comes back as
# 'Unnamed: N' on load; drop those. Also parse the stored dates so they compare
# and serialise the same way as the freshly cleaned rows.
leftover_index = current.columns.astype(str).str.match(r'Unnamed: \d+')
current = current.loc[:, ~leftover_index].assign(Date=lambda d: pd.to_datetime(d['Date']))

# Keep only rows whose date is absent from the stored data. The comparison is
# against `current`; the previous name `test` is not defined in this notebook.
new_data = allnew.loc[~allnew['Date'].isin(current['Date']),:]

current = pd.concat([current,new_data],ignore_index=True)
# index=False so the next load does not pick up a stray index column.
current.to_csv('all_weather.csv',index=False)

# Remove the download artefacts left behind by import_transform.
os.remove('new'+year+'.csv.gz')
os.remove('new'+year+'.csv')