In [1]:
import pandas as pd
import numpy as np

In [2]:
# Base name (no extension) shared by the input CSV and the filtered output CSV.
filename = '72061600207'

In [181]:
# Columns kept for the rest of the notebook; every other column of the raw
# file is discarded right after loading.
columns = [
    'STATION',
    'DATE',
    'REPORT_TYPE',
    'HourlyDryBulbTemperature',
    'HourlyPrecipitation',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWindSpeed',
]
# Read "<filename>.csv" and narrow the frame to the columns listed above
# (in that order).
df = pd.read_csv(filename + '.csv')[columns]

In [182]:
# Keep only the rows whose REPORT_TYPE is 'FM-15' (METAR aviation routine
# weather report, i.e. the hourly observations). Once the filter is applied
# the REPORT_TYPE column is constant, so it is dropped.
df = df.loc[df['REPORT_TYPE'] == 'FM-15'].drop(columns=['REPORT_TYPE'])

In [183]:
def clean_up(df, column_name):
    """Turn a messy measurement column into a gap-free float column.

    The column may mix plain numbers, numbers followed by letters
    (e.g. '72s'), placeholders such as '*', letter-only markers and nulls.
    The cleaning steps are:

    1. Values that already parse as finite numbers are kept as they are.
    2. Remaining values have all ASCII letters removed and are parsed again
       ('72s' -> 72.0). Anything that still is not a number ('', '*',
       letter-only markers, nulls, non-finite values) is treated as missing.
    3. Each missing value is replaced by the mean of the nearest valid value
       before it and the nearest valid value after it.
    4. Missing values at the start / end of the column, which have only one
       valid neighbour, take that neighbour's value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. ``df[column_name]`` is overwritten with the
        cleaned values (callers may rely on either the in-place update or the
        return value).
    column_name : str
        Name of the column to clean.

    Returns
    -------
    pandas.Series
        The cleaned float column, i.e. ``df[column_name]`` after the update.
        If the column holds no valid number at all it stays entirely NaN.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    raw = df[column_name]

    # Pass 1: parse values as they are. Going through str() first would
    # mangle floats printed in scientific notation ('1e-05' -> '1-05').
    direct = pd.to_numeric(raw, errors='coerce').astype('float64').to_numpy()

    # Pass 2: strip letters and parse again. errors='coerce' maps every
    # leftover token ('', '*', ...) to NaN, so no callable replacement (which
    # newer pandas rejects for str.replace) is needed.
    stripped = raw.astype(str).str.replace(r'[A-Za-z]+', '', regex=True).str.strip()
    recovered = pd.to_numeric(stripped, errors='coerce').astype('float64').to_numpy()

    # Prefer the direct parse when it gave a finite number; NaN and +/-inf
    # fall back to the letter-stripped parse (which is NaN for them).
    values = pd.Series(
        np.where(np.isfinite(direct), direct, recovered),
        index=raw.index,
        name=column_name,
    )

    # Interior gaps: mean of the previous and the next valid observation.
    previous_valid = values.ffill()
    next_valid = values.bfill()
    averaged = (previous_valid + next_valid) / 2

    # Leading gaps have no previous value and trailing gaps no next value, so
    # the mean above is NaN there; copy the single available neighbour.
    df[column_name] = averaged.bfill().ffill()
    return df[column_name]

In [184]:
# Clean every hourly measurement column, one after the other, and store the
# result back under the same column name.
for column in (
    'HourlyDryBulbTemperature',
    'HourlyPrecipitation',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWindSpeed',
):
    df[column] = clean_up(df, column)

In [186]:
# Write the cleaned frame to "<filename>-filtered.csv"; the DataFrame index
# is not written to the file.
df.to_csv(path_or_buf=filename + '-filtered.csv', index=False)