In [1]:
import pandas as pd

In [79]:
# Column names applied positionally to the input file (passed as `names=` to
# read_csv). Each "*_count" name labels the column that directly follows the
# measurement it is named after.
fields = [
    'STN', 'WBAN', 'YEARMODA',
    'TEMP', 'TEMP_count',
    'DEWP', 'DEWP_count',
    'SLP', 'SLP_count',
    'STP', 'STP_count',
    'VISIB', 'VISIB_count',
    'WDSP', 'WDSP_count',
    'MXSPD', 'GUST',
    'MAX', 'MIN',
    'PRCP', 'SNDP', 'FRSHTT',
]

In [162]:
# Per-column sentinel values that the file uses in place of a real reading;
# read_csv turns each of them into NaN.
missing_markers = {
    'TEMP': [9999.9],
    'DEWP': [9999.9],
    'SLP': [9999.9],
    'STP': [9999.9],
    'VISIB': [999.9],
    'WDSP': [999.9],
    'MXSPD': [999.9],
    'GUST': [999.9],
    'MAX': ['9999.9'],  # doesn't matter whether float or str
    'MIN': ['9999.9'],
    'PRCP': ['99.99'],
    'SNDP': [999.9],
}

df = pd.read_csv(
    'CDO7301306888149.txt',
    sep=r'\s+',                # fields are separated by runs of whitespace
    names=fields,              # use our own column names ...
    header=0,                  # ... and discard the file's header row
    parse_dates=['YEARMODA'],
    na_values=missing_markers,
)
df.head()

Unnamed: 0,STN,WBAN,YEARMODA,TEMP,TEMP_count,DEWP,DEWP_count,SLP,SLP_count,STP,...,VISIB_count,WDSP,WDSP_count,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT
0,722900,23188,1998-11-01,61.5,24,55.2,24,1016.9,24,1016.2,...,24,3.5,24,8.0,,66.9*,53.6*,0.00C,,100000
1,722900,23188,1998-11-02,61.2,24,53.8,24,1013.8,24,1012.9,...,24,4.6,24,12.0,,68.0,54.0,0.00D,,0
2,722900,23188,1998-11-03,61.2,24,54.5,24,1014.1,24,1013.2,...,24,4.5,24,13.0,,68.0*,54.0*,0.00C,,100000
3,722900,23188,1998-11-04,61.4,24,56.0,24,1014.9,24,1014.0,...,24,3.0,24,8.9,,68.0*,53.6*,0.00D,,100000
4,722900,23188,1998-11-05,61.2,24,55.6,24,1014.0,24,1013.1,...,24,5.6,24,13.0,,69.1,54.0,0.00D,,100000


In [163]:
def split_star_flag(column):
    """Split a raw MAX/MIN column into numeric readings and a '*' marker.

    Parameters
    ----------
    column : pandas.Series
        Raw entries. Each may be a string such as '66.9*' or '68.0', a float,
        or NaN.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        The readings converted to float, and a boolean Series that is True
        where the raw entry was a string containing '*'.
    """
    # Only strings can carry the marker. NaN (produced by na_values) and values
    # pandas already parsed as floats must bypass the substring test, because
    # `'*' in <float>` raises TypeError.
    values = column.map(lambda x: float(x.rstrip('*')) if isinstance(x, str) else float(x))
    flags = column.map(lambda x: isinstance(x, str) and '*' in x)
    return values, flags


# Work on a copy so the frame loaded by read_csv stays untouched.
flagged = df.copy()
flagged['MAX'], flagged['MAX_flag'] = split_star_flag(df['MAX'])
flagged['MIN'], flagged['MIN_flag'] = split_star_flag(df['MIN'])
flagged.head()

Unnamed: 0,STN,WBAN,YEARMODA,TEMP,TEMP_count,DEWP,DEWP_count,SLP,SLP_count,STP,...,WDSP_count,MXSPD,GUST,MAX,MIN,PRCP,SNDP,FRSHTT,MAX_flag,MIN_flag
0,722900,23188,1998-11-01,61.5,24,55.2,24,1016.9,24,1016.2,...,24,8.0,,66.9,53.6,0.00C,,100000,True,True
1,722900,23188,1998-11-02,61.2,24,53.8,24,1013.8,24,1012.9,...,24,12.0,,68.0,54.0,0.00D,,0,False,False
2,722900,23188,1998-11-03,61.2,24,54.5,24,1014.1,24,1013.2,...,24,13.0,,68.0,54.0,0.00C,,100000,True,True
3,722900,23188,1998-11-04,61.4,24,56.0,24,1014.9,24,1014.0,...,24,8.9,,68.0,53.6,0.00D,,100000,True,True
4,722900,23188,1998-11-05,61.2,24,55.6,24,1014.0,24,1013.1,...,24,13.0,,69.1,54.0,0.00D,,100000,False,False


In [169]:
def prcp_has_flag(x):
    """Return True when a raw PRCP entry is a string ending in a letter."""
    return isinstance(x, str) and x[-1:].isalpha()


# Numeric part: drop the trailing letter only when there is one, so an entry
# without a flag keeps its last digit. NaN and plain numbers pass through.
flagged['PRCP'] = df['PRCP'].map(lambda x: float(x[:-1]) if prcp_has_flag(x) else float(x))

# Flag part: the trailing letter, or NaN when the entry has none. NaN entries
# are skipped by get_dummies, so they produce no indicator column.
PRCP_flag = df['PRCP'].map(lambda x: x[-1] if prcp_has_flag(x) else float('nan'))
PRCP_dummies = pd.get_dummies(PRCP_flag).add_prefix('PRCP_')

In [173]:
# Attach the PRCP indicator columns to the cleaned frame; rows are matched on
# the index and every row of `flagged` is kept.
preprocessed = flagged.join(PRCP_dummies, how='left')
preprocessed.head()

Unnamed: 0,STN,WBAN,YEARMODA,TEMP,TEMP_count,DEWP,DEWP_count,SLP,SLP_count,STP,...,FRSHTT,MAX_flag,MIN_flag,PRCP_A,PRCP_B,PRCP_C,PRCP_D,PRCP_G,PRCP_H,PRCP_I
0,722900,23188,1998-11-01,61.5,24,55.2,24,1016.9,24,1016.2,...,100000,True,True,0,0,1,0,0,0,0
1,722900,23188,1998-11-02,61.2,24,53.8,24,1013.8,24,1012.9,...,0,False,False,0,0,0,1,0,0,0
2,722900,23188,1998-11-03,61.2,24,54.5,24,1014.1,24,1013.2,...,100000,True,True,0,0,1,0,0,0,0
3,722900,23188,1998-11-04,61.4,24,56.0,24,1014.9,24,1014.0,...,100000,True,True,0,0,0,1,0,0,0
4,722900,23188,1998-11-05,61.2,24,55.6,24,1014.0,24,1013.1,...,100000,False,False,0,0,0,1,0,0,0


In [172]:
# Convert YEARMODA with an explicit YYYYMMDD format and preview the result.
yearmoda = df['YEARMODA']
pd.to_datetime(yearmoda, format='%Y%m%d').head()

0   1998-11-01
1   1998-11-02
2   1998-11-03
3   1998-11-04
4   1998-11-05
Name: YEARMODA, dtype: datetime64[ns]

In [None]:
# Save the preprocessed table. Called without a path, to_csv() only returns the
# CSV text (nothing is written and the whole table is echoed as cell output),
# so give it a target file named after the input.
preprocessed.to_csv('CDO7301306888149_preprocessed.csv')

In [184]:
%%writefile preprocessing.py
"""
Usage:

Input file must be whitespace-delimited (fields separated by one or more spaces or tabs).

1. As a module

    from preprocessing import preprocess
    preprocess('input_file.txt')

2. As a command line tool

Specify the name of the preprocessed file

    python3 preprocessing.py input_file.txt preprocessed.csv
    
or derive the name from the input file
    
    python3 preprocessing.py input_file.txt
    (This generates input_file_preprocessed.csv)

"""

import pandas as pd

def _split_star_flag(column):
    """Split a raw MAX/MIN column into (float readings, boolean '*' marker)."""
    # Only strings can carry the marker. NaN (produced by na_values) and values
    # pandas already parsed as floats must bypass the substring test, because
    # `'*' in <float>` raises TypeError.
    values = column.map(lambda x: float(x.rstrip('*')) if isinstance(x, str) else float(x))
    flags = column.map(lambda x: isinstance(x, str) and '*' in x)
    return values, flags


def _prcp_has_flag(value):
    """Return True when a raw PRCP entry is a string ending in a letter."""
    return isinstance(value, str) and value[-1:].isalpha()


def preprocess(filename):
    """Preprocess NCDC weather data.

    Parameters
    ----------
    filename : str or path-like
        Whitespace-delimited input file with one header row. Anything that
        ``pandas.read_csv`` accepts as its first argument works.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with the columns listed in ``fields`` plus:

        * ``MAX_flag`` / ``MIN_flag`` -- True where the raw MAX / MIN entry
          carried a ``*``; the ``MAX`` / ``MIN`` columns hold the bare floats.
        * ``PRCP_<letter>`` -- one indicator column per trailing letter found
          in the raw PRCP entries; the ``PRCP`` column holds the bare floats.

        Sentinel values listed in ``na_values`` below become NaN.
    """

    fields = ['STN', 'WBAN', 'YEARMODA', 'TEMP', 'TEMP_count', 'DEWP', 'DEWP_count', 'SLP', 'SLP_count', 'STP', 'STP_count', 'VISIB', 'VISIB_count', 'WDSP', 'WDSP_count', 'MXSPD', 'GUST', 'MAX', 'MIN', 'PRCP', 'SNDP', 'FRSHTT']

    # Read the file the caller asked for (not a fixed path), replacing its
    # header row with `fields`.
    df = pd.read_csv(filename,
                     sep=r'\s+',
                     names=fields,
                     header=0,
                     parse_dates=['YEARMODA'],
                     na_values={'TEMP': [9999.9],
                                'DEWP': [9999.9],
                                'SLP': [9999.9],
                                'STP': [9999.9],
                                'VISIB': [999.9],
                                'WDSP': [999.9],
                                'MXSPD': [999.9],
                                'GUST': [999.9],
                                'MAX': ['9999.9'],  # doesn't matter whether float or str
                                'MIN': ['9999.9'],
                                'PRCP': ['99.99'],
                                'SNDP': [999.9]}
                     )

    # Work on a copy so the raw columns stay available while splitting.
    flagged = df.copy()
    flagged['MAX'], flagged['MAX_flag'] = _split_star_flag(df['MAX'])
    flagged['MIN'], flagged['MIN_flag'] = _split_star_flag(df['MIN'])

    # PRCP: drop the trailing letter only when there is one, so an entry
    # without a flag keeps its last digit; NaN and plain numbers pass through.
    flagged['PRCP'] = df['PRCP'].map(lambda x: float(x[:-1]) if _prcp_has_flag(x) else float(x))
    # Entries without a letter get NaN here, which get_dummies skips.
    PRCP_flag = df['PRCP'].map(lambda x: x[-1] if _prcp_has_flag(x) else float('nan'))
    PRCP_dummies = pd.get_dummies(PRCP_flag).add_prefix('PRCP_')
    preprocessed = flagged.join(PRCP_dummies)

    return preprocessed

if __name__ == '__main__':

    # Command-line entry point:
    #   python3 preprocessing.py input_file.txt preprocessed.csv
    #   python3 preprocessing.py input_file.txt   -> input_file_preprocessed.csv
    import os
    from sys import argv

    if len(argv) == 3:
        preprocessed = preprocess(argv[1])
        preprocessed.to_csv(argv[2])
    elif len(argv) == 2:
        preprocessed = preprocess(argv[1])
        # Strip only the final extension. Splitting on the first '.' breaks
        # paths such as './data/in.txt' (empty stem) or 'run.v2.txt'.
        stem = os.path.splitext(argv[1])[0]
        preprocessed.to_csv(stem + '_preprocessed.csv')
    else:
        raise Exception('Not correct number of arguments')

Overwriting preprocessing.py
