In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob

In [18]:
def fog_season(data_frame):
    plt.clf()
    month_list = range(1, 13)
    fog_count = []

    for m in month_list:
        fog_count.append(
            data_frame[(data_frame['wx'].str.contains('Fog')) & (data_frame.obs_time.month == m)].shape[0])

    station_name = data_frame.iloc[0].station
    plt.bar(month_list, fog_count)
    # plt.xticks(m)
    plt.title('{} Airport - Fog reports per month'.format(station_name))
    plt.savefig('{}_fog_seasonality.png'.format(station_name))
    plt.show()

In [19]:
for i in glob.iglob('./station_csvs/*csv'):
    df = pd.read_csv(i)
    print df.columns

Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u' wdir',
       u' wspeed', u' pressure'],
      dtype='object')
Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u' wdir',
       u' wspeed', u' pressure'],
      dtype='object')
Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u' wdir',
       u' wspeed', u' pressure'],
      dtype='object')
Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u' wdir',
       u' wspeed', u' pressure'],
      dtype='object')
Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u' wdir',
       u' wspeed', u' pressure'],
      dtype='object')
Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u' wdir',
       u' wspeed', u' pressure'],
      dtype='object')
Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u' wdir',
       u' wspeed', u' pressure'],
      dtype='object')
Index([u'station_id', u' obs_time', u' wx', u' rh', u' t', u' td', u'

In [3]:
def rearrange(data_frame):
    
    # replacing 'Fog', 'Partial Fog' etc occurrences for 1's, other for 0's
    wx_types = list(data_frame.presentwx.unique())
    fog_types = []

    for i in wx_types:
        if 'FG' in i:
            fog_types.append(i)
            wx_types.remove(i)

    data_frame.presentwx = data_frame.presentwx.replace(fog_types, 1)
    data_frame.presentwx = data_frame.presentwx.replace(wx_types, 0)
    
    # removing consecutive observations of fog
    del_indexes = []
    for index, row in data_frame[0:-1].iterrows():
        if row.presentwx == 1 and data_frame.iloc[index + 1].presentwx == 1:
            del_indexes.append(index)

    data_frame = data_frame.drop(data_frame.index[del_indexes])
    # placing features from X hours before in current line
    lead_hours = 6

    tmpf = (lead_hours+1)*['M']
    dwpf = (lead_hours+1)*['M']
    relh = (lead_hours+1)*['M']
    drct = (lead_hours+1)*['M']
    sknt = (lead_hours+1)*['M']
    alti = (lead_hours+1)*['M']

    for index, row in data_frame[lead_hours:-1].iterrows():
        valid_time = row['valid'] - pd.Timedelta(hours=lead_hours, minutes=row['valid'].minute)
        lead_row = data_frame.loc[data_frame['valid'] == valid_time]

        try:
            tmpf.append(lead_row['tmpf'].values[0])
        except:
            tmpf.append('M')

        try:
            dwpf.append(lead_row['dwpf'].values[0])
        except:
            dwpf.append('M')

        try:
            relh.append(lead_row['relh'].values[0])
        except:
            relh.append('M')

        try:
            drct.append(lead_row['drct'].values[0])
        except:
            drct.append('M')

        try:
            sknt.append(lead_row['sknt'].values[0])
        except:
            sknt.append('M')

        try:
            alti.append(lead_row['alti'].values[0])
        except:
            alti.append('M')

    data_frame['tmpf_{}h'.format(lead_hours)] = np.asarray(tmpf)
    data_frame['dwpf_{}h'.format(lead_hours)] = np.asarray(dwpf)
    data_frame['relh_{}h'.format(lead_hours)] = np.asarray(relh)
    data_frame['drct_{}h'.format(lead_hours)] = np.asarray(drct)
    data_frame['sknt_{}h'.format(lead_hours)] = np.asarray(sknt)
    data_frame['alti_{}h'.format(lead_hours)] = np.asarray(alti)

    # removing some columns, sending 'presentwx' to the right
    cols = data_frame.columns.tolist()
    for i in ['tmpf', 'dwpf', 'relh', 'drct', 'sknt', 'p01i', 'alti', 'vsby']:
        cols.remove(i)
    cols.append(cols.pop(cols.index('presentwx')))
    data_frame = data_frame[cols]
    
    # removing 'M' rows
    data_frame = data_frame.replace('M', np.nan)
    data_frame = data_frame.dropna(how='any')
    
    # to remove sparcicity, drop months outside Fog Season (april-september) or drop random 'no fog' rows
#     data_frame = data_frame.drop(data_frame[(data_frame['valid'].dt.month < 4)].index)
#     data_frame = data_frame.drop(data_frame[(data_frame['valid'].dt.month > 7)].index)
    
#     no_fog = data_frame[(data_frame['valid'].dt.month > 7)].index
#     no_fog = np.random.choice(no_fog,int(np.shape(no_fog)[0]*0.2),replace=False)

    data_frame = data_frame.drop(data_frame.query('presentwx == 0').sample(frac=.97).index)
    
    
    return data_frame

In [4]:
# column_names = ['station_id', 'obs_time', 'wx', 'rh', 't', 'td', 'wdir', 'wspeed', 'pressure']
column_names = ['station', 'valid', 'tmpf', 'dwpf', 'relh', 'drct', 'sknt', 'p01i', 'alti', 'mslp', 'vsby', 'gust',
                'skyc1', 'skyc2', 'skyc3', 'skyc4', 'skyl1', 'skyl2', 'skyl3', 'skyl4', 'presentwx', 'metar']

usecols = ['station', 'valid', 'tmpf', 'dwpf', 'relh', 'drct', 'sknt', 'p01i', 'alti', 'vsby', 'presentwx']

dataset = pd.read_csv('./SBPA.csv', names=column_names, skiprows=6, parse_dates=['valid'], usecols=usecols,
                 date_parser=lambda x: pd.datetime.strptime(x, '%Y-%m-%d %H:%M'), low_memory=False)

In [5]:
dataset.shape

(50941, 11)

In [6]:
dataset = rearrange(dataset)

In [7]:
print(dataset.shape)
np.histogram(dataset['valid'].dt.month,bins=[1,2,3,4,5,6,7,8,9,10,11])

(1612, 9)


(array([153, 102, 158, 134, 169, 162, 161, 120, 114, 209]),
 array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11]))

In [8]:
# Reading Features and Labels
X = dataset.iloc[:, 2:8].values
y = dataset.iloc[:, 8].values

In [27]:
# Generatin Training, Validation and Testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

In [28]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
# X_val = sc.transform(X_val)
X_test = sc.transform(X_test)



In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

# classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
# classifier = RandomForestClassifier(n_estimators=30, criterion='entropy', random_state=0)
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[264  11]
 [ 44   4]]


In [30]:
(264.+4.)/np.sum(cm)


0.8297213622291022

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

print(np.histogram(y_train,bins=[0,.5,1]))
print(np.histogram(y_val,bins=[0,.5,1]))
print(np.histogram(y_test,bins=[0,.5,1]))


(array([900, 131]), array([ 0. ,  0.5,  1. ]))
(array([221,  37]), array([ 0. ,  0.5,  1. ]))
(array([275,  48]), array([ 0. ,  0.5,  1. ]))


In [18]:
from keras.models import Sequential
from keras.layers import Dense, Dropout


classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(Dense(8,  activation='relu', input_dim=X_train.shape[1]))

# Adding hidden layers
classifier.add(Dense(16,  activation='relu'))
classifier.add(Dropout(0.3))

classifier.add(Dense(32,  activation='relu'))
classifier.add(Dropout(0.3))

classifier.add(Dense(64,  activation='relu'))
classifier.add(Dropout(0.3))

# Adding the output layer
classifier.add(Dense(1, activation='sigmoid', kernel_initializer="uniform"))


# Compiling the ANN
classifier.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])


In [19]:
# Training the classifier

from keras.callbacks import ModelCheckpoint  

### TODO: specify the number of epochs that you would like to use to train the model.

epochs = 20

checkpointer = ModelCheckpoint(filepath='weights1.hdf5', verbose=1, save_best_only=True)

classifier.fit(X_train, y_train, 
          validation_data=(X_val, y_val),
          epochs=epochs, batch_size=1, callbacks=[checkpointer], verbose=1)

Train on 1031 samples, validate on 258 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x11c21e5c0>

In [16]:
# Predicting the Test set results
classifier.load_weights('weights1.hdf5')
y_pred = classifier.predict(X_test)
y_pred = (y_pred > 0.5)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

print(cm)

[[275   0]
 [ 48   0]]


In [None]:
from sklearn.tree import DecisionTreeClassifier
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)