In [1]:
# keegan saunders
# cs334 FA2020
# HW4

import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
%matplotlib inline

In [2]:
# NOTE: Previous accuracy was 89%. However, I was accidentally using the precipitation feature.
#       After removing that feature, accuracy is 81.75%, so that is the baseline my 5% increase is measured against.

# I started by retrieving a little more data and increasing the number of features, notably taking advantage of
# weather types and wind direction. I believe the weather types account for different kinds of clouds in the sky.

# Data retrieved from https://www.ncdc.noaa.gov/cdo-web/search
# Read the CSV and discard the columns that are not used below, in one chained expression.
df = (
    pd.read_csv("BOI_WEATHER2.csv")
    .drop(columns=['STATION', 'TAVG', 'SNWD', 'WESD', 'WDF5', 'WSF2', 'WSF5'])
)
df.head()

Unnamed: 0,DATE,AWND,FMTM,PGTM,PRCP,SNOW,TMAX,TMIN,WDF2,WT01,...,WT07,WT08,WT09,WT11,WT13,WT16,WT17,WT18,WT19,WT22
0,2010-01-01,14.09,2137.0,2136.0,0.44,0.0,44,35,280.0,1.0,...,,,,1.0,1.0,1.0,,,,
1,2010-01-02,2.01,48.0,717.0,0.0,0.0,46,30,130.0,1.0,...,1.0,1.0,,,,,,,,
2,2010-01-03,3.58,1155.0,1153.0,0.0,0.0,40,22,150.0,1.0,...,,,,,1.0,,,,,1.0
3,2010-01-04,2.91,2057.0,1208.0,0.0,0.0,35,28,70.0,1.0,...,,,,,1.0,,,1.0,,
4,2010-01-05,1.57,1933.0,929.0,0.05,0.0,36,31,310.0,1.0,...,,,,,1.0,1.0,,1.0,1.0,


In [3]:
# Forward-fill gaps in these measurements with the previous row's value: wind direction of the
# 2-minute highest gust (WDF2), fastest-mile time (FMTM), AWND, and temperature max/min.
# Uses .ffill() because fillna(method='ffill') is deprecated in current pandas.
ffill_cols = ['WDF2', 'FMTM', 'AWND', 'TMAX', 'TMIN']
df[ffill_cols] = df[ffill_cols].ffill()

# For weather types, values were either 1.0 or NaN, so they are interpreted as boolean flags:
# a missing value is filled with 0.
im_lazy = ['WT01', 'WT02', 'WT03', 'WT04', 'WT05', 'WT06','WT07', 'WT08', 
           'WT09', 'WT11', 'WT13', 'WT16', 'WT17', 'WT18', 'WT19', 'WT22']
df[im_lazy] = df[im_lazy].fillna(0)
df.head()

Unnamed: 0,DATE,AWND,FMTM,PGTM,PRCP,SNOW,TMAX,TMIN,WDF2,WT01,...,WT07,WT08,WT09,WT11,WT13,WT16,WT17,WT18,WT19,WT22
0,2010-01-01,14.09,2137.0,2136.0,0.44,0.0,44,35,280.0,1.0,...,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
1,2010-01-02,2.01,48.0,717.0,0.0,0.0,46,30,130.0,1.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2010-01-03,3.58,1155.0,1153.0,0.0,0.0,40,22,150.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,2010-01-04,2.91,2057.0,1208.0,0.0,0.0,35,28,70.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2010-01-05,1.57,1933.0,929.0,0.05,0.0,36,31,310.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0


In [4]:
# Add the two derived columns as empty-string placeholders; they are populated in the next cell.
for placeholder_col in ("RAIN_BOOL", "WND_YESTERDAY"):
    df[placeholder_col] = ""

# The first row has no preceding row to copy a wind direction from, so seed it with 0.
df.loc[0, "WND_YESTERDAY"] = 0
df.head()

Unnamed: 0,DATE,AWND,FMTM,PGTM,PRCP,SNOW,TMAX,TMIN,WDF2,WT01,...,WT09,WT11,WT13,WT16,WT17,WT18,WT19,WT22,RAIN_BOOL,WND_YESTERDAY
0,2010-01-01,14.09,2137.0,2136.0,0.44,0.0,44,35,280.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,,0.0
1,2010-01-02,2.01,48.0,717.0,0.0,0.0,46,30,130.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,
2,2010-01-03,3.58,1155.0,1153.0,0.0,0.0,40,22,150.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,,
3,2010-01-04,2.91,2057.0,1208.0,0.0,0.0,35,28,70.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,,
4,2010-01-05,1.57,1933.0,929.0,0.05,0.0,36,31,310.0,1.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,


In [5]:
# Label: 1 if any precipitation was recorded that day, else 0. A missing PRCP also maps to 0,
# because NaN > 0 evaluates to False. Assigning the whole column at once replaces any
# placeholder values and gives the column an integer dtype.
df['RAIN_BOOL'] = (df['PRCP'] > 0.00).astype(int)

# Decided to add a new column that tracks what the direction of the wind was, during the 2 minute
# highest gust yesterday. Didn't end up making too much of a difference. Would probably be able to
# find average wind direction for a full day if I was using a different dataset.
# shift(1) moves every WDF2 value down one row; the first row has no previous day, so it gets 0.
# This is positional, so it does not depend on the index labels being 0..n-1.
df['WND_YESTERDAY'] = df['WDF2'].shift(1, fill_value=0)

In [6]:
# Preview the first rows to check the populated RAIN_BOOL and WND_YESTERDAY columns.
df.head()

Unnamed: 0,DATE,AWND,FMTM,PGTM,PRCP,SNOW,TMAX,TMIN,WDF2,WT01,...,WT09,WT11,WT13,WT16,WT17,WT18,WT19,WT22,RAIN_BOOL,WND_YESTERDAY
0,2010-01-01,14.09,2137.0,2136.0,0.44,0.0,44,35,280.0,1.0,...,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1,0
1,2010-01-02,2.01,48.0,717.0,0.0,0.0,46,30,130.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,280
2,2010-01-03,3.58,1155.0,1153.0,0.0,0.0,40,22,150.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0,130
3,2010-01-04,2.91,2057.0,1208.0,0.0,0.0,35,28,70.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,150
4,2010-01-05,1.57,1933.0,929.0,0.05,0.0,36,31,310.0,1.0,...,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1,70


In [7]:
# Columns used as model inputs, and the single target column.
# NOTE(review): PRCP was removed as a leak, but SNOW and some WT## flags may leak the label in the
# same way -- NOAA's GHCN-Daily documentation appears to define e.g. WT16 as rain and WT18 as snow.
# Confirm against the dataset documentation and consider dropping those columns.
features = ['AWND', 'FMTM', 'SNOW', 'TMAX', 'TMIN', 'WT01', 'WT02', 'WT03', 'WT04', 
            'WT05', 'WT06','WT07', 'WT08', 'WT09', 'WT11', 'WT13', 'WT16', 'WT17', 
            'WT18', 'WT19', 'WT22', 'WND_YESTERDAY']
label = ['RAIN_BOOL']

# Cast explicitly so the feature matrix is a float array even if a source column is object-dtype.
X = df[features].values.astype(float)
# label is a list, so Y has shape (n, 1); the cells below call .ravel() on it before fitting.
Y = df[label].values.astype('int')

# Fixed random_state so the 70/30 split, and the accuracy reported from it, is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

In [8]:
# Fit a logistic regression on the training split; the (n, 1) label array is flattened to 1-D first.
model_tts = LogisticRegression(max_iter=5000)
model_tts.fit(X_train, np.ravel(Y_train))

LogisticRegression(max_iter=5000)

In [10]:
# Accuracy on the held-out part of the plain train/test split.
tts_predictions = model_tts.predict(X_test)
print("Accuracy:", metrics.accuracy_score(Y_test, tts_predictions))

Accuracy: 0.8938879456706282


In [13]:
# Cross-validation code adapted from your Jupyter notebook.
# This gives a lower accuracy than the single split, but it is probably a more accurate reading.

model_skf = LogisticRegression(max_iter=5000)
skf = StratifiedKFold(n_splits=5)
results_skf = model_selection.cross_val_score(model_skf, X, np.ravel(Y), cv=skf)
mean_accuracy_pct = results_skf.mean() * 100.0
print(f"Accuracy: {mean_accuracy_pct:.2f}%")

Accuracy: 87.03%
