In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn import preprocessing


import utility_functions as fn

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# Loads the raw export; the CSV column at position 1 becomes the row index.
raw_weather = pd.read_csv('weather_raw.csv', index_col=1)

# Parses the index labels into timestamps and gives the index an empty name.
raw_weather.index = pd.to_datetime(
    raw_weather.index,
    format='%Y-%m-%d %H:00:00 +0000 UTC',
)
raw_weather.index.name = ''

# Discards the unused columns, identified by their position among the remaining columns.
unused_positions = [0, 1, 2, 3, 4, 9, 10, 13, 15, 16, 17, 19, 20, 21, 23, 24, 25, 26]
raw_weather.drop(raw_weather.columns[unused_positions], axis=1, inplace=True)

# Labels the nine surviving columns, in the order they appear in the file.
column_names = ['temp', 'tmin', 'tmax', 'pressure', 'humidity', 'wind', 'rain', 'snow', 'clouds']
raw_weather.columns = column_names

# Moves 'clouds' ahead of the two precipitation columns.
raw_weather = raw_weather.loc[
    :,
    ['temp', 'tmin', 'tmax', 'pressure', 'humidity', 'wind', 'clouds', 'rain', 'snow'],
]

In [3]:
# Drops rows whose timestamp repeats an earlier row, keeping the first occurrence.
# Duplicates are detected on the index, not on the column values:
# DataFrame.drop_duplicates() ignores the index, so it would also discard
# observations from different hours that merely share identical readings,
# while leaving repeated timestamps with differing readings in place
# (and a non-unique index cannot be reindexed later on).
raw_weather = raw_weather[~raw_weather.index.duplicated(keep='first')]

In [4]:
# TEMPORARY: restricts the data to 2016-01-01 .. 2019-02-10 to bypass the missing 2014-2015 data
weather = raw_weather.truncate(
    before='2016-01-01 00:00:00',
    after='2019-02-10 00:00:00',
)

In [5]:
# Time elapsed between each raw observation and the one before it.
# The very first raw row has no predecessor, so its gap is NaT.
raw_gaps = raw_weather.index.to_series().diff()

# `weather` is a contiguous slice of `raw_weather`, so its gaps are picked out by
# position: start at the raw row matching weather's first timestamp and take exactly
# len(weather) entries. Bounding the slice by len(weather) keeps the lengths equal
# even if raw_weather holds rows after the end of the truncated window.
first_pos = raw_weather.index.get_loc(weather.index[0])
weather_gaps = raw_gaps.iloc[first_pos:first_pos + len(weather)]

# Adds a bool column as the first column: True when the gap to the previous
# observation is at most six hours, False when it is longer (NaT compares as False).
weather.insert(0, 'impute_ok', (weather_gaps <= pd.Timedelta(hours=6)).values)


In [6]:
# Builds the complete hourly index for the window (27265 timestamps) and reindexes
# onto it, so every hour absent from the data appears as a row of NaN.
# pd.date_range is used because the pd.DatetimeIndex(start=..., end=..., freq=...)
# constructor form was deprecated in pandas 0.24 and removed in pandas 1.0.
correct_dt = pd.date_range(start='2016-01-01 00:00:00', end='2019-02-10 00:00:00', freq='h')
weather = weather.reindex(index=correct_dt)

In [7]:
# Back-fills the flag into the rows added by the reindex: each missing hour takes the
# flag of the next real observation, i.e. True if the gap it sits in is at most
# six hours and False if it is longer. Rows after the last observation stay NaN.
# The result is assigned back instead of using inplace=True on the column selection,
# which is not guaranteed to modify the parent frame.
weather['impute_ok'] = weather['impute_ok'].bfill()

In [8]:
# removes weird wind outlier: readings at or above 20x the median wind become NaN
weather['wind'] = weather['wind'].where(weather['wind'] < 20 * weather['wind'].median())

# assigns NaN precip values to zero, this is an unconfirmed assumption
weather['rain'] = weather['rain'].fillna(0)
weather['snow'] = weather['snow'].fillna(0)

# removes negative clouds measurements
weather['clouds'] = weather['clouds'].clip(lower=0)

# sets nonsense zeroes in temp, tmin, tmax, pressure, and humidity to NaN.
# Columns are selected by label: 'impute_ok' now occupies position 0, so the
# positions 0-4 would hit the flag column and miss 'humidity'.
zero_is_missing = ['temp', 'tmin', 'tmax', 'pressure', 'humidity']
weather[zero_is_missing] = weather[zero_is_missing].replace(0, np.nan)

weather.describe()

# fn.snapshotplot(weather,'temp','2016-07-15 00:00:00','2016-07-25 00:00:00')

Unnamed: 0,temp,tmin,tmax,pressure,humidity,wind,clouds,rain,snow
count,25505.0,25505.0,25505.0,25505.0,25505.0,25504.0,25505.0,27265.0,27265.0
mean,284.515651,282.245787,286.604817,1016.829876,70.902333,2.978748,41.812429,0.069799,0.002332
std,10.090681,10.227527,10.14853,8.172056,19.657567,1.865644,40.306978,0.52976,0.061998
min,251.79,247.15,255.15,981.0,8.0,0.0,0.0,0.0,0.0
25%,276.57,274.15,278.15,1012.0,56.0,2.0,1.0,0.0,0.0
50%,284.52,282.15,286.45,1017.0,73.0,3.0,36.0,0.0,0.0
75%,293.17,291.15,295.15,1022.0,88.0,4.0,90.0,0.0,0.0
max,307.51,305.37,310.93,1044.0,100.0,15.0,100.0,28.19,4.0


In [9]:
# imputes only values that have impute_ok = True
# in the future, try linear vs quadratic here

# The flag column is left out of the interpolation; only the measurement columns are fitted.
value_columns = weather.columns.drop('impute_ok')
interpolated = weather[value_columns].interpolate(method='polynomial', order=7)

# Rows flagged True (gap of at most six hours) receive the polynomial estimate.
# Rows flagged False or NaN keep their NaN so the later large-gap imputation handles them.
short_gap = weather['impute_ok'].eq(True)
weather.loc[short_gap, value_columns] = interpolated.loc[short_gap, value_columns]

ValueError: Odd degree for now only. Got 6.

In [None]:
# Visual sanity check of the simple impute: draws 'temp' across the
# 2016-07-15 .. 2016-07-25 window, which spans a chunk of missing data.
fn.plot_feature(
    weather,
    'temp',
    '2016-07-15 00:00:00',
    '2016-07-25 00:00:00',
)

In [None]:
# fn.gap_check(weather)

In [None]:
# imputes large segments, ignores period for now
# to be replaced by FFT imputation in the future

# removes impute_ok column before interpolating: it is a flag, not a measurement, and
# interpolating object-dtype columns is deprecated in recent pandas releases.
# errors='ignore' lets this cell be re-run after the column is already gone.
weather = weather.drop(columns='impute_ok', errors='ignore')

# fills every remaining NaN by straight-line interpolation between neighbours
weather = weather.interpolate(method='linear')

fn.plot_feature(weather,'temp','2016-07-17 00:00:00','2016-07-23 00:00:00')

In [None]:
# Hour offsets 16, 17, ..., 24 are handed to fn.add_hours_before, which is expected to
# attach to each target hour columns holding the data from those earlier hours.
hours_before = np.arange(16, 24 + 1)
weather_with_previous = fn.add_hours_before(weather, hours_before)

# summary statistics of the widened frame
weather_with_previous.describe()

In [None]:
# Rescales every column to the [0, 1] range with sklearn's MinMaxScaler,
# then wraps the result in a frame carrying the original labels and index.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaled = pd.DataFrame(
    scaler.fit_transform(weather_with_previous.values),
    columns=weather_with_previous.columns,
    index=weather_with_previous.index,
)

scaled.describe()

In [None]:
# writes the scaled frame (index included) to disk
scaled.to_csv(path_or_buf='weather_clean.csv')

In [None]:
# USEFUL STUFF

# # returns index labels where nan values appear for a certain column
# nan_index = weather['temp'].index[weather['temp'].apply(np.isnan)]
# nan_index

# correct and complete datetime index for the date range considered
# dt = pd.date_range(start='2013-01-01 00:00:00',end='2019-02-10 00:00:00',freq='h')

# # renames (here, makes lowercase) column labels using a simple loop
# df.columns = [x.lower() for x in df.columns]

# # implements sklearn scaler
# from sklearn import preprocessing
# x = df.values #returns a numpy array
# scaler = preprocessing.MinMaxScaler()
# x_scaled = scaler.fit_transform(x)
# df = pd.DataFrame(x_scaled)