In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import date, timedelta

In [2]:
# import data
df_weather = pd.read_csv('./assets/weather.csv')

In [3]:
df_weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


It is believed that hot and dry conditions are more favorable for West Nile virus than cold and wet. 

Dew point - the temperature to which air must be cooled to become saturated with water vapor

Wet-bulb - the temperature read by a thermometer covered in water-soaked cloth over which air is passed. At 100% relative humidity, the wet-bulb temperature is equal to the air temperature and it is lower at lower humidity.

In [4]:
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2944 entries, 0 to 2943
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Station      2944 non-null   int64  
 1   Date         2944 non-null   object 
 2   Tmax         2944 non-null   int64  
 3   Tmin         2944 non-null   int64  
 4   Tavg         2944 non-null   object 
 5   Depart       2944 non-null   object 
 6   DewPoint     2944 non-null   int64  
 7   WetBulb      2944 non-null   object 
 8   Heat         2944 non-null   object 
 9   Cool         2944 non-null   object 
 10  Sunrise      2944 non-null   object 
 11  Sunset       2944 non-null   object 
 12  CodeSum      2944 non-null   object 
 13  Depth        2944 non-null   object 
 14  Water1       2944 non-null   object 
 15  SnowFall     2944 non-null   object 
 16  PrecipTotal  2944 non-null   object 
 17  StnPressure  2944 non-null   object 
 18  SeaLevel     2944 non-null   object 
 19  Result

## Data Cleaning

### Change column names to lowercase

Change all column names to lowercase

Based on noaa_weather_qclcd_documentation.pdf

"T" means TRACE of precipation
"M" means Missing values
"-" means values not available


1. "T" 
Found in column PrecipTotal
Action - Will replace with zero

2. "M"
Found in columns AvgSpeed, Cool, Depart, Depth, Heat, Water1, Sealevel, SnowFall, StnPressure, Tavg, WaterBulb

3. "-"
Found in columns Sunrise and Sunset
Action - replace with NaN and use fillna with method="ffill"


codesum - split the codes and create dummmy


Remove Water1 since all its values are missing


In [5]:
# Check weather columns
df_weather.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [6]:
# change all column names to lowercase
df_weather.columns = [x.lower() for x in df_weather.columns]

In [7]:
# verify
df_weather.columns

Index(['station', 'date', 'tmax', 'tmin', 'tavg', 'depart', 'dewpoint',
       'wetbulb', 'heat', 'cool', 'sunrise', 'sunset', 'codesum', 'depth',
       'water1', 'snowfall', 'preciptotal', 'stnpressure', 'sealevel',
       'resultspeed', 'resultdir', 'avgspeed'],
      dtype='object')

In [8]:
# check values of water1
df_weather.water1.unique()

array(['M'], dtype=object)

In [9]:
# since all the values of water1 are missing, we will drop the column 
df_weather.drop(columns=['water1'],inplace=True)

In [10]:
# check "T" values 
df_weather.preciptotal.unique()

array(['0.00', '  T', '0.13', '0.02', '0.38', '0.60', '0.14', '0.07',
       '0.11', '0.09', '1.01', '0.28', '0.04', '0.08', '0.01', '0.53',
       '0.19', '0.21', '0.32', '0.39', '0.31', '0.42', '0.27', '0.16',
       '0.58', '0.93', '0.05', '0.34', '0.15', '0.35', 'M', '0.40',
       '0.66', '0.30', '0.24', '0.43', '1.55', '0.92', '0.89', '0.17',
       '0.03', '1.43', '0.97', '0.26', '1.31', '0.06', '0.46', '0.29',
       '0.23', '0.41', '0.45', '0.83', '1.33', '0.91', '0.48', '0.37',
       '0.88', '2.35', '1.96', '0.20', '0.25', '0.18', '0.67', '0.36',
       '0.33', '1.28', '0.74', '0.76', '0.71', '0.95', '1.46', '0.12',
       '0.52', '0.64', '0.22', '1.24', '0.72', '0.73', '0.65', '1.61',
       '1.22', '0.50', '1.05', '2.43', '0.59', '2.90', '2.68', '1.23',
       '0.62', '6.64', '3.07', '1.44', '1.75', '0.82', '0.80', '0.86',
       '0.63', '0.55', '1.03', '0.70', '1.73', '1.38', '0.44', '1.14',
       '1.07', '3.97', '0.87', '0.78', '1.12', '0.68', '0.10', '0.61',
       '0.

In [11]:
# Replace values of "T" with zero

df_weather = df_weather.replace('  T', 0)

In [12]:
# check "T" values 
df_weather.preciptotal.unique()

array(['0.00', 0, '0.13', '0.02', '0.38', '0.60', '0.14', '0.07', '0.11',
       '0.09', '1.01', '0.28', '0.04', '0.08', '0.01', '0.53', '0.19',
       '0.21', '0.32', '0.39', '0.31', '0.42', '0.27', '0.16', '0.58',
       '0.93', '0.05', '0.34', '0.15', '0.35', 'M', '0.40', '0.66',
       '0.30', '0.24', '0.43', '1.55', '0.92', '0.89', '0.17', '0.03',
       '1.43', '0.97', '0.26', '1.31', '0.06', '0.46', '0.29', '0.23',
       '0.41', '0.45', '0.83', '1.33', '0.91', '0.48', '0.37', '0.88',
       '2.35', '1.96', '0.20', '0.25', '0.18', '0.67', '0.36', '0.33',
       '1.28', '0.74', '0.76', '0.71', '0.95', '1.46', '0.12', '0.52',
       '0.64', '0.22', '1.24', '0.72', '0.73', '0.65', '1.61', '1.22',
       '0.50', '1.05', '2.43', '0.59', '2.90', '2.68', '1.23', '0.62',
       '6.64', '3.07', '1.44', '1.75', '0.82', '0.80', '0.86', '0.63',
       '0.55', '1.03', '0.70', '1.73', '1.38', '0.44', '1.14', '1.07',
       '3.97', '0.87', '0.78', '1.12', '0.68', '0.10', '0.61', '0.54',
      

#### Check where are the missing values "M"

In [13]:
df_weather[df_weather['avgspeed']=='M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
87,2,2007-06-13,86,68,77,M,53,62,0,12,...,-,,M,M,0.0,M,M,7.0,5,M
1745,2,2011-09-14,60,48,54,M,45,51,11,0,...,-,RA BR HZ FU,M,M,0.0,29.47,M,6.0,32,M
2067,2,2012-08-22,84,72,M,M,51,61,M,M,...,-,,M,M,0.0,29.39,M,4.7,19,M


In [14]:
df_weather[df_weather['preciptotal']=='M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
117,2,2007-06-28,73,61,67,M,56,61,0,2,...,-,,M,M,M,29.43,30.07,12.2,2,13.3
119,2,2007-06-29,71,56,64,M,56,60,1,0,...,-,,M,M,M,29.47,30.11,7.4,2,8.2


In [15]:
df_weather[df_weather['cool']=='M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
7,2,2007-05-04,78,51,M,M,42,50,M,M,...,-,,M,M,0.0,29.36,30.04,10.1,7,10.4
505,2,2008-07-08,86,46,M,M,68,71,M,M,...,-,TS RA,M,M,0.28,29.16,29.80,7.4,24,8.3
675,2,2008-10-01,62,46,M,M,41,47,M,M,...,-,,M,M,0.0,29.3,29.96,10.9,33,11.0
1637,2,2011-07-22,100,71,M,M,70,74,M,M,...,-,TS TSRA BR,M,M,0.14,29.23,29.86,3.8,10,8.2
2067,2,2012-08-22,84,72,M,M,51,61,M,M,...,-,,M,M,0.0,29.39,M,4.7,19,M
2211,2,2013-05-02,71,42,M,M,39,45,M,M,...,-,,M,M,0.0,29.51,30.17,15.8,2,16.1
2501,2,2013-09-24,91,52,M,M,48,54,M,M,...,-,,M,M,0.0,29.33,30.00,5.8,9,7.7
2511,2,2013-09-29,84,53,M,M,48,54,M,M,...,-,RA BR,M,M,0.22,29.36,30.01,6.3,36,7.8
2525,2,2013-10-06,76,48,M,M,44,50,M,M,...,-,RA DZ BR,M,M,0.06,29.1,29.76,10.1,25,10.6
2579,2,2014-05-02,80,47,M,M,43,47,M,M,...,-,RA,M,M,0.04,29.1,29.79,10.7,23,11.9


In [16]:
df_weather[df_weather['depart']=='M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,-,,M,M,0.00,29.18,29.82,2.7,25,9.6
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,-,BR HZ,M,M,0.00,29.44,30.08,13.3,2,13.4
5,2,2007-05-03,67,48,58,M,40,50,7,0,...,-,HZ,M,M,0.00,29.46,30.12,12.9,6,13.2
7,2,2007-05-04,78,51,M,M,42,50,M,M,...,-,,M,M,0.00,29.36,30.04,10.1,7,10.4
9,2,2007-05-05,66,54,60,M,39,50,5,0,...,-,,M,M,0,29.46,30.09,11.2,7,11.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935,2,2014-10-27,79,54,67,M,52,59,0,2,...,-,RA,M,M,0.02,29.00,29.67,12.7,19,13.6
2937,2,2014-10-28,66,48,57,M,40,48,8,0,...,-,RA,M,M,0.03,29.23,29.85,14.0,26,14.6
2939,2,2014-10-29,49,40,45,M,34,42,20,0,...,-,,M,M,0.00,29.42,30.07,8.5,29,9.0
2941,2,2014-10-30,53,37,45,M,35,42,20,0,...,-,RA,M,M,0,29.41,30.10,5.9,23,6.5


In [17]:
df_weather[df_weather['depth']=='M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,-,,M,M,0.00,29.18,29.82,2.7,25,9.6
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,-,BR HZ,M,M,0.00,29.44,30.08,13.3,2,13.4
5,2,2007-05-03,67,48,58,M,40,50,7,0,...,-,HZ,M,M,0.00,29.46,30.12,12.9,6,13.2
7,2,2007-05-04,78,51,M,M,42,50,M,M,...,-,,M,M,0.00,29.36,30.04,10.1,7,10.4
9,2,2007-05-05,66,54,60,M,39,50,5,0,...,-,,M,M,0,29.46,30.09,11.2,7,11.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2935,2,2014-10-27,79,54,67,M,52,59,0,2,...,-,RA,M,M,0.02,29.00,29.67,12.7,19,13.6
2937,2,2014-10-28,66,48,57,M,40,48,8,0,...,-,RA,M,M,0.03,29.23,29.85,14.0,26,14.6
2939,2,2014-10-29,49,40,45,M,34,42,20,0,...,-,,M,M,0.00,29.42,30.07,8.5,29,9.0
2941,2,2014-10-30,53,37,45,M,35,42,20,0,...,-,RA,M,M,0,29.41,30.10,5.9,23,6.5


In [18]:
df_weather[df_weather['heat']=='M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
7,2,2007-05-04,78,51,M,M,42,50,M,M,...,-,,M,M,0.0,29.36,30.04,10.1,7,10.4
505,2,2008-07-08,86,46,M,M,68,71,M,M,...,-,TS RA,M,M,0.28,29.16,29.80,7.4,24,8.3
675,2,2008-10-01,62,46,M,M,41,47,M,M,...,-,,M,M,0.0,29.3,29.96,10.9,33,11.0
1637,2,2011-07-22,100,71,M,M,70,74,M,M,...,-,TS TSRA BR,M,M,0.14,29.23,29.86,3.8,10,8.2
2067,2,2012-08-22,84,72,M,M,51,61,M,M,...,-,,M,M,0.0,29.39,M,4.7,19,M
2211,2,2013-05-02,71,42,M,M,39,45,M,M,...,-,,M,M,0.0,29.51,30.17,15.8,2,16.1
2501,2,2013-09-24,91,52,M,M,48,54,M,M,...,-,,M,M,0.0,29.33,30.00,5.8,9,7.7
2511,2,2013-09-29,84,53,M,M,48,54,M,M,...,-,RA BR,M,M,0.22,29.36,30.01,6.3,36,7.8
2525,2,2013-10-06,76,48,M,M,44,50,M,M,...,-,RA DZ BR,M,M,0.06,29.1,29.76,10.1,25,10.6
2579,2,2014-05-02,80,47,M,M,43,47,M,M,...,-,RA,M,M,0.04,29.1,29.79,10.7,23,11.9


In [19]:
df_weather[df_weather['sealevel']=='M']

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
87,2,2007-06-13,86,68,77,M,53,62,0,12,...,-,,M,M,0.0,M,M,7.0,5,M
832,1,2009-06-18,80,61,71,1,63,67,0,6,...,1929,RA BR,0,0.0,0.12,29.08,M,6.7,16,7.9
994,1,2009-09-07,77,59,68,1,59,62,0,3,...,1817,BR,0,0.0,0.0,29.39,M,5.8,3,4.0
1732,1,2011-09-08,75,57,66,0,53,59,0,1,...,1815,RA,0,0.0,0.0,29.34,M,13.0,2,13.4
1745,2,2011-09-14,60,48,54,M,45,51,11,0,...,-,RA BR HZ FU,M,M,0.0,29.47,M,6.0,32,M
1756,1,2011-09-20,74,49,62,0,54,58,3,0,...,1753,MIFG BCFG BR,0,0.0,0.0,29.26,M,7.3,18,7.3
2067,2,2012-08-22,84,72,M,M,51,61,M,M,...,-,,M,M,0.0,29.39,M,4.7,19,M
2090,1,2012-09-03,88,71,80,12,70,73,0,15,...,1824,BR,0,0.0,0.0,29.17,M,4.6,6,4.4
2743,2,2014-07-23,76,64,70,M,56,61,0,5,...,-,,M,M,0.0,29.47,M,16.4,2,16.7


In [20]:
df_weather[(df_weather.index>=832)].loc[:,['station','sealevel']]

Unnamed: 0,station,sealevel
832,1,M
833,2,29.79
834,1,29.70
835,2,29.68
836,1,29.76
...,...,...
2939,2,30.07
2940,1,30.09
2941,2,30.10
2942,1,30.20


In [22]:
df_weather.dtypes

station          int64
date            object
tmax             int64
tmin             int64
tavg            object
depart          object
dewpoint         int64
wetbulb         object
heat            object
cool            object
sunrise         object
sunset          object
codesum         object
depth           object
snowfall        object
preciptotal     object
stnpressure     object
sealevel        object
resultspeed    float64
resultdir        int64
avgspeed        object
dtype: object

#### Prepare to impute missing values

In [23]:
# Convert "M" and "-" to NaN
df_weather = df_weather.replace('M', np.NaN)
df_weather = df_weather.replace('-', np.NaN)

# check for null values
df_weather.isna().sum()

station           0
date              0
tmax              0
tmin              0
tavg             11
depart         1472
dewpoint          0
wetbulb           4
heat             11
cool             11
sunrise        1472
sunset         1472
codesum           0
depth          1472
snowfall       1472
preciptotal       2
stnpressure       4
sealevel          9
resultspeed       0
resultdir         0
avgspeed          3
dtype: int64

In [24]:
# Function to impute missing values
# if station 1 is nan but not station 2, get value from station 2
# if station 2 is nan but not station 2, get value from station 1
# if both station 1 and 2 for the same day are nan, get value from either previous/next day's observation

def fill_missing(df):
    row,col = df.shape
    for i in range(row):
        for j in range(col):
            if pd.isnull(df.iloc[i,j]):
                if df.iloc[i,0]==1:   
                    if pd.notnull(df.iloc[i+1,j]):
                        df.iloc[i,j]=df.iloc[i+1,j]    
                    else:
                        df.iloc[i,j]=df.iloc[i-1,j] 
                else:
                    if pd.notnull(df.iloc[i-1,j]):
                        df.iloc[i,j]=df.iloc[i-1,j]
                    else:
                        df.iloc[i,j]=df.iloc[i+1,j]
                

In [25]:
# execute the imputation of missing value function
fill_missing(df_weather)

In [26]:
# check the result
df_weather.isna().sum()

station        0
date           0
tmax           0
tmin           0
tavg           0
depart         0
dewpoint       0
wetbulb        0
heat           0
cool           0
sunrise        0
sunset         0
codesum        0
depth          0
snowfall       0
preciptotal    0
stnpressure    0
sealevel       0
resultspeed    0
resultdir      0
avgspeed       0
dtype: int64

#### The above result shows there is no more missing values

In [27]:
df_weather

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,sunset,codesum,depth,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,1849,,0,0.0,0.00,29.10,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,14,51,57,0,3,...,1849,,0,0.0,0.00,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,1850,BR,0,0.0,0.00,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,-3,42,47,13,0,...,1850,BR HZ,0,0.0,0.00,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,1851,,0,0.0,0.00,29.39,30.12,11.7,7,11.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,2,2014-10-29,49,40,45,-4,34,42,20,0,...,1650,,0,0.0,0.00,29.42,30.07,8.5,29,9.0
2940,1,2014-10-30,51,32,42,-4,34,40,23,0,...,1649,,0,0.0,0.00,29.34,30.09,5.1,24,5.5
2941,2,2014-10-30,53,37,45,-4,35,42,20,0,...,1649,RA,0,0.0,0,29.41,30.10,5.9,23,6.5
2942,1,2014-10-31,47,33,40,-6,25,33,25,0,...,1647,RA SN,0,0.1,0.03,29.49,30.20,22.6,34,22.9


In [28]:
# convert date field to date
df_weather['date'] =  pd.to_datetime(df_weather['date'])

In [29]:
df_weather.dtypes

station                 int64
date           datetime64[ns]
tmax                    int64
tmin                    int64
tavg                   object
depart                 object
dewpoint                int64
wetbulb                object
heat                   object
cool                   object
sunrise                object
sunset                 object
codesum                object
depth                  object
snowfall               object
preciptotal            object
stnpressure            object
sealevel               object
resultspeed           float64
resultdir               int64
avgspeed               object
dtype: object

#### Codesum contains a list of weather types in text (categorical variables). We need to convert them to dummy variables

In [30]:
# convert values to lowercase so that column names created will be lowercase
df_weather['codesum'] = df_weather['codesum'].str.lower()

In [31]:
# Create an empty list as a globale variable
code_sum_list = []

# Define a function to be used in the subsequent code block
def get_code_sum_elems(code_sum_string):
    global code_sum_list # use the global list
    
    # split codesum into weather types and add them to code_sum_list
    code_sum_list.extend(code_sum_string.split()) 
    return True

# Step 1 : get codesum from each observation
# Step 2 : split codesum into indivisual weather type
# Step 3 : add the the weather types to code_sum_list
for i in df_weather['codesum']:
    get_code_sum_elems(i.strip())   # strip the text 

# Now code_sum_list has collected all weather types, many of them are duplicates
# So use set to extract unique weather types
code_sum_set = set(code_sum_list)  

# Now create the dummy variables, one for each weather type (16 altogther)
for i in code_sum_set:
    df_weather[i] = 0

for index, row in df_weather.iterrows():
    cs_list = row['codesum'].split() # split the codesum string
    
    for j in cs_list:
        #set corresponding column to 1
        if j in code_sum_set:
            df_weather.loc[index,j]=1

# Npw the dummy variables have been created. We can drop the 'codesum' column
df_weather.drop(labels='codesum',axis=1,inplace=True)

In [32]:
df_weather.head()

Unnamed: 0,station,date,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,...,br,fg,fg+,tsra,hz,sq,dz,vcfg,mifg,sn
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,0,0,0,0,0,0,0,0,0,0
1,2,2007-05-01,84,52,68,14,51,57,0,3,...,0,0,0,0,0,0,0,0,0,0
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,1,0,0,0,0,0,0,0,0,0
3,2,2007-05-02,60,43,52,-3,42,47,13,0,...,1,0,0,0,1,0,0,0,0,0
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# Save cleaned data
df_weather.to_csv('./assets/data_clean/weather_clean.csv', index=False)

In [34]:
# Read cleaned data to verify
df_weather = pd.read_csv('./assets/data_clean/weather_clean.csv')

Plot
- precipitation
- wind speed
- temperature



heatmap


date/time field

how effective is the spray?
using plot to visualize? 
correlation
what cause WMV? precipitation, temperature, rain

#### convert date to date datatype and set it as index

In [35]:
df_weather['date'] =  pd.to_datetime(df_weather['date'])

df_weather.set_index('date',inplace=True)