# Cleaning the weather dataset

In [78]:
import pandas as pd

In [79]:
# Read the raw weather CSV into a DataFrame.
weather = pd.read_csv('../data/weather.csv')

In [80]:
# Show the column labels so each field can be checked in turn below.
weather.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [81]:
# (rows, columns) of the frame as loaded.
weather.shape

(2944, 22)

In [82]:
# Total number of NaN cells across the whole frame.
weather.isnull().values.sum()

0

## At least there aren't any true null values, but missing data is encoded as an 'M' in this dataframe

# Date

Every date is a ten-character string, so the format is probably consistent.

In [83]:
# Tally the character length of every Date string; a single bucket means uniform width.
weather['Date'].str.len().value_counts()

10    2944
Name: Date, dtype: int64

# Tmax, Tmin, Tavg, Depart

In [84]:
# Boolean mask of rows where Tmax holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
Tmax_missing = weather['Tmax'].eq('M')
Tmax_missing.sum()

0

In [85]:
# Boolean mask of rows where Tmin holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
Tmin_missing = weather['Tmin'].eq('M')
Tmin_missing.sum()

0

In [86]:
# Boolean mask of rows where Tavg holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
Tavg_missing = weather['Tavg'].eq('M')
Tavg_missing.sum()

11

A few rows are missing Tavg; we'll drop those rows.

In [87]:
# Keep only the rows whose Tavg is not the 'M' placeholder.
weather = weather.loc[~Tavg_missing]

In [88]:
# Boolean mask of rows where Depart holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
# NOTE(review): no later cell shown here removes or fills these values, and the
# head() preview before export still shows 'M' in Depart — confirm this is intended.
Depart_missing = weather['Depart'].eq('M')
Depart_missing.sum()

1461

# DewPoint, WetBulb

In [89]:
# Boolean mask of rows where DewPoint holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
DewPoint_missing = weather['DewPoint'].eq('M')
DewPoint_missing.sum()

0

In [90]:
# Boolean mask of rows where WetBulb holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
WetBulb_missing = weather['WetBulb'].eq('M')
WetBulb_missing.sum()

4

In [91]:
# Keep only the rows whose WetBulb is not the 'M' placeholder.
weather = weather.loc[~WetBulb_missing]

# Heat, Cool

In [92]:
# Boolean mask of rows where Heat holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
Heat_missing = weather['Heat'].eq('M')
Heat_missing.sum()

0

In [93]:
# Boolean mask of rows where Cool holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
Cool_missing = weather['Cool'].eq('M')
Cool_missing.sum()

0

# Sunrise, Sunset

In [94]:
# Boolean mask of rows where Sunrise holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
# NOTE(review): the head() preview before export shows '-' in Sunrise for Station 2
# rows, which this 'M' check does not count — confirm whether '-' also means missing.
Sunrise_missing = weather['Sunrise'].eq('M')
Sunrise_missing.sum()

0

In [95]:
# Boolean mask of rows where Sunset holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
# NOTE(review): the head() preview before export shows '-' in Sunset for Station 2
# rows, which this 'M' check does not count — confirm whether '-' also means missing.
Sunset_missing = weather['Sunset'].eq('M')
Sunset_missing.sum()

0

# CodeSum, Depth

In [96]:
# Boolean mask of rows where CodeSum holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
CodeSum_missing = weather['CodeSum'].eq('M')
CodeSum_missing.sum()

0

In [97]:
# Boolean mask of rows where Depth holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
Depth_missing = weather['Depth'].eq('M')
Depth_missing.sum()

1460

In [98]:
# Distribution of Depth values, to see what is recorded besides the 'M' marker.
weather['Depth'].value_counts()

0    1469
M    1460
Name: Depth, dtype: int64

In [99]:
# Remove the Depth column. The axis is given by keyword (`columns=`) because pandas >= 2.0
# rejects the positional form `drop('Depth', 1)`; rebinding avoids in-place mutation.
weather = weather.drop(columns='Depth')

# Water1, SnowFall, PrecipTotal

In [100]:
# Boolean mask of rows where Water1 holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
Water1_missing = weather['Water1'].eq('M')
Water1_missing.sum()

2929

In [101]:
# Distribution of Water1 values, to see what is recorded besides the 'M' marker.
weather['Water1'].value_counts()

M    2929
Name: Water1, dtype: int64

In [102]:
# Remove the Water1 column. The axis is given by keyword (`columns=`) because pandas >= 2.0
# rejects the positional form `drop('Water1', 1)`; rebinding avoids in-place mutation.
weather = weather.drop(columns='Water1')

In [103]:
# Boolean mask of rows where SnowFall holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
SnowFall_missing = weather['SnowFall'].eq('M')
SnowFall_missing.sum()

1460

In [104]:
# Distribution of SnowFall values, to see what is recorded besides the 'M' marker.
weather['SnowFall'].value_counts()

M      1460
0.0    1456
  T      12
0.1       1
Name: SnowFall, dtype: int64

In [105]:
# Remove the SnowFall column. The axis is given by keyword (`columns=`) because pandas >= 2.0
# rejects the positional form `drop('SnowFall', 1)`; rebinding avoids in-place mutation.
weather = weather.drop(columns='SnowFall')

In [106]:
# Boolean mask of rows where PrecipTotal holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
PrecipTotal_missing = weather['PrecipTotal'].eq('M')
PrecipTotal_missing.sum()

2

In [107]:
# Keep only the rows whose PrecipTotal is not the 'M' placeholder.
weather = weather.loc[~PrecipTotal_missing]

# StnPressure, SeaLevel, ResultSpeed, ResultDir, AvgSpeed


In [108]:
# Boolean mask of rows where StnPressure holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
StnPressure_missing = weather['StnPressure'].eq('M')
StnPressure_missing.sum()

2

In [109]:
# Boolean mask of rows where SeaLevel holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
SeaLevel_missing = weather['SeaLevel'].eq('M')
SeaLevel_missing.sum()

8

In [110]:
# Boolean mask of rows where ResultSpeed holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
ResultSpeed_missing = weather['ResultSpeed'].eq('M')
ResultSpeed_missing.sum()

0

In [111]:
# Boolean mask of rows where ResultDir holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
ResultDir_missing = weather['ResultDir'].eq('M')
ResultDir_missing.sum()

0

In [112]:
# Boolean mask of rows where AvgSpeed holds the 'M' missing-value marker, then its count.
# Vectorised comparison instead of a per-row Python lambda.
AvgSpeed_missing = weather['AvgSpeed'].eq('M')
AvgSpeed_missing.sum()

2

In [113]:
# Discard any row flagged as missing in StnPressure, SeaLevel or AvgSpeed.
weather = weather.loc[~(StnPressure_missing | SeaLevel_missing | AvgSpeed_missing)]

## Exporting to csv

In [114]:
# Preview the first rows of the cleaned frame before writing it out.
weather.head()

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,0448,1849,,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,-,-,,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,0447,1850,BR,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,-,-,BR HZ,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,0446,1851,,0.0,29.39,30.12,11.7,7,11.9


In [115]:
# Total number of NaN cells in the cleaned frame.
# NOTE(review): this counts real NaN only; the preview above still shows the 'M'
# placeholder in Depart and '-' in Sunrise/Sunset, which a zero here does not rule out.
weather.isnull().sum().sum()

0

In [116]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
# NOTE(review): columns are still written as loaded (no dtype conversion in this notebook),
# and placeholder strings seen in the preview ('M' in Depart, '-' in Sunrise/Sunset) are exported as-is.
weather.to_csv('../data/weather_cleaned.csv', index=False)