# Cleaning the weather dataset

## Importing data and modules

In [1]:
import pandas as pd

# Load the raw weather observations; path is relative to the notebook's directory.
weather = pd.read_csv(filepath_or_buffer='../data/weather.csv')

In [2]:
# List the column labels so each one can be checked for missing-data markers below.
weather.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

## Checking for null values

In [3]:
# (rows, columns) of the raw frame, as a baseline before any rows are dropped.
weather.shape

(2944, 22)

In [4]:
# Total number of real NaN/None cells across the whole frame.
weather.isna().values.sum()

0

**There aren't any true null values, but according to the data dictionary, missing data is encoded as `'M'` in this dataset.**

# Date

Every `Date` value is a ten-character string (checked below), so there is probably nothing to fix here.

In [5]:
# Tally the string length of every Date value; a single bucket means a uniform format.
weather['Date'].str.len().value_counts()

10    2944
Name: Date, dtype: int64

# Tmax, Tmin, Tavg, Depart

In [6]:
# Boolean mask: True where Tmax holds the 'M' missing-data marker; the sum is the count.
Tmax_missing = weather['Tmax'].eq('M')
Tmax_missing.sum()

0

In [7]:
# Boolean mask: True where Tmin holds the 'M' missing-data marker; the sum is the count.
Tmin_missing = weather['Tmin'].eq('M')
Tmin_missing.sum()

0

In [8]:
# Boolean mask: True where Tavg holds the 'M' missing-data marker; the sum is the count.
Tavg_missing = weather['Tavg'].eq('M')
Tavg_missing.sum()

11

A few rows are missing Tavg; we'll drop those rows.

In [9]:
# Keep only the rows whose Tavg is not the 'M' marker.
weather = weather.loc[~Tavg_missing]

In [10]:
# Boolean mask: True where Depart holds the 'M' missing-data marker; the sum is the count.
Depart_missing = weather['Depart'].eq('M')
Depart_missing.sum()

1461

# DewPoint, WetBulb

In [11]:
# Boolean mask: True where DewPoint holds the 'M' missing-data marker; the sum is the count.
DewPoint_missing = weather['DewPoint'].eq('M')
DewPoint_missing.sum()

0

In [12]:
# Boolean mask: True where WetBulb holds the 'M' missing-data marker; the sum is the count.
WetBulb_missing = weather['WetBulb'].eq('M')
WetBulb_missing.sum()

4

In [13]:
# Keep only the rows whose WetBulb is not the 'M' marker.
weather = weather.loc[~WetBulb_missing]

# Heat, Cool

In [14]:
# Boolean mask: True where Heat holds the 'M' missing-data marker; the sum is the count.
Heat_missing = weather['Heat'].eq('M')
Heat_missing.sum()

0

In [15]:
# Boolean mask: True where Cool holds the 'M' missing-data marker; the sum is the count.
Cool_missing = weather['Cool'].eq('M')
Cool_missing.sum()

0

# Sunrise, Sunset

In [16]:
# Boolean mask: True where Sunrise holds the 'M' missing-data marker; the sum is the count.
# NOTE(review): only 'M' is tested here; the head() output later shows '-' entries in this column.
Sunrise_missing = weather['Sunrise'].eq('M')
Sunrise_missing.sum()

0

In [17]:
# Boolean mask: True where Sunset holds the 'M' missing-data marker; the sum is the count.
# NOTE(review): only 'M' is tested here; the head() output later shows '-' entries in this column.
Sunset_missing = weather['Sunset'].eq('M')
Sunset_missing.sum()

0

# CodeSum, Depth

In [18]:
# Boolean mask: True where CodeSum holds the 'M' missing-data marker; the sum is the count.
CodeSum_missing = weather['CodeSum'].eq('M')
CodeSum_missing.sum()

0

In [19]:
# Boolean mask: True where Depth holds the 'M' missing-data marker; the sum is the count.
Depth_missing = weather['Depth'].eq('M')
Depth_missing.sum()

1460

In [20]:
# Frequency of each distinct Depth value, to see what remains besides the 'M' marker.
weather.Depth.value_counts()

0    1469
M    1460
Name: Depth, dtype: int64

In [21]:
# Depth is either 0 or the 'M' marker (see the value counts), so remove the column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts,
# and rebinding replaces inplace=True so the frame is not mutated in place.
weather = weather.drop(columns='Depth')

# Water1, SnowFall, PrecipTotal

In [22]:
# Boolean mask: True where Water1 holds the 'M' missing-data marker; the sum is the count.
Water1_missing = weather['Water1'].eq('M')
Water1_missing.sum()

2929

In [23]:
# Frequency of each distinct Water1 value, to see what remains besides the 'M' marker.
weather.Water1.value_counts()

M    2929
Name: Water1, dtype: int64

In [24]:
# Every Water1 value is the 'M' marker (see the value counts), so remove the column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts,
# and rebinding replaces inplace=True so the frame is not mutated in place.
weather = weather.drop(columns='Water1')

In [25]:
# Boolean mask: True where SnowFall holds the 'M' missing-data marker; the sum is the count.
SnowFall_missing = weather['SnowFall'].eq('M')
SnowFall_missing.sum()

1460

In [26]:
# Frequency of each distinct SnowFall value, to see what remains besides the 'M' marker.
weather.SnowFall.value_counts()

M      1460
0.0    1456
  T      12
0.1       1
Name: SnowFall, dtype: int64

In [27]:
# SnowFall is almost entirely 'M' or 0.0 (see the value counts), so remove the column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts,
# and rebinding replaces inplace=True so the frame is not mutated in place.
weather = weather.drop(columns='SnowFall')

In [28]:
# Boolean mask: True where PrecipTotal holds the 'M' missing-data marker; the sum is the count.
PrecipTotal_missing = weather['PrecipTotal'].eq('M')
PrecipTotal_missing.sum()

2

In [29]:
# Keep only the rows whose PrecipTotal is not the 'M' marker.
weather = weather.loc[~PrecipTotal_missing]

# StnPressure, SeaLevel, ResultSpeed, ResultDir, AvgSpeed


In [30]:
# Boolean mask: True where StnPressure holds the 'M' missing-data marker; the sum is the count.
StnPressure_missing = weather['StnPressure'].eq('M')
StnPressure_missing.sum()

2

In [31]:
# Boolean mask: True where SeaLevel holds the 'M' missing-data marker; the sum is the count.
SeaLevel_missing = weather['SeaLevel'].eq('M')
SeaLevel_missing.sum()

8

In [32]:
# Boolean mask: True where ResultSpeed holds the 'M' missing-data marker; the sum is the count.
ResultSpeed_missing = weather['ResultSpeed'].eq('M')
ResultSpeed_missing.sum()

0

In [33]:
# Boolean mask: True where ResultDir holds the 'M' missing-data marker; the sum is the count.
ResultDir_missing = weather['ResultDir'].eq('M')
ResultDir_missing.sum()

0

In [34]:
# Boolean mask: True where AvgSpeed holds the 'M' missing-data marker; the sum is the count.
AvgSpeed_missing = weather['AvgSpeed'].eq('M')
AvgSpeed_missing.sum()

2

In [35]:
# Discard any row flagged as missing in StnPressure, SeaLevel or AvgSpeed.
weather = weather.loc[~(StnPressure_missing | SeaLevel_missing | AvgSpeed_missing)]

## Exporting to csv

In [36]:
# Preview the first five rows of the cleaned frame before exporting.
weather.head(5)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,CodeSum,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,0448,1849,,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,-,-,,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,0447,1850,BR,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,-,-,BR HZ,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,0446,1851,,0.0,29.39,30.12,11.7,7,11.9


In [37]:
# Total number of real NaN/None cells in the cleaned frame.
# NOTE(review): this does not count 'M' markers; the head() output above still shows 'M' in Depart.
weather.isna().values.sum()

0

In [38]:
# Write the cleaned frame next to the raw file, omitting the row index.
# NOTE(review): Depart still contains 'M' and Sunrise/Sunset still contain '-' (see head() output);
# those placeholders are exported unchanged.
weather.to_csv(path_or_buf='../data/weather_cleaned.csv', index=False)