In [25]:
# Dependencies
import pandas as pd

# Load the raw measurement and station tables from their CSV exports
hawaii_measurements_df, hawaii_stations_df = (
    pd.read_csv(csv_name)
    for csv_name in ("hawaii_measurements.csv", "hawaii_stations.csv")
)

# Preview the first five measurement rows
hawaii_measurements_df.head(5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [26]:
# Tally the non-null entries in each column (counting down the rows)
hawaii_measurements_df.count(axis="index")

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [27]:
# Count the missing (NaN) entries in each column to size the gap in prcp.
# Left as the cell's last expression so the notebook displays it directly,
# matching the other inspection cells, instead of routing it through print().
hawaii_measurements_df.isnull().sum()

station       0
date          0
prcp       1447
tobs          0
dtype: int64


In [28]:
# Frequency of each distinct prcp reading, most common first;
# missing (NaN) readings are left out of the tally
hawaii_measurements_df["prcp"].value_counts(sort=True, dropna=True)

0.00     8185
0.01     1198
0.02      966
0.03      707
0.04      483
0.05      466
0.06      375
0.08      312
0.07      308
0.10      297
0.09      230
0.12      217
0.11      179
0.13      179
0.14      169
0.15      154
0.16      139
0.20      119
0.17      113
0.19      112
0.18      106
0.22      104
0.30       88
0.23       88
0.21       82
0.25       79
0.26       72
0.24       66
0.29       60
0.27       60
         ... 
1.86        1
2.35        1
4.88        1
5.96        1
3.18        1
2.36        1
2.47        1
5.71        1
3.47        1
2.37        1
8.81        1
3.23        1
11.53       1
2.79        1
3.68        1
2.81        1
2.42        1
4.95        1
2.96        1
2.49        1
5.35        1
2.63        1
2.71        1
3.38        1
3.03        1
3.99        1
3.46        1
6.83        1
3.44        1
4.68        1
Name: prcp, Length: 342, dtype: int64

In [29]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
hawaii_measurements_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,prcp,tobs
count,18103.0,19550.0
mean,0.160644,73.097954
std,0.468746,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [30]:
# Candidate strategy: discard every row that contains a missing value,
# then renumber the index so it is contiguous again
drop_hawaii_measurements = (
    hawaii_measurements_df
    .dropna(axis="index", how="any")
    .reset_index(drop=True)
)
drop_hawaii_measurements.describe()

Unnamed: 0,prcp,tobs
count,18103.0,18103.0
mean,0.160644,72.994863
std,0.468746,4.512107
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.11,76.0
max,11.53,87.0


In [31]:
# Inspect the first ten rows left after dropping rows with missing values
drop_hawaii_measurements.head(n=10)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-07,0.06,70
5,USC00519397,2010-01-08,0.0,64
6,USC00519397,2010-01-09,0.0,68
7,USC00519397,2010-01-10,0.0,73
8,USC00519397,2010-01-11,0.01,64
9,USC00519397,2010-01-12,0.0,61


In [32]:
# Candidate strategy: replace missing precipitation readings with 0.
# The fill is restricted to the prcp column so that a missing value in any
# other column (station, date, tobs) is never silently turned into 0.
fill_hawaii_measurements = hawaii_measurements_df.fillna(value={"prcp": 0})
fill_hawaii_measurements.describe()

Unnamed: 0,prcp,tobs
count,19550.0,19550.0
mean,0.148753,73.097954
std,0.453021,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.01,73.0
75%,0.09,76.0
max,11.53,87.0


In [33]:
# Candidate strategy: replace missing precipitation readings with the column mean.
# The mean is computed from the data (NaN values are skipped by default) rather
# than pasted in as a rounded literal, so it stays correct if the input changes.
prcp_mean = hawaii_measurements_df["prcp"].mean()

# Fill only prcp; other columns must not receive a precipitation average.
mean_hawaii_measurements = hawaii_measurements_df.fillna(value={"prcp": prcp_mean})
mean_hawaii_measurements.describe()

Unnamed: 0,prcp,tobs
count,19550.0,19550.0
mean,0.160644,73.097954
std,0.451064,4.523527
min,0.0,53.0
25%,0.0,70.0
50%,0.02,73.0
75%,0.160644,76.0
max,11.53,87.0


In [34]:
# Preview the first five rows of the stations table
hawaii_stations_df.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [35]:
# Check the stations table for gaps: number of non-missing entries per column
hawaii_stations_df.notna().sum()

station      9
name         9
latitude     9
longitude    9
elevation    9
dtype: int64

In [36]:
# Decision: fill the missing prcp values with 0.
# 0 is by far the most common prcp reading (the mode), and the median of the
# recorded readings is 0.01 (not 0), so a zero fill stays close to the typical
# value. Dropping the rows instead would discard the 1,447 measurements whose
# tobs values are still needed for the temperature analysis.

# Guard the export: the cleaned measurements must not contain missing prcp values.
assert fill_hawaii_measurements["prcp"].notna().all(), "prcp still has missing values"

# Save the cleaned dataframes as csv files (without the pandas index column)
fill_hawaii_measurements.to_csv("clean_hawaii_measurements.csv", index=False)
hawaii_stations_df.to_csv("clean_hawaii_stations.csv", index=False)