# Step 1: Data Engineering
Inspect the content of the data and perform cleaning tasks.


In [1]:
import pandas as pd
import os

In [2]:
# Resolve both input CSV paths first, then load each file into its own DataFrame.
hawaiim = os.path.join('hawaii_measurements.csv')
hawaiis = os.path.join('hawaii_stations.csv')

# Measurements file.
hawaiim_df = pd.read_csv(hawaiim, encoding="utf-8")
# Stations file.
hawaiis_df = pd.read_csv(hawaiis, encoding="utf-8")

In [3]:
# Preview the first five rows of the measurements data.
hawaiim_df.head(n=5)

Unnamed: 0,station,date,prcp,tobs
0,USC00519397,2010-01-01,0.08,65
1,USC00519397,2010-01-02,0.0,63
2,USC00519397,2010-01-03,0.0,74
3,USC00519397,2010-01-04,0.0,76
4,USC00519397,2010-01-06,,73


In [4]:
# The prcp column contains NaN values.
# Count the non-null entries in every column to see how many rows are
# affected, and then decide whether or not to drop those rows.
hawaiim_df.count(axis=0)

station    19550
date       19550
prcp       18103
tobs       19550
dtype: int64

In [5]:
# 19550 - 18103 = 1447 rows hold a NaN value in hawaiim_df['prcp'].
# That is a large share of the data (7.4% of all rows), but I prefer to
# drop these rows rather than replace the NaN values with 0, because I
# do not know whether 0.00 is a valid assumption.

# Remove every row that has at least one missing value.
hawaiim_df = hawaiim_df.dropna(axis=0, how="any")

# Re-count the non-null entries: every column should now report the same total.
hawaiim_df.count()

station    18103
date       18103
prcp       18103
tobs       18103
dtype: int64

In [6]:
# Inspect the data type pandas assigned to each column of the measurements data.
hawaiim_df.dtypes

station     object
date        object
prcp       float64
tobs         int64
dtype: object

In [7]:
# The data types look good, except that hawaiim_df['date'] is stored as object
# rather than as a date type. I prefer to leave hawaiim_df['date'] as is; I will
# convert it to a real date format when a database is created from this data.

In [8]:
# Preview the first five rows of the stations data.
hawaiis_df.head(n=5)

Unnamed: 0,station,name,latitude,longitude,elevation
0,USC00519397,"WAIKIKI 717.2, HI US",21.2716,-157.8168,3.0
1,USC00513117,"KANEOHE 838.1, HI US",21.4234,-157.8015,14.6
2,USC00514830,"KUALOA RANCH HEADQUARTERS 886.9, HI US",21.5213,-157.8374,7.0
3,USC00517948,"PEARL CITY, HI US",21.3934,-157.9751,11.9
4,USC00518838,"UPPER WAHIAWA 874.3, HI US",21.4992,-158.0111,306.6


In [9]:
# The stations data is very small and looks complete.
# Count the non-null entries per column to confirm that nothing is missing.
hawaiis_df.count(axis=0)

station      9
name         9
latitude     9
longitude    9
elevation    9
dtype: int64

In [10]:
# Inspect the data type pandas assigned to each column of the stations data.
hawaiis_df.dtypes

station       object
name          object
latitude     float64
longitude    float64
elevation    float64
dtype: object

In [11]:
# Write the cleaned measurements to a new CSV file.
# index=False keeps the DataFrame index out of the file; the header row is kept.
hawaiim_df.to_csv(
    "clean_hawaii_measurements.csv",
    index=False,
    header=True,
    encoding="utf-8",
)

In [12]:
# Write the stations data to a new CSV file.
# index=False keeps the DataFrame index out of the file; the header row is kept.
hawaiis_df.to_csv(
    "clean_hawaii_stations.csv",
    index=False,
    header=True,
    encoding="utf-8",
)