In [1]:
# ----------------------------------------------------------------------
# Step 1: Import all necessary modules for data import and cleaning
# ----------------------------------------------------------------------
import pandas as pd
import os

In [2]:
# ----------------------------------------------------------------------
# Step 2: Load the raw CSV files that the following cells clean
# ----------------------------------------------------------------------
# Station metadata (noted by the author as having no missing data).
stations_csv_path = os.path.join('Resources', 'hawaii_stations.csv')
hawaii_stations_df = pd.read_csv(stations_csv_path)

# Per-station measurements.
measurements_csv_path = os.path.join('Resources', 'hawaii_measurements.csv')
hawaii_measurements_df = pd.read_csv(measurements_csv_path)

In [3]:
# ----------------------------------------------------------------------
# Step 3: Review and clean data, then export cleaned data into CSV
# ----------------------------------------------------------------------
# Step 3.1: Clean measurement data
#
# The cleaning is expressed as one chain that rebinds the same name:
#   1. dropna     - meant to discard null precipitation readings (they can't
#                   reasonably be assumed to be 0). No `subset` is given, so a
#                   row is dropped when ANY of its columns is null.
#   2. reset_index - moves the pre-drop row labels into an 'index' column and
#                   installs a fresh 0..n-1 index.
#   3. rename     - 'index' becomes 'id' (so 'id' keeps the original row
#                   labels, gaps included); 'prcp' and 'tobs' get long names.
#   4. assign     - precipitation is rounded to 2 decimal places.
hawaii_measurements_df = (
    hawaii_measurements_df
    .dropna(axis=0, how='any')
    .reset_index()
    .rename(columns={'index': 'id',
                     'prcp': 'precipitation',
                     'tobs': 'temp'})
    .assign(precipitation=lambda frame: frame['precipitation'].round(decimals=2))
)

# Write the cleaned frame to disk. to_csv is called with its defaults, so the
# frame's index is written too (as an unnamed leading column before 'id').
clean_measurements_path = os.path.join('Resources',
                                       'clean_hawaii_measurements.csv')
hawaii_measurements_df.to_csv(clean_measurements_path)

In [4]:
# Step 3.2: Clean station data
# remove redundant information from station names (, HI US):
# keep only the text before the first comma
hawaii_stations_df['name'] = hawaii_stations_df['name'].str.split(',').str.get(0)

# round lat and long to 4 decimal places
hawaii_stations_df['latitude'] = hawaii_stations_df['latitude'].round(decimals=4)
hawaii_stations_df['longitude'] = hawaii_stations_df['longitude'].round(decimals=4)

# make sure elevation data points are all floats
hawaii_stations_df['elevation'] = hawaii_stations_df['elevation'].astype(float)

# reset index and rename it to 'id' to create the primary key column.
# reset_index() returns a NEW frame, so chaining rename(..., inplace=True)
# onto it only renames that temporary and leaves hawaii_stations_df without
# an 'id' column; the result has to be assigned back to the name instead.
# The guard keeps the cell re-runnable: without it a second run would add a
# duplicate 'id' column.
if 'id' not in hawaii_stations_df.columns:
    hawaii_stations_df = (
        hawaii_stations_df
        .reset_index()
        .rename(columns={'index': 'id'})
    )

# export cleaned data to a CSV
# to_csv is left at its defaults, so the index is still written as an unnamed
# leading column (as before, and as in the measurements export), now followed
# by the 'id' column.
hawaii_stations_df.to_csv(os.path.join('Resources',
                                       'clean_hawaii_stations.csv'))