# Capstone One Data Wrangling

In [15]:
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# pandas display and warning configuration
pd.options.display.max_columns = 125  # the csv contains 124 columns
pd.options.display.max_rows = 4000  # allow many more rows to be displayed
pd.set_option('mode.chained_assignment', None)  # silence false-positive warnings about editing a copy

In [3]:
# Load the raw climate export.
# low_memory=False makes pandas infer each column's dtype from the whole file in
# one pass instead of chunk by chunk, so a column that mixes numbers with
# strings (the original note mentions a stray float at the end of one column)
# is given one consistent dtype rather than producing a mixed-types warning.
data = pd.read_csv('montereyClimateData.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


Get relevant columns from the data set:

In [4]:
# Keep only the timestamp plus the hourly and daily measurements used below.
columns = [
    'DATE',
    'HourlyPresentWeatherType',
    'HourlySkyConditions',
    'HourlyVisibility',
    'HourlyDryBulbTemperature',
    'HourlyWindSpeed',
    'DailyMaximumDryBulbTemperature',
    'DailyMinimumDryBulbTemperature',
    'DailyPeakWindSpeed',
    'DailyPrecipitation',
]
df = data.loc[:, columns]

### 1. Convert the 'DATE' column into a new 'datetime' column of datetime objects.

In [5]:
def date_val_to_datetime(to_parse):
    """Parse a 'YYYY-MM-DDTHH:MM:SS' string into a datetime object.

    The string is split on 'T'; the first piece is read as the date and the
    second as the time of day.
    """
    pieces = to_parse.split('T')
    date_part, time_part = pieces[0], pieces[1]
    return datetime.strptime(' '.join((date_part, time_part)), '%Y-%m-%d %H:%M:%S')

# Parse every DATE string and store the results in a new 'datetime' column.
df['datetime'] = df['DATE'].map(date_val_to_datetime)

### 2. Set this new column to be a datetime index for the dataframe.

In [6]:
# Use the parsed 'datetime' column as the frame's index.
df = df.set_index('datetime')

### 3. Convert strings to float across dataset where possible, replacing asterisks and suspect values with NaNs.

In [7]:
# Inspect the column dtypes: every measurement column except HourlyWindSpeed is
# still object and must be cleaned and converted to float for numeric processing.
df.dtypes

DATE                               object
HourlyPresentWeatherType           object
HourlySkyConditions                object
HourlyVisibility                   object
HourlyDryBulbTemperature           object
HourlyWindSpeed                   float64
DailyMaximumDryBulbTemperature     object
DailyMinimumDryBulbTemperature     object
DailyPeakWindSpeed                 object
DailyPrecipitation                 object
dtype: object

In [8]:
cols = [  # columns to convert
    'HourlyVisibility',
    'HourlyDryBulbTemperature',
    'HourlyWindSpeed',
    'DailyMaximumDryBulbTemperature',
    'DailyMinimumDryBulbTemperature',
    'DailyPeakWindSpeed',
    'DailyPrecipitation',
]

# Convert the columns by applying to_numeric with error coercion: any entry that
# cannot be parsed as a number becomes NaN.
# Assign through df[cols] rather than df.loc[:, cols]. Recent pandas versions
# make the .loc form write into the existing object-dtype columns in place,
# which leaves their dtype as object and breaks the float64 check that follows;
# df[cols] always replaces the columns with the converted float data.
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce')

In [9]:
# check for desired result
for c in cols:
    assert df.loc[:, c].dtypes == np.float64
    assert len(df.loc[df[c].astype(str).str[-1].isin(('*', 's'))]) == 0

In [10]:
df.dtypes # all numeric fields are now floats

DATE                               object
HourlyPresentWeatherType           object
HourlySkyConditions                object
HourlyVisibility                  float64
HourlyDryBulbTemperature          float64
HourlyWindSpeed                   float64
DailyMaximumDryBulbTemperature    float64
DailyMinimumDryBulbTemperature    float64
DailyPeakWindSpeed                float64
DailyPrecipitation                float64
dtype: object

### 4. Backfill the daily values to eliminate missing values: maximum daily temperature, minimum daily temperature, maximum daily wind speed, and daily precipitation columns throughout each day should have identical values.


In [11]:
# Backfill the daily summary columns so that rows with a missing daily value
# take the next recorded value further down the frame.
daily_columns = [
    'DailyMaximumDryBulbTemperature',
    'DailyMinimumDryBulbTemperature',
    'DailyPeakWindSpeed',
    'DailyPrecipitation',
]
df.loc[:, daily_columns] = df.loc[:, daily_columns].bfill()

### 5. Replace each sky condition string with a list of dictionaries. In each dictionary the sky condition code is the key and a SkyCondition namedtuple is the value.

In [12]:
# column value is a string of space-separated codes, e.g. 'BKN:07 15 OVC:08 20'
# desired output is a list of single-entry dicts keyed by condition code, e.g.
# [{'BKN': SkyCondition(7, 15)}, {'OVC': SkyCondition(8, 20)}]
# clear days lack a second integer, i.e., 'CLR:00'; 0 is used in place of the missing value
from collections import namedtuple

# Record stored as the value of each parsed sky-condition dictionary.
SkyCondition = namedtuple(
    typename='SkyCondition',
    field_names='obscuration, vertical_distance',
)

def list_of_lists_by_n(the_list, n):
    """Generate consecutive slices of the_list, each holding at most n items.

    The final slice is shorter when len(the_list) is not a multiple of n.
    """
    yield from (
        the_list[start:start + n]
        for start in range(0, len(the_list), n)
    )
        
def from_many_to_two(the_string):
    """Split the_string on single spaces and group the tokens two at a time.

    Returns a list of lists; the last inner list holds one token when the
    number of tokens is odd.
    """
    tokens = the_string.split(' ')
    pairs = []
    for pair in list_of_lists_by_n(tokens, 2):
        pairs.append(pair)
    return pairs

def from_two_to_three(list_of_lists):
    """
    Turn chunked sky-condition tokens into a list of single-entry dictionaries.

    input: [['BKN:07', '15'], ['OVC:08', '20']]
    output: [{'BKN': SkyCondition(obscuration=7, vertical_distance=15)},
             {'OVC': SkyCondition(obscuration=8, vertical_distance=20)}]

    Each chunk is expected to look like ['CODE:NN', 'DD']. A chunk holding only
    'CODE:NN' (e.g. ['CLR:00']) gets a vertical distance of 0. A chunk whose
    first token is two characters or shorter (a single trailing int) is skipped.

    Always returns a list, and leaves the input lists unmodified.
    """
    output = []
    for two_element_list in list_of_lists:
        first_element = two_element_list[0]
        if len(first_element) <= 2:
            # Single trailing int: skip just this chunk. Returning early here
            # would throw away the conditions already parsed for this row and
            # hand back a different type than every other path.
            continue
        first_element_split = first_element.split(":")
        # CLR days have no height token after the code; use 0 in its place
        # without appending to the caller's list.
        if len(two_element_list) > 1:
            vertical_distance = two_element_list[1]
        else:
            vertical_distance = 0
        condition = SkyCondition(int(first_element_split[1]), int(vertical_distance))
        output.append({first_element_split[0]: condition})
    return output

def condition_string_to_namedtuple_dict(value):
    """
    Parse one raw sky-conditions cell.

    A float input (in practice the NaN that marks a missing cell) yields an
    empty list. Any other input is treated as a string such as "CAPS:03 34":
    it is chunked by from_many_to_two and the chunks are handed to
    from_two_to_three, whose result is returned unchanged, normally a list of
    single-entry dictionaries like
    [{'CAPS': SkyCondition(obscuration=3, vertical_distance=34)}].
    """
    if not isinstance(value, float):
        chunked_tokens = from_many_to_two(value)
        return from_two_to_three(chunked_tokens)
    return []  # NaNs are replaced by an empty list

# Replace each raw sky-conditions string with its parsed representation.
df['HourlySkyConditions'] = df['HourlySkyConditions'].map(condition_string_to_namedtuple_dict)

Next we add an average obscuration score for each hour in the dataset:

In [22]:
def calculate_average_obscuration(sky_conditions_for_hour):
    """Return the mean obscuration over one hour's parsed sky conditions.

    sky_conditions_for_hour is a sequence of dictionaries whose values expose
    an `obscuration` attribute; the first value of each dictionary contributes
    to the mean. An empty (falsy) input returns NaN.
    """
    if sky_conditions_for_hour:
        total = 0
        for layer in sky_conditions_for_hour:
            layer_obscurations = [condition.obscuration for condition in layer.values()]
            total += layer_obscurations[0]
        return total / len(sky_conditions_for_hour)
    return np.nan
    

# Store the mean obscuration of each hour's parsed sky conditions in a new column.
df['averageObscuration'] = df['HourlySkyConditions'].map(calculate_average_obscuration)