# Contents
1. [Data Cleaning (Weather)](#1.-Data-Cleaning-(Weather))
2. [Data Cleaning (Power Outages)](#2.-Data-Cleaning-(Power-Outages))
3. [Model Prep](#3.-Model-Prep)
4. [NLP Exploration](#4.-NLP-Exploration)
5. [Final Cleanup Before Merging with Weather](#5.-Final-Cleanup-Before-Merge)

In [1]:
# Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pytemperature

# Python magic to display matplotlib plots in the notebook
%matplotlib inline

## 1. Data Cleaning (Weather)

#### Loading weather data downloaded from [Kaggle's Historic Hourly Weather](https://www.kaggle.com/selfishgene/historical-hourly-weather-data).

In [2]:
# Read the Kaggle weather tables (one CSV file per measurement) into separate DataFrames
(humidity,
 pressure,
 temperature,
 weather_des,
 wind_dir,
 wind_speed) = map(pd.read_csv, [
    'data/kaggle_weather_datasets/humidity.csv',
    'data/kaggle_weather_datasets/pressure.csv',
    'data/kaggle_weather_datasets/temperature.csv',
    'data/kaggle_weather_datasets/weather_description.csv',
    'data/kaggle_weather_datasets/wind_direction.csv',
    'data/kaggle_weather_datasets/wind_speed.csv',
])

In [11]:
# Check format of datasets by previewing the first two rows of the temperature table.
# NOTE(review): the other weather tables are assumed to share this layout
# (a 'datetime' column plus one column per city) — confirm if a file changes.
temperature.head(2)

Unnamed: 0,datetime,Vancouver,Portland,San Francisco,Seattle,Los Angeles,San Diego,Las Vegas,Phoenix,Albuquerque,...,Philadelphia,New York,Montreal,Boston,Beersheba,Tel Aviv District,Eilat,Haifa,Nahariyya,Jerusalem
0,2012-10-01 12:00:00,,,,,,,,,,...,,,,,,,309.1,,,
1,2012-10-01 13:00:00,284.63,282.08,289.48,281.8,291.87,291.53,293.41,296.6,285.12,...,285.63,288.22,285.83,287.17,307.59,305.47,310.58,304.4,304.4,303.5


#### Merging weather datasets.

In [12]:
# Merge datasets, keeping only New York weather information

df_list = [pressure, temperature, weather_des, wind_dir, wind_speed]
df_names = ['pressure', 'temperature', 'weather_des', 'wind_dir', 'wind_speed']

## Start merged_df from the humidity dataset
merged_df = humidity[['datetime', 'New York']].rename(columns = {'datetime' : 'date', 'New York' : 'ny_humidity'})

## Add the remaining datasets one at a time. zip() pairs each frame with its
## column suffix directly, so no hand-maintained counter can drift out of sync.
for df_name, df in zip(df_names, df_list):
    new_df = df[['datetime', 'New York']].rename(columns = {'datetime' : 'date', 'New York' : 'ny_' + df_name})
    # how = 'right' keeps every timestamp present in the frame being added
    merged_df = merged_df.merge(new_df, how = 'right', on = 'date')

## Check shape of merged datasets
print('Merged dataset has {} rows and {} columns.'.format(merged_df.shape[0], merged_df.shape[1]))
## Check merged_df
merged_df.head()

Merged dataset has 45253 rows and 7 columns.


Unnamed: 0,date,ny_humidity,ny_pressure,ny_temperature,ny_weather_des,ny_wind_dir,ny_wind_speed
0,2012-10-01 12:00:00,,,,,,
1,2012-10-01 13:00:00,58.0,1012.0,288.22,few clouds,260.0,7.0
2,2012-10-01 14:00:00,57.0,1012.0,288.247676,few clouds,260.0,7.0
3,2012-10-01 15:00:00,57.0,1012.0,288.32694,few clouds,260.0,7.0
4,2012-10-01 16:00:00,57.0,1012.0,288.406203,few clouds,260.0,7.0


#### Changing format, datatypes, dropping nulls, re-setting index.

In [13]:
# Change datatype of 'date' column from object to datetime
merged_df['date'] = pd.to_datetime(merged_df['date'])

# The column keeps the name 'date': the following cells set the index and join on 'date'.
# A rename call used to sit here, but its key was 'data' (a typo for 'date'), which matches
# no column, so pandas silently ignored it and nothing was ever renamed.
# TODO: rename 'date' to 'datetime' at the point where weather is merged with the tweets dataset.

In [14]:
# Humidity and pressure are removed on the assumption that neither correlates strongly with power outages
merged_df = merged_df.drop(['ny_humidity', 'ny_pressure'], axis = 'columns')

In [15]:
# Drop null values by positional slicing. Null values are located only at the beginning and end of the dataset.
# Keeping rows 1 through 44460 allows for no gaps in between 10/1/2012 1 PM - 10/28/2017 12 AM
# NOTE(review): the slice bounds are hard-coded for this particular download — re-check them if the CSVs change.

merged_df = merged_df.iloc[1:44461]

In [16]:
# Use the 'date' column as the index, then order the rows by that index
merged_df = merged_df.set_index('date')
merged_df = merged_df.sort_index()

In [17]:
# Change temperature data from Kelvin to Fahrenheit
# https://pypi.org/project/pytemperature/
# NOTE(review): the column is overwritten with the converted values, so running this cell
# a second time would convert the already-converted numbers again.
merged_df['ny_temperature'] = pytemperature.k2f(merged_df['ny_temperature'])

In [19]:
# Function to convert meters per second to miles per hour.
def convert_mph(meters_ps):
    """Convert a speed from meters per second to whole miles per hour.

    Parameters
    ----------
    meters_ps : int or float
        Speed in meters per second.

    Returns
    -------
    int or float
        Speed in miles per hour, truncated toward zero by int()
        (e.g. 7 m/s is 15.66 mph and is returned as 15).
        A missing input (None / NaN) is returned as NaN rather than raising.
    """
    # int() cannot convert NaN, so pass missing readings through unchanged
    if pd.isna(meters_ps):
        return float('nan')

    # 1609.344 meters per mile; 3600 seconds per hour
    miles = meters_ps / 1609.344
    miles_ph = miles * 3600

    return int(miles_ph)

# Replace each "ny_wind_speed" reading (meters per second) with its miles-per-hour value.
merged_df['ny_wind_speed'] = merged_df['ny_wind_speed'].map(convert_mph)

In [21]:
# Give the converted columns names that state their units.
merged_df.rename(
    columns = {
        'ny_temperature' : 'temperature_f',
        'ny_wind_dir'    : 'wind_dir_degrees',
        'ny_wind_speed'  : 'wind_speed_mph'
    },
    inplace = True
)

# Normalise the column names: every space becomes an underscore.
merged_df.columns = [name.replace(' ', '_') for name in merged_df.columns]

In [20]:
# Preview the cleaned weather frame.
# NOTE(review): the saved output of this cell still shows the 'ny_*' column names, i.e. it was
# executed before the rename cell above — re-run the notebook top to bottom to refresh it.
merged_df.head()

Unnamed: 0_level_0,ny_temperature,ny_weather_des,ny_wind_dir,ny_wind_speed
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-10-01 13:00:00,59.11,few clouds,260.0,15
2012-10-01 14:00:00,59.16,few clouds,260.0,15
2012-10-01 15:00:00,59.3,few clouds,260.0,15
2012-10-01 16:00:00,59.44,few clouds,260.0,15
2012-10-01 17:00:00,59.58,few clouds,261.0,13


#### Creating dummy variables for weather descriptions.

In [22]:
# Create dummy features for text values in 'ny_weather_des' (weather descriptions)

## Check how many unique values for weather descriptions
print('{} unique weather description values.'.format(merged_df.ny_weather_des.nunique()))

## Create dummy variables (one indicator column per distinct description).
dummy_variables = pd.get_dummies(merged_df['ny_weather_des'])

## Attach the dummy columns to the other weather features.
## get_dummies() returns a frame carrying the same index as its input, so the two frames
## line up row for row and can be placed side by side. A key-based merge on 'date' is not
## needed and would multiply rows if a timestamp ever appeared more than once.
## The text/object dtype column 'ny_weather_des' is left out now that it is encoded.
numerical_df = pd.concat([merged_df.drop(columns = 'ny_weather_des'), dummy_variables], axis = 1)

36 unique weather description values.


#### Final cleaned weather dataframe.

In [23]:
# Report the dimensions of the final numeric weather frame
print('Merged dataset has {} rows and {} columns.'.format(*numerical_df.shape))
# Preview numerical_df
numerical_df.head()

Merged dataset has 44460 rows and 39 columns.


Unnamed: 0_level_0,temperature_f,wind_dir_degrees,wind_speed_mph,broken clouds,drizzle,dust,few clouds,fog,freezing rain,haze,...,sky is clear,smoke,snow,squalls,thunderstorm,thunderstorm with heavy rain,thunderstorm with light drizzle,thunderstorm with light rain,thunderstorm with rain,very heavy rain
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2012-10-01 13:00:00,59.11,260.0,15,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-10-01 14:00:00,59.16,260.0,15,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-10-01 15:00:00,59.3,260.0,15,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-10-01 16:00:00,59.44,260.0,15,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2012-10-01 17:00:00,59.58,261.0,13,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## 2. Data Cleaning (Power Outages)

#### Loading power outage data downloaded from [NYC OpenData's OEM Emergency Notifications](https://data.cityofnewyork.us/Public-Safety/OEM-Emergency-Notifications/8vv7-7wx3/data).

In [24]:
# Load outage data from multiple CSV files