In [14]:
import pandas as pd
import re
import mysql.connector

In [34]:
# Load the Blaine weather observations from the CSV in the data directory.
weather_csv_path = '../data/blaine_weather.csv'
df = pd.read_csv(weather_csv_path)

In [35]:
# Some header names carry one leading space; remove exactly that one space.
leading_space = re.compile(r'^ ')
df.columns = [leading_space.sub('', name) for name in df.columns.values]

In [37]:
# Rows with a missing Events value are labelled 'Clear'.
df['Events'] = df['Events'].fillna('Clear')

In [38]:
# Tally how many rows fall under each Events label.
df.Events.value_counts()

Rain                 175
Clear                161
Fog                   39
Fog-Rain              10
Rain-Snow              7
Fog-Rain-Snow          3
Snow                   2
Rain-Thunderstorm      1
Name: Events, dtype: int64

In [39]:
# Zero-pad single-digit month/day components at the end of each PST string
# (e.g. '2006-1-5' -> '2006-01-05') so every date has the same width.
#
# This is done with the `re` module instead of `Series.str.replace` because:
#   * the previous replacement strings were non-raw and contained the invalid
#     escape `\g`, which Python warns about; and
#   * `Series.str.replace` only treats the pattern as a regex when told to in
#     newer pandas, so the old calls could silently stop matching.
_MONTH_DAY_TAIL = re.compile(r'-(\d{1,2})-(\d{1,2})$')


def _zero_pad_month_day(date_text):
    """Return ``date_text`` with its trailing ``-M-D`` part padded to ``-MM-DD``.

    Strings that do not end in ``-<1-2 digits>-<1-2 digits>`` are returned
    unchanged, matching the end-anchored patterns used previously.
    """
    return _MONTH_DAY_TAIL.sub(
        lambda match: '-{}-{}'.format(match.group(1).zfill(2),
                                      match.group(2).zfill(2)),
        date_text,
    )


# na_action='ignore' leaves missing values as-is instead of passing them to re.
df.PST = df.PST.map(_zero_pad_month_day, na_action='ignore')

In [41]:
# Parse the PST strings into datetime values using the year-month-day layout.
pst_format = '%Y-%m-%d'
df['PST'] = pd.to_datetime(df['PST'], format=pst_format)

In [42]:
# Print a summary of the frame: index range, columns, non-null counts, dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 397
Data columns (total 23 columns):
PST                          398 non-null datetime64[ns]
Max TemperatureF             398 non-null int64
Mean TemperatureF            398 non-null int64
Min TemperatureF             398 non-null int64
Max Dew PointF               398 non-null int64
MeanDew PointF               398 non-null int64
Min DewpointF                398 non-null int64
Max Humidity                 398 non-null int64
Mean Humidity                398 non-null int64
Min Humidity                 398 non-null int64
Max Sea Level PressureIn     398 non-null float64
Mean Sea Level PressureIn    398 non-null float64
Min Sea Level PressureIn     398 non-null float64
Max VisibilityMiles          398 non-null int64
Mean VisibilityMiles         398 non-null int64
Min VisibilityMiles          398 non-null int64
Max Wind SpeedMPH            398 non-null int64
Mean Wind SpeedMPH           398 non-null int64
Max Gust SpeedMP

In [43]:
# Preview the first rows to sanity-check the parsed dates and values.
df.head()

Unnamed: 0,PST,Max TemperatureF,Mean TemperatureF,Min TemperatureF,Max Dew PointF,MeanDew PointF,Min DewpointF,Max Humidity,Mean Humidity,Min Humidity,...,Max VisibilityMiles,Mean VisibilityMiles,Min VisibilityMiles,Max Wind SpeedMPH,Mean Wind SpeedMPH,Max Gust SpeedMPH,PrecipitationIn,CloudCover,Events,WindDirDegrees
0,2006-01-01,53,48,43,42,38,35,86,70,52,...,10,10,9,30,13,48.0,0.01,5,Rain,134
1,2006-01-02,48,42,37,43,40,36,96,83,66,...,10,10,9,14,8,20.0,0.04,7,Rain,180
2,2006-01-03,45,40,35,39,36,34,96,80,68,...,10,10,10,16,8,23.0,0.02,6,Rain,127
3,2006-01-04,51,47,43,40,38,36,86,73,56,...,10,10,10,16,11,,0.0,5,Rain,112
4,2006-01-05,60,52,44,48,44,34,100,83,39,...,10,8,2,26,11,39.0,1.06,8,Rain,118


In [46]:
# Show the cleaned column labels as an array (copied into `cols` further down).
df.columns.values

array(['PST', 'Max TemperatureF', 'Mean TemperatureF', 'Min TemperatureF',
       'Max Dew PointF', 'MeanDew PointF', 'Min DewpointF', 'Max Humidity',
       'Mean Humidity', 'Min Humidity', 'Max Sea Level PressureIn',
       'Mean Sea Level PressureIn', 'Min Sea Level PressureIn',
       'Max VisibilityMiles', 'Mean VisibilityMiles',
       'Min VisibilityMiles', 'Max Wind SpeedMPH', 'Mean Wind SpeedMPH',
       'Max Gust SpeedMPH', 'PrecipitationIn', 'CloudCover', 'Events',
       'WindDirDegrees'], dtype=object)

## Import into weather_raw table first
* A few functions below to make create table command easier
* Imported using `copy weather_raw from '/Users/jng/galvanize/BorderCrossing/data/blaine_weather-2013-clean.csv' (HEADER TRUE, DELIMITER ',', FORMAT 'csv');`
* "T" values in the precipitation column were converted to 0.01

In [3]:
# Column labels exactly as they appear in the cleaned DataFrame.
raw_column_names = [
    'PST',
    'Max TemperatureF', 'Mean TemperatureF', 'Min TemperatureF',
    'Max Dew PointF', 'MeanDew PointF', 'Min DewpointF',
    'Max Humidity', 'Mean Humidity', 'Min Humidity',
    'Max Sea Level PressureIn', 'Mean Sea Level PressureIn',
    'Min Sea Level PressureIn',
    'Max VisibilityMiles', 'Mean VisibilityMiles', 'Min VisibilityMiles',
    'Max Wind SpeedMPH', 'Mean Wind SpeedMPH', 'Max Gust SpeedMPH',
    'PrecipitationIn', 'CloudCover', 'Events', 'WindDirDegrees',
]

# Swap every space for an underscore so the labels can be used as SQL identifiers.
cols = ['_'.join(label.split(' ')) for label in raw_column_names]

In [5]:
# Flatten the column names into one comma-separated string.
separator = ", "
columns = separator.join(cols)

In [6]:
# Display the comma-separated column string built from `cols`.
columns

'PST, Max_TemperatureF, Mean_TemperatureF, Min_TemperatureF, Max_Dew_PointF, MeanDew_PointF, Min_DewpointF, Max_Humidity, Mean_Humidity, Min_Humidity, Max_Sea_Level_PressureIn, Mean_Sea_Level_PressureIn, Min_Sea_Level_PressureIn, Max_VisibilityMiles, Mean_VisibilityMiles, Min_VisibilityMiles, Max_Wind_SpeedMPH, Mean_Wind_SpeedMPH, Max_Gust_SpeedMPH, PrecipitationIn, CloudCover, Events, WindDirDegrees'

## Copy into weather table
* Start with numerics and date handling
* Store events as categorical boolean flags, one per event type (Rain, Snow, Fog, Thunderstorm)

```sql
-- Project the weather_raw columns of interest: the parsed date, the temperature,
-- visibility, wind and precipitation measurements, and one boolean per event type.
select 
    -- parse pst using the 'YYYY-MM-DD' pattern
    to_date(pst, 'YYYY-MM-DD'),
    max_temperaturef,
    mean_temperaturef,
    min_temperaturef,
    max_visibilitymiles,
    mean_visibilitymiles,
    min_visibilitymiles,
    max_wind_speedmph,
    mean_wind_speedmph,
    max_gust_speedmph,
    precipitationin,
    -- `events like ...` is null when events is null; coalesce turns that into False
    coalesce(events like '%Rain%', False),
    coalesce(events like '%Snow%', False),
    coalesce(events like '%Fog%', False),
    coalesce(events like '%Thunderstorm%', False)
from weather_raw 
```

In [8]:
from dbhelper import pd_query

In [9]:
# Fetch every distinct Events value present in weather_raw.
distinct_events_sql = 'select distinct Events from weather_raw;'
events = pd_query(distinct_events_sql)

In [11]:
# Display the distinct Events values returned by the query.
events

Unnamed: 0,events
0,
1,Fog
2,Fog-Rain-Thunderstorm
3,Fog-Snow
4,Rain-Thunderstorm
5,Fog-Rain-Snow
6,Snow
7,Thunderstorm
8,Fog-Rain
9,Rain


In [12]:
# Individual event names; combined labels such as 'Fog-Rain' are built from these.
event_types = [
    'Fog',
    'Rain',
    'Thunderstorm',
    'Snow',
]