# Lesson 3: Transforming Data
Sample annual temperature and precipitation climate data from Portland, OR, San Diego, CA, Tampa, FL, and Bangor, ME. Data from [NOAA Climate Data Online](http://www.ncdc.noaa.gov/cdo-web/datasets#ANNUAL). See accompanying documentation.

The purpose of this notebook is to demonstrate some functionality in pandas. No attempt at a sensible analysis is made.

In [None]:
import os
import csv

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the climate observations from data/climate.csv.
# The sentinel values -9999 and 9999 are parsed as missing (NaN).
climate_csv_path = os.path.join("data", "climate.csv")
climate = pd.read_csv(climate_csv_path, na_values=[-9999, 9999])

In [None]:
# Read the lookup file that maps raw column names to something human readable.
# The first field of each row is used as the key and the second field as the value.
namemap = {}
# newline="" lets the csv module handle line endings (and quoted newlines) itself.
with open(os.path.join("data", "climate_colnames.csv"), newline="") as csvfile:
    for row in csv.reader(csvfile):
        # Blank lines come back as [] and would otherwise raise IndexError;
        # skip any row that does not carry both a key and a value.
        if len(row) < 2:
            continue
        namemap[row[0]] = row[1]

In [None]:
# Replace each column name with its mapped readable name (falling back to the
# original name when no mapping exists), lower-cased.
renamed_columns = []
for raw_name in climate.columns:
    readable_name = namemap.get(raw_name, raw_name)
    renamed_columns.append(readable_name.lower())
climate.columns = renamed_columns

In [None]:
# Display the column index after renaming to check the new names.
climate.columns

In [None]:
# Derive helper columns from the station name and the observation date.
# The slice [-5:-2] of each station name is stored as the "state" column.
# NOTE(review): assumes every station name ends with a state code followed by
# a fixed-width suffix -- confirm against the data.
climate["state"] = climate.station_name.map(lambda name: name[-5:-2])

# Parse the YYYYMM-formatted date once, then split out its year and month.
parsed_dates = pd.to_datetime(climate.date, format="%Y%m")
climate["date"] = parsed_dates
climate["year"] = parsed_dates.dt.year
climate["month"] = parsed_dates.dt.month

In [None]:
# Print the distinct values of each derived column as a quick sanity check.
for derived_column in ("state", "year", "month"):
    print(climate[derived_column].unique())

In [None]:
# Show the per-state maximum of every column.
groupbystate = climate.groupby("state")
# Name the aggregation with the string alias: passing the builtin `max` to
# `agg` is deprecated in recent pandas and emits a FutureWarning.
groupbystate.agg("max")

In [None]:
# Group the observations by station; later cells reuse `groupbystation`.
groupbystation = climate.groupby("station_name")
# The original cell referenced `groupbystation.agg` without calling it, which
# only displays the bound method. Show the per-station maximum instead,
# matching the per-state aggregation used for `groupbystate`.
# NOTE(review): the intended aggregation is not stated in the notebook -- confirm.
groupbystation.agg("max")

In [None]:
# Preview a column selection: the positional slice [3:-4] of the columns,
# unioned with "state".
value_columns = climate.columns[3:-4]
value_columns.union(["state"])

In [None]:
# Apply seaborn's default styling, then draw a pair plot of the selected
# columns coloured by state, using only rows with no missing values.
sns.set()
pairplot_columns = climate.columns[3:-4].union(["state"])
climate_values_nona = climate[pairplot_columns].dropna()
sns.pairplot(climate_values_nona, hue="state")

## Normalization

In [None]:
# View the max_daily_temp distribution for one station per state.
# The station kept for each state is the one whose name is the group's maximum.
# Call `.max()` directly: passing the builtin `max` to `agg` is deprecated in
# recent pandas.
stationlist = list(groupbystate.station_name.max())
# Vectorized membership test instead of a per-row Python lambda.
climate_stations = climate[climate["station_name"].isin(stationlist)]

# One histogram panel per selected station.
g = sns.FacetGrid(climate_stations, col="station_name")
g.map(plt.hist, "max_daily_temp")

In [None]:
# Calculate the normalized max_daily_temp (i.e. the Z-score) for each station:
# (value - station mean) / station standard deviation.
mean = groupbystation.max_daily_temp.mean()
std = groupbystation.max_daily_temp.std()

# Look up each row's station statistics with a vectorized map rather than a
# row-wise DataFrame.apply, which runs a Python call per row.
station_of_row = climate["station_name"]
climate["max_daily_temp_norm"] = (
    climate["max_daily_temp"] - station_of_row.map(mean)
) / station_of_row.map(std)

In [None]:
# Show the normalized max temperature distribution for the stations in `stationlist`.
# Re-select the rows so the subset includes the max_daily_temp_norm column;
# `isin` is the vectorized equivalent of a per-row membership lambda.
climate_stations = climate[climate["station_name"].isin(stationlist)]

# One histogram panel per selected station.
g = sns.FacetGrid(climate_stations, col="station_name")
g.map(plt.hist, "max_daily_temp_norm")

## Dummy Variables
For some algorithms you will need to convert categorical variables to dummy variables, mutually exclusive categories that take boolean values.

In [None]:
# One indicator column per distinct station name.
station_names = climate["station_name"]
pd.get_dummies(station_names)

## Create Example Plots
Code for plots used in lecture.

In [None]:
# Side-by-side box plot (Explanatory: Categorical, Response: Numerical):
# one box of max_daily_temp per state.
sns.boxplot(
    data=climate,
    x="state",
    y="max_daily_temp",
)

In [None]:
# Bin snow day count to make an ordinal categorical variable from a discrete numerical variable.
# With right=False each bin is closed on the left: [0, 1), [1, 5), [5, 10), [10, 20), [20, 366).
snowdaysbins = [0, 1, 5, 10, 20, 366]
binlabels = ["0", "1-4", "5-9", "10-19", "20+"]
# Use item assignment: `climate.days_with_snow = ...` only sets a Python
# attribute on the DataFrame object and does not add a column (pandas warns
# about this), so the binned values would be missing from the frame itself.
climate["days_with_snow"] = pd.cut(
    climate["count_days_with_snow"],
    bins=snowdaysbins,
    right=False,
    labels=binlabels,
)
# Create the contingency table of state versus binned snow days.
snowtable = pd.crosstab(climate["state"], climate["days_with_snow"])
# View the contingency table.
snowtable