<a href="https://colab.research.google.com/github/ipeirotis/dealing_with_data/blob/master/08-Visualization/Visualization_Examples-NCDC_Normals.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from io import StringIO
import seaborn as sns
from matplotlib import pyplot as plt
import matplotlib.dates as mdates
import numpy as np

In [None]:
# Seaborn theme for the whole notebook: fonts scaled to 150% of the
# default size, drawn on a white background with a grid
sns.set(style="whitegrid", font_scale=1.5)

# NOAA's 1981-2010 Climate Normals

[Description of the dataset](https://www1.ncdc.noaa.gov/pub/data/normals/1981-2010/readme.txt)

## Loading the dataset

### Load the weather stations

In [None]:
# The station inventory is a "Fixed Width File", so we read it
# directly from the URL with the .read_fwf command
stations_url = 'https://www1.ncdc.noaa.gov/pub/data/normals/1981-2010/station-inventories/allstations.txt'
stations = pd.read_fwf(stations_url, header=None)

# The README documents the meaning of each column
station_columns = [
    'station_id', 'lat', 'lon', 'elevation', 'state', 'name',
    'is_GSN_station', 'is_HCN_station', 'WMO_number',
]
stations.columns = station_columns

# Drop the columns that we do not need
stations = stations.drop(columns=['is_GSN_station', 'is_HCN_station', 'WMO_number'])

# Keep only the stations whose id starts with "US" (stations located in the US)
condition = stations["station_id"].str.startswith("US")
stations = stations.loc[condition]

stations

### Load the temperature measurements

This data file requires a bit of data wrangling to get it into the format that we want.

In [None]:
# URL of the fixed-width file with the temperature measurements
data_url = "https://www1.ncdc.noaa.gov/pub/data/normals/1981-2010/products/temperature/dly-tavg-normal.txt"

# infer_nrows is deliberately large: otherwise columns holding values
# longer than 4 characters get truncated (e.g., all the temperatures
# above 100 in Death Valley lose their leading 1 and end up close to 0).
# Every column is read as text (dtype="object"); parsing happens later.
df = pd.read_fwf(
    data_url,
    header=None,
    infer_nrows=100000,
    dtype="object",
)

In [None]:
# Name the columns of the fixed-width file: the station, the month,
# and then one column per day of the month ("01" through "31")
columns = ["station_id", "month"]
days = [f"{day_number:02d}" for day_number in range(1, 32)]
df.columns = [*columns, *days]

In [None]:
# Un-pivot the table from wide to long format.
# ["station_id", "month"] stay as identifier columns; each of the
# per-day columns becomes a row, with the day stored under the column
# "day" and the cell content under the (default) column "value"
df = df.melt(
    id_vars=columns,
    value_vars=days,
    var_name="day",
)

In [None]:
# Each raw value is a number followed by an optional flag.
# The regex captures the (possibly negative) number in the first group
# and the trailing flag characters in the second group.
regex = r"(-*\d+)(\w*)"
extracted = df["value"].str.extract(regex)
extracted.columns = ["temperature", "flag"]

# Replace the raw "value" column with the two extracted columns
df = pd.concat([df.drop(columns="value"), extracted], axis="columns")

In [None]:
# Parse the temperature text into numbers
df["temperature"] = pd.to_numeric(df["temperature"])

# Remove the dummy value -8888 (presumably the placeholder for days that
# do not exist in a month, such as Feb 30 -- see the dataset README).
# The explicit .copy() makes the filtered frame independent of the
# original, so the assignment below does not raise SettingWithCopyWarning.
df = df[df["temperature"] != -8888].copy()

# Convert to F by dividing by 10
df["temperature"] = df["temperature"] / 10

In [None]:
# Keep only entries whose flag is "C", "S" or "R" (Complete, Standard
# and, presumably, Representative -- see the dataset README for the
# exact definitions); rows with any other flag, or no flag, are dropped.
# The .copy() detaches the result from the unfiltered frame, so that new
# columns can be added to it later without a SettingWithCopyWarning.
df = df[df["flag"].isin(['C','S','R'])].copy()

In [None]:
# Turn each month-day pair into an actual date. The data has no year,
# so we attach a fixed one (2000) to build a parseable date string.
date_text = "2000-" + df["month"].astype(str) + "-" + df["day"].astype(str)
df["date"] = date_text
df["date"] = pd.to_datetime(df["date"])

In [None]:
# Attach the station details to every measurement
# (the join uses the columns that the two tables have in common)
df = df.merge(stations)

## Pick data for a few selected locations

In [None]:
# Stations that we want to plot, keyed by station id.
# The values are friendly names, used later to rename the time series.
list_stations = {
    # Death Valley, CA 92328 hottest point in the US
    "USC00042319": "Death Valley",
    # New York, NY 10012
    "USW00094728": "New York",
    # San Diego, CA 92145
    "USW00093107": "San Diego",
    # Houston, TX 77061
    "USW00012918": "Houston",
    # Barrow, AK 99723, coldest point in the US
    "USW00027502": "Barrow",

    # Honolulu, HI 96813 (currently disabled)
    #"USC00516128": "Honolulu",
}

In [None]:
# Show the selected station ids as a quoted, comma-separated list in brackets
"['" + "','".join(list_stations) + "']"

In [None]:
# Restrict the measurements to the selected stations and
# order them by station and then chronologically
condition = df['station_id'].isin(list_stations.keys())
data = df.loc[condition].sort_values(by=['station_id', 'date'])
data

## Create a Line Plot

In [None]:
# Build a pivot table of temperatures with one row per date and one
# column per station, then swap the station ids in the column labels
# for the friendly names
pivot = (
    data
    .pivot_table(index="date", columns="station_id", values="temperature")
    .rename(columns=list_stations)
)

# Date is the index and the (renamed) stations are the columns
pivot

In [None]:
# One line per location, with the dates of the index on the x-axis
ax = pivot.plot(figsize=(8,5), linewidth=3, grid=True)
ax.set(xlabel="Day of the year", ylabel="Temperature (°F)")

# Anchor the legend just outside the plotting area
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
pass

### Line plot, using redundant encoding for location (both line color, and line style)

In [None]:
# Seaborn draws each column of the wide-form table as its own line,
# varying both the color and the dash pattern
fig, ax = plt.subplots(figsize=(10,5))
ax = sns.lineplot(data=pivot, linewidth=2, ax=ax)
ax.set(xlabel="Day of the year", ylabel="Temperature (°F)")

# Legend goes just outside the plotting area
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
pass

### Line plot, using polar coordinates for the time of the year

In [None]:
fig, ax = plt.subplots(subplot_kw={'projection': 'polar'}, figsize=(10,7))

# Map every date to an angle: the first date of the index sits at 0
# and the last one at 2*pi
t = mdates.date2num(pivot.index.to_pydatetime())
tnorm = (t-t.min())/(t.max()-t.min())*2.*np.pi

ax.set_ylim(-40,120)

# Fix the eight tick positions (one every 45 degrees) before labelling
# them, so that each label is tied to a known angle instead of relying
# on whatever ticks the axes happens to generate
ax.set_xticks(np.arange(8) * np.pi / 4)
ax.set_xticklabels(['Jan-1', 'Feb-15', 'Apr-1', 'May-15', 'Jul-1', 'Aug-15', 'Oct-1', 'Nov-15'])

# Time runs clockwise, starting from the top of the circle
ax.set_theta_direction(-1)
ax.set_theta_zero_location("N")

# Draw on the polar axes explicitly, using the angles as the index
sns.lineplot(data=pivot.set_index(tnorm), linewidth=2, ax=ax)
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)
pass

## Create a Heatmap

In [None]:
# We create temperature data with monthly granularity: the average of
# the daily values within each calendar month, one row per month.
# Grouping by the monthly period of each date avoids the "1M" resampling
# alias, which recent pandas versions deprecate in favor of "ME".
heatmap = pivot.groupby(pivot.index.to_period("M")).mean()
heatmap

In [None]:
# Beautifying: label the rows with the month names instead of dates
months = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()
heatmap.index = months
heatmap

In [None]:
# Beautifying: put the columns in order from the coldest location
# to the hottest one
locations = ["Barrow", "New York", "San Diego", "Houston", "Death Valley"]
heatmap = heatmap.loc[:, locations]
heatmap

In [None]:
# Plot the heatmap with the Seaborn library.
# The table is transposed so that locations are rows and months are columns.
plt.figure(figsize=(12,5))
sns.heatmap(
    data=heatmap.transpose(),
    cmap='coolwarm',
    linewidths=1,
    cbar_kws={'label': 'Temperature (°F)'},
)
pass

In [None]:
# Same plot as before, now with the temperature value written in each box
plt.figure(figsize=(12,3))
sns.heatmap(
    data=heatmap.transpose(),
    cmap='coolwarm',
    linewidths=1,
    cbar_kws={'label': 'Temperature (°F)'},
    annot=True,  # write the data value inside each box
    fmt="3.0f",  # minimum width of 3 characters, 0 decimals
)

pass

In [None]:
# We now change the granularity to weekly and re-plot

plt.figure(figsize=(26,2))
# NOTE: this changes the global seaborn settings, so it also affects
# the cells that run after this one
sns.set(font_scale = 0.8)

# Resampling our dataset to weekly instead of monthly
heatmap = pivot.resample("1W").mean()
# Changing the dates to be "w01", "w02", "w03", etc. The number of
# labels follows the number of weekly bins actually produced, instead
# of assuming that there are always exactly 53 of them
heatmap.index = [f'w{i:02d}' for i in range(1, len(heatmap) + 1)]
# Reordering the pivot columns
heatmap = heatmap[locations]

# Same plot as before, but now we add a bit of control for the
# tick labels (adding 45 degree rotation for the x-axis labels)
g = sns.heatmap(heatmap.T, annot=True, fmt="2.0f", linewidths=1, cbar_kws={'label': 'Temperature (°F)'}, cmap='coolwarm')
g.set_yticklabels(g.get_yticklabels(),rotation=0)
g.set_xticklabels(g.get_xticklabels(),rotation=45)
pass

In [None]:
# Weekly heatmap again, this time without the numbers inside the boxes
plt.figure(figsize=(26,2))
g = sns.heatmap(
    data=heatmap.transpose(),
    cmap='coolwarm',
    linewidths=1,
    cbar_kws={'label': 'Temperature (°F)'},
)
# Horizontal labels for the locations, 45-degree labels for the weeks
g.set_yticklabels(g.get_yticklabels(), rotation=0)
g.set_xticklabels(g.get_xticklabels(), rotation=45)
pass

In [None]:
# Replotting the daily values. Leaving out the option "linewidths=1"
# removes the white lines between the boxes.
plt.figure(figsize=(26,2))
heatmap = pivot[locations]
g = sns.heatmap(
    data=heatmap.transpose(),
    cmap='coolwarm',
    cbar_kws={'label': 'Temperature (°F)'},
)
g.set_yticklabels(g.get_yticklabels(), rotation=0)
# Hide the x-axis tick labels
g.set(xticklabels=[])
pass

## Plotting temperatures of one location against temperatures in another

In [None]:
# Select the style BEFORE creating the figure: the style is picked up
# when the axes are created, so setting it afterwards would leave this
# plot with whatever style an earlier cell had left active
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(7,7))

# One point per date: New York temperature against Barrow temperature
sns.scatterplot(data=pivot, x="New York", y="Barrow", s=5, ax=ax)

### Plotting one location against another, using color to mark the month

In [None]:
# Select the style before creating the figure, so that the axes
# created below actually use it
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(7,7))

# Turn the date index into a regular column and derive the month name
# from it, so that the month can drive the color of the points
d = pivot.reset_index()
d['month'] = d['date'].dt.month_name()

sns.scatterplot(data=d, x="New York", y="Barrow", s=5, hue='month', palette="hsv", ax=ax)
# Put the legend a bit outside the box
ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0)

## For Further Visualizations: Overall statistics per location

In [None]:
# Per-station summary statistics of the temperature.
# The keys are the aggregations to compute (in this order) and the
# values are the names given to the resulting columns.
agg_names = {
    "std": "std_temp",
    "mean": "avg_temp",
    "max": "max_temp",
    "min": "min_temp",
}
stats = df.pivot_table(
    index="station_id",
    values="temperature",
    aggfunc=list(agg_names.keys()),
)
stats.columns = list(agg_names.values())
stats = stats.reset_index()

# States to leave out of the statistics (none by default)
excluded_states = []
# excluded_states = ['HI','CA','FL','WA','OR','AK','TX']
s = stations[~stations["state"].isin(excluded_states)]

# Add the station details and order by the maximum temperature
stats = stats.merge(s).sort_values('max_temp')
stats

In [None]:
# Average temperature against elevation, one point per station,
# colored by the standard deviation of the temperature
stats.plot.scatter(
    x='elevation',
    y='avg_temp',
    c='std_temp',
    cmap='coolwarm',
    figsize=(10,10),
)

In [None]:
# Stations placed by longitude and latitude, drawn as small points
# colored by the standard deviation of the temperature
stats.plot.scatter(
    x='lon',
    y='lat',
    s=2,
    c='std_temp',
    cmap='rainbow',
    figsize=(20,10),
)