# Highlighting data

In [None]:
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt 
import numpy as np 

# Load the wide-format pollution table and preview its first five rows.
pollution = pd.read_csv(filepath_or_buffer='../data/pollution_wide.csv')
pollution.head(n=5)

In [None]:
# A lone Series is treated as wide-form data: row index on x, CO values on y.
sns.scatterplot(data=pollution['CO'])

In [None]:
# Keep only the Cincinnati rows for the plots that follow.
is_cincinnati = pollution['city'].eq('Cincinnati')
cinci_pollution = pollution[is_cincinnati]

In [None]:
# Same wide-form scatter as before, restricted to the Cincinnati rows.
sns.scatterplot(data=cinci_pollution['CO'])

Let's highlight some data points.

In [None]:
# One colour per row: day 38 stands out in orangered, every other day is steelblue.
is_highlight_day = cinci_pollution['day'].eq(38)
cinci_colors = np.where(is_highlight_day, 'orangered', 'steelblue').tolist()

# fit_reg=False leaves only the scatter; the per-point colours and the
# transparency are handed to the scatter call through scatter_kws.
point_style = {'facecolors': cinci_colors, 'alpha': 0.7}
p = sns.regplot(data=cinci_pollution,
                x='NO2',
                y='SO2',
                fit_reg=False,
                scatter_kws=point_style)

In [None]:
houston_pollution = pollution[pollution.city == 'Houston']

# Colour list: orangered for day 330 of year 2014, lightgray for everything else.
is_target_day = (houston_pollution.day == 330) & (houston_pollution.year == 2014)
houston_colors = np.where(is_target_day, 'orangered', 'lightgray').tolist()

# The colours and transparency reach the scatter points through scatter_kws.
point_style = {'facecolors': houston_colors, 'alpha': 0.7}
sns.regplot(data=houston_pollution,
            x='NO2',
            y='SO2',
            fit_reg=False,
            scatter_kws=point_style)
plt.show()

In [None]:
# Take a copy so that adding a column below does not write into a slice of `pollution`.
houston_pollution = pollution[pollution.city == 'Houston'].copy()

# Largest O3 reading observed in Houston.
max_O3 = houston_pollution['O3'].max()

# Label each row: the day(s) matching the maximum versus all the others.
is_peak = houston_pollution['O3'] == max_O3
houston_pollution['point_type'] = is_peak.map({True: 'Highest O3 Day', False: 'Others'})

# Use the label column as the hue so the peak day gets its own colour.
sns.scatterplot(data=houston_pollution,
                x='NO2',
                y='SO2',
                hue='point_type')
plt.show()

# Comparing groups

In [None]:
# Switch seaborn to its 'whitegrid' style for the plots drawn after this cell.
sns.set_style('whitegrid')

In [None]:
# NOTE(review): the variable is named `_nov`, but the filter selects month 10
# (October). The name is kept because a later cell reuses it; confirm which
# month is actually intended.
pollution_nov = pollution[pollution['month'] == 10]

# Compare Denver's O3 distribution (red) with all other cities combined.
# sns.distplot is deprecated; with hist=False it only drew a KDE, so
# sns.kdeplot is the direct replacement.
is_denver = pollution_nov['city'] == 'Denver'
sns.kdeplot(x=pollution_nov.loc[is_denver, 'O3'], color='red')
sns.kdeplot(x=pollution_nov.loc[~is_denver, 'O3'])

In [None]:
# Same comparison as the previous cell, plus a rug of tick marks showing the
# individual Denver observations.
# sns.distplot is deprecated: distplot(hist=False, rug=True) is a KDE curve
# with a rug, so kdeplot + rugplot reproduce it.
denver_o3 = pollution_nov.loc[pollution_nov['city'] == 'Denver', 'O3']
other_o3 = pollution_nov.loc[pollution_nov['city'] != 'Denver', 'O3']

sns.kdeplot(x=denver_o3, color='red')
sns.rugplot(x=denver_o3, color='red')
sns.kdeplot(x=other_o3)

With `rug=True`, seaborn adds small dashes along the axis, one per data point, so the underlying distribution can be better understood.

In [None]:
# O3 distribution in 2012 versus all other years, drawn as filled KDE curves.
is_2012 = pollution['year'] == 2012

# `shade=` is deprecated in sns.kdeplot; `fill=` is the replacement.
sns.kdeplot(x=pollution.loc[is_2012, 'O3'],
            fill=True,
            label='2012')

sns.kdeplot(x=pollution.loc[~is_2012, 'O3'],
            fill=True,
            label='other years')

# The labels only appear once a legend is drawn explicitly.
plt.legend()
plt.show()

In [None]:
# O3 distribution at Vandenberg Air Force Base versus every other city.
# sns.distplot is deprecated; kdeplot (+ rugplot for the rug) replaces it.
is_vandenberg = pollution['city'] == 'Vandenberg Air Force Base'
vandenberg_o3 = pollution.loc[is_vandenberg, 'O3']
other_o3 = pollution.loc[~is_vandenberg, 'O3']

# Highlighted group: blue KDE curve plus a rug of the individual readings.
sns.kdeplot(x=vandenberg_o3, color='steelblue', label='Vandenberg')
sns.rugplot(x=vandenberg_o3, color='steelblue')

# Reference group: gray KDE curve only.
sns.kdeplot(x=other_o3, color='gray', label='Other cities')

# The labels only appear once a legend is drawn explicitly.
plt.legend()
plt.show()

# The Beeswarm plot

A beeswarm plot is a better way to compare the distributions of many groups than a kernel density plot.


In [None]:
# Restrict the data to March observations.
pollution_mar = pollution.query('month == 3')

# Beeswarm of O3 (x axis) for each city (y axis); small markers limit crowding.
sns.swarmplot(data=pollution_mar, x='O3', y="city", size=3)

plt.title('March Ozone levels by city')
plt.show()

In [None]:
# Number of rows available for each city.
pollution['city'].value_counts()

In [None]:
# Apply the seaborn theme with a larger default figure size.
sns.set_theme(rc={'figure.figsize': (13.7, 8.27)})

# Cities to compare (this list is reused by a later cell).
cities = [
    'Houston',
    'Vandenberg Air Force Base',
    'Denver',
    'Long Beach',
    'Indianapolis',
    'Cincinnati',
    'Des Moines',
]

# Beeswarm of 2012 O3 readings, one swarm (and colour) per selected city.
in_selected_city = pollution['city'].isin(cities)
in_2012 = pollution['year'] == 2012
sns.swarmplot(data=pollution[in_selected_city & in_2012], x='city', y='O3', hue='city')

# Annotations 

Annotations are a great way to add analysis to graphs, but they are time-consuming.


In [None]:
# Scatter of Houston NO2 vs SO2 with a plain text label at data coordinates (13, 33).
ax = sns.scatterplot(data=houston_pollution, x='NO2', y='SO2')

label_font = {'ha': 'left', 'size': 'x-large'}
ax.text(13, 33, 'Outlier', fontdict=label_font)

In [None]:
# Same scatter, this time with an arrow from the text at (60, 22) to the point at (45.5, 11.8).
ax = sns.scatterplot(data=houston_pollution, x='NO2', y='SO2')

arrow_style = {'facecolor': 'gray', 'width': 3}
ax.annotate('A buried point to look at',
            xy=(45.5, 11.8),
            xytext=(60, 22),
            arrowprops=arrow_style,
            backgroundcolor='white')
                

In [None]:
# Scatter of CO vs SO2 for August observations only.
pollution_aug = pollution[pollution.month == 8]
sns.scatterplot(data=pollution_aug, x='CO', y='SO2')

# Text label placed at data coordinates (0.57, 41), left-aligned, large font.
label_font = {'ha': 'left', 'size': 'large'}
plt.text(0.57, 41,
         'Cincinnati had highest observed\nSO2 value on Aug 11, 2013',
         fontdict=label_font)
plt.show()

In [None]:
# January 2012 readings, and within them the Long Beach row(s) for January 1st.
jan_pollution = pollution.query("(month  ==  1) & (year  ==  2012)")
lb_newyears = jan_pollution.query("(day  ==  1) & (city  ==  'Long Beach')")

sns.scatterplot(x='CO', y='NO2',
                data=jan_pollution)

# annotate() expects plain numbers for `xy`. Passing one-element Series only
# works through implicit float(Series) conversion, which pandas has deprecated,
# so take the scalar values of the first matching row explicitly.
lb_xy = (lb_newyears['CO'].iloc[0], lb_newyears['NO2'].iloc[0])

# Arrow points at the Long Beach New Year's reading; the text sits at (2, 15).
plt.annotate('Long Beach New Years',
             xy=lb_xy,
             xytext=(2, 15),
             # Shrink the arrow slightly so it does not cover the point
             arrowprops={'facecolor': 'gray', 'width': 3, 'shrink': 0.03},
             backgroundcolor='white')
plt.show()

# Colors in visualizations

Colors are very useful but have to be used carefully

Colors can affect the perception of size. Boxing the shapes in black mitigates this.

In [None]:
# All cities in one scatter of CO vs NO2, coloured by city: deliberately hard to read.
sns.scatterplot(data=pollution, x='CO', y='NO2', hue='city', alpha=0.2)
plt.show()

In [None]:
# One panel per city, three panels per row.
g = sns.FacetGrid(pollution, col='city', col_wrap=3)

# Draw the CO vs NO2 scatter inside every panel.
g.map(sns.scatterplot, 'CO', 'NO2', alpha=0.2)
plt.show()

In [None]:
# Mean CO per city as horizontal bars, one colour per city.
# errorbar=None switches the error bars off. The previous
# errorbar=('ci', False) did not do that: False is read as a confidence level
# of 0, so seaborn still bootstraps and draws a degenerate zero-width interval.
sns.barplot(data=pollution,
            y='city',
            x='CO',
            hue='city',
            estimator=np.mean,
            errorbar=None)
plt.show()

In [None]:
# Same mean-CO bars, outlined in black so that bar colour does not bias the
# perceived size.
# errorbar=None switches the error bars off. The previous
# errorbar=('ci', False) did not do that: False is read as a confidence level
# of 0, so seaborn still bootstraps and draws a degenerate zero-width interval.
sns.barplot(data=pollution,
            y='city',
            x='CO',
            hue='city',
            estimator=np.mean,
            errorbar=None,
            edgecolor='black')
plt.show()

# Continuous color palettes 

Seaborn offers the *sns.palplot()* function to preview palettes.

Color is less precise when we use it to convey values. Sizes perform better.

Keep in mind color blindness. Palettes relying on intensity are safer.

When the data you are representing have a natural center point, diverging palettes are preferred.

When using light palettes, beware of the null color: if the background is white and there are datapoints displayed in very light colors, they could go unnoticed

In [None]:
# Light-to-steelblue sequential palette, previewed as a strip of swatches.
blue_scale = sns.light_palette(color='steelblue')
sns.palplot(pal=blue_scale)

In [None]:
# Light-to-cadetblue sequential palette, previewed as a strip of swatches.
cadetblue_scale = sns.light_palette(color='cadetblue')
sns.palplot(pal=cadetblue_scale)

In [None]:
# Light-to-orangered sequential palette, previewed as a strip of swatches.
scale = sns.light_palette(color='orangered')
sns.palplot(pal=scale)

In [None]:
# Preview the light variant of a seagreen sequential palette.
light_seagreen = sns.light_palette(color='seagreen')
sns.palplot(pal=light_seagreen)

In [None]:
# Preview the dark variant of a seagreen sequential palette.
dark_seagreen = sns.dark_palette(color='seagreen')
sns.palplot(pal=dark_seagreen)

In [None]:
# Cincinnati observations from 2014.
cinci_2014 = pollution.query("city  ==  'Cincinnati' & year  ==  2014")

# Continuous light-to-orangered colormap for encoding O3.
color_palette = sns.light_palette('orangered', as_cmap=True)

# CO vs NO2, with point colour driven by the O3 value through the colormap.
sns.scatterplot(data=cinci_2014,
                x='CO',
                y='NO2',
                hue='O3',
                palette=color_palette)
plt.show()

In [None]:
# November 2015 CO readings for the selected cities, reshaped to a
# city (rows) x day (columns) table.
in_nov_2015 = (pollution.month == 11) & (pollution.year == 2015)
in_selected_city = pollution.city.isin(cities)
nov_2015_long = pollution.loc[in_nov_2015 & in_selected_city, ['city', 'CO', 'day']]

# NOTE(review): the "- 1" presumably recentres CO around a reference value of 1
# so that the diverging palette has a meaningful midpoint -- confirm.
nov_2015_CO = nov_2015_long.pivot(index='city', columns='day', values='CO') - 1

# Diverging colormap between hues 250 and 0.
color_palette = sns.diverging_palette(250, 0, as_cmap=True)

# Heatmap centred on 0 with the colour range fixed to [-2, 2].
sns.heatmap(nov_2015_CO,
            cmap=color_palette,
            center=0,
            vmin=-2,
            vmax=2)
plt.yticks(rotation=0)
plt.show()

In [None]:
# Show the names of the matplotlib style sheets that plt.style.use() accepts.
plt.style.available

In [None]:
# Switch matplotlib to a dark figure background (this stays in effect until
# another style is selected).
plt.style.use("dark_background")

# Diverging colormap whose midpoint is dark, to suit the dark background.
color_palette = sns.diverging_palette(250, 0,
                                      center='dark',
                                      as_cmap=True)

# Heatmap of the November 2015 CO table built earlier, centred on 0.
# The original passed `nov`, a name that is never defined in this notebook.
sns.heatmap(nov_2015_CO,
            cmap=color_palette,
            center=0)
plt.yticks(rotation=0)
plt.show()

# Categorical Palettes 

Don't put more than 10 categories/colors on a single graph.

Keep color blindness in mind.

A good way to reduce the number of colors on a graph is to add an 'Other' category and group the less interesting categories into it.

# Ordinal Data

Seaborn has many nice built-in palettes for ordinal data.

In [None]:
# Switch to the "ggplot" matplotlib style for the plots drawn from here on.
plt.style.use("ggplot")

colorbrewer_palettes = ['Reds', 'Blues', 'YlOrBr', 'PuBuGn', 'GnBu', 'Greys']

# Preview each palette, using one more colour than the previous (4, 5, 6, ...).
for n_swatches, pal in enumerate(colorbrewer_palettes, start=4):
    swatches = sns.color_palette(pal, n_colors=n_swatches)
    sns.palplot(pal=swatches)

In [None]:
# Bin NO2 into three equal-sized groups (integer codes 0, 1, 2).
# NOTE(review): 'Tertial' is presumably meant to be 'Tertile'; the column name
# is left unchanged here because it is also the legend title.
pollution['NO2 Tertial'] = pd.qcut(pollution['NO2'], q=3, labels=False)

# CO vs SO2 for Long Beach in 2014, coloured by the NO2 group.
long_beach_2014 = pollution[(pollution.city == 'Long Beach') & (pollution.year == 2014)]
sns.scatterplot(data=long_beach_2014,
                x='CO',
                y='SO2',
                hue='NO2 Tertial',
                palette='OrRd')

In [None]:
# Observations from January 2013.
pollution_jan13 = pollution.query('year  ==  2013 & month  ==  1')

# Daily CO, one thick line per city, coloured with the "Set2" palette.
sns.lineplot(data=pollution_jan13,
             x="day",
             y="CO",
             hue="city",
             palette="Set2",
             linewidth=3)
plt.show()

In [None]:
# Bin CO into four equal-sized groups (integer codes 0-3), computed over all cities.
pollution['CO quartile'] = pd.qcut(pollution['CO'], q=4, labels=False)

# Des Moines rows only (taken after the quartile column has been added).
des_moines = pollution.query("city  ==  'Des Moines'")

# SO2 vs NO2, coloured by CO quartile with the 'GnBu' palette.
sns.scatterplot(data=des_moines,
                x='SO2',
                y='NO2',
                hue='CO quartile',
                palette='GnBu')
plt.show()

# Dealing with Uncertainty

We have to care about uncertainty whenever what we are displaying is an estimate (the average of a subset, linear model coefficients...). Values that are facts, like counts or summaries of an entire population, don't carry any uncertainty.

Showing uncertainty helps convey how much confidence an estimate deserves, and it's necessary for decision making.

On a seaborn plot, confidence intervals can be displayed as horizontal lines (matplotlib's `plt.hlines()`). When there are many confidence intervals to display through time, a confidence band (`plt.fill_between()`) makes more sense. Don't display many confidence bands on the same graph: if you want to compare confidence bands, don't show more than two, and use transparency (`alpha`) so they remain distinguishable where they overlap.





In [None]:
# Libraries used by this example
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulated data: 10 noisy observations of y = 2.5 * x at each x in 0, 0.5, ..., 9.5
np.random.seed(10)
x = np.repeat(np.arange(0, 10, 0.5), 10)
noise = np.random.normal(0, 1, len(x))
y = 2.5 * x + noise
df = pd.DataFrame({"x": x, "y": y})

# Per-x summary statistics
grouped = df.groupby('x')['y'].agg(['mean', 'std', 'count'])

# Half-width of the 95% interval under a normal approximation: z * std / sqrt(n)
z_value = 1.96
standard_error = grouped['std'] / np.sqrt(grouped['count'])
grouped['ci'] = z_value * standard_error

lower_bound = grouped['mean'] - grouped['ci']
upper_bound = grouped['mean'] + grouped['ci']

# Mean line with the shaded band around it
plt.figure(figsize=(10, 6))
plt.plot(grouped.index, grouped['mean'], label="Mean", color="blue")
plt.fill_between(grouped.index, lower_bound, upper_bound,
                 color='blue', alpha=0.3, label='95% Confidence Interval')

# Title, axis labels and legend
plt.title('Confidence Bands using fill_between Method', fontsize=16)
plt.xlabel('X', fontsize=14)
plt.ylabel('Y', fontsize=14)
plt.legend()

plt.show()

Sometimes we want to go beyond that 95% CI and display more bands

In [None]:
sns.set_style('whitegrid')

# Libraries used by this example
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Simulated data: 10 observations per x in 0, 0.1, ..., 9.9, with symmetric
# noise plus a squared (always non-negative) noise term.
np.random.seed(10)
x = np.repeat(np.arange(0, 10, 0.1), 10)
symmetric_noise = np.random.normal(0, 1, len(x))
squared_noise = np.random.normal(0, 2, len(x))**2
y = 2.5 * x + symmetric_noise + squared_noise
df = pd.DataFrame({"x": x, "y": y})

# Per-x summary statistics
grouped = df.groupby('x')['y'].agg(['mean', 'std', 'count'])

# z-scores for each confidence level
z_values = {
    "90%": 1.645,
    "95%": 1.96,
    "99%": 2.576
}

# Half-width of each interval: z * std / sqrt(n)
standard_error = grouped['std'] / np.sqrt(grouped['count'])
for level, z in z_values.items():
    grouped[f'ci_{level}'] = z * standard_error

# Mean line
plt.figure(figsize=(10, 6))
plt.plot(grouped.index, grouped['mean'], label="Mean", color="blue")

# One shaded band per level, drawn from narrowest to widest
band_styles = {
    "90%": ('red', 0.35),
    "95%": ('orange', 0.3),
    "99%": ('yellow', 0.3),
}
for level, (band_color, band_alpha) in band_styles.items():
    half_width = grouped[f'ci_{level}']
    plt.fill_between(grouped.index,
                     grouped['mean'] - half_width,
                     grouped['mean'] + half_width,
                     color=band_color, alpha=band_alpha,
                     label=f'{level} Confidence Interval')

# Title, axis labels and legend
plt.title('90%, 95%, and 99% Confidence Bands', fontsize=16)
plt.xlabel('X', fontsize=14)
plt.ylabel('Y', fontsize=14)
plt.legend()

plt.show()

In [None]:
# TODO: this section of the notebook is unfinished.