# Visualization Curriculum

## Chapter 1: Introduction to Altair

---
* Author:  [Yuttapong Mahasittiwat](mailto:khala1391@gmail.com)
* Technologist | Data Modeler | Data Analyst
* [YouTube](https://www.youtube.com/khala1391)
* [LinkedIn](https://www.linkedin.com/in/yuttapong-m/)
---

Source: [Visualization Curriculum](https://idl.uw.edu/visualization-curriculum/altair_introduction.html)

In [3]:
from pathlib import Path

import altair as alt
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Report the version of every library imported above so the recorded
# outputs can be matched to a concrete environment.
for label, module in (
    ("pandas version :", pd),
    ("numpy version :", np),
    ("matplotlib version :", mpl),
    ("seaborn version :", sns),
    ("altair version :", alt),
):
    print(label, module.__version__)

pandas version : 2.2.1
numpy version : 1.26.4
matplotlib version : 3.8.4
seaborn version : 0.13.2
altair version : 5.4.0


In [4]:
import warnings
# Hide the FutureWarning about the deprecated `convert_dtype` parameter so it
# does not clutter the cell outputs below.
# NOTE(review): presumably raised by pandas when called from a dependency — confirm the source.
warnings.filterwarnings('ignore', category=FutureWarning, message="the convert_dtype parameter is deprecated")

In [13]:
from vega_datasets import data

In [17]:
# Show the names of the example datasets exposed by vega_datasets.
# (In IPython, the wildcard search `data.*?` lists the matching attribute names instead.)
data.list_datasets()

['7zip',
 'airports',
 'annual-precip',
 'anscombe',
 'barley',
 'birdstrikes',
 'budget',
 'budgets',
 'burtin',
 'cars',
 'climate',
 'co2-concentration',
 'countries',
 'crimea',
 'disasters',
 'driving',
 'earthquakes',
 'ffox',
 'flare',
 'flare-dependencies',
 'flights-10k',
 'flights-200k',
 'flights-20k',
 'flights-2k',
 'flights-3m',
 'flights-5k',
 'flights-airport',
 'gapminder',
 'gapminder-health-income',
 'gimp',
 'github',
 'graticule',
 'income',
 'iowa-electricity',
 'iris',
 'jobs',
 'la-riots',
 'londonBoroughs',
 'londonCentroids',
 'londonTubeLines',
 'lookup_groups',
 'lookup_people',
 'miserables',
 'monarchs',
 'movies',
 'normal-2d',
 'obesity',
 'ohlc',
 'points',
 'population',
 'population_engineers_hurricanes',
 'seattle-temps',
 'seattle-weather',
 'sf-temps',
 'sp500',
 'stocks',
 'udistrict',
 'unemployment',
 'unemployment-across-industries',
 'uniform-2d',
 'us-10m',
 'us-employment',
 'us-state-capitals',
 'volcano',
 'weather',
 'weball26',
 'wheat',

In [21]:
# Load the cars dataset as a pandas DataFrame and summarize its columns,
# non-null counts and dtypes.
cars = data.cars()
cars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Name              406 non-null    object        
 1   Miles_per_Gallon  398 non-null    float64       
 2   Cylinders         406 non-null    int64         
 3   Displacement      406 non-null    float64       
 4   Horsepower        400 non-null    float64       
 5   Weight_in_lbs     406 non-null    int64         
 6   Acceleration      406 non-null    float64       
 7   Year              406 non-null    datetime64[ns]
 8   Origin            406 non-null    object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(2)
memory usage: 28.7+ KB


In [25]:
# IPython wildcard search: lists the attributes of `data` that match the
# pattern, as an alternative to calling data.list_datasets().
# NOTE: the trailing `?` is IPython-only syntax, not valid Python; this is an
# exploration aid and the cell cannot run outside IPython/Jupyter.
data.*?

data.7zip
data.__call__
data.__class__
data.__delattr__
data.__dict__
data.__dir__
data.__doc__
data.__eq__
data.__format__
data.__ge__
data.__getattr__
data.__getattribute__
data.__getstate__
data.__gt__
data.__hash__
data.__init__
data.__init_subclass__
data.__le__
data.__lt__
data.__module__
data.__ne__
data.__new__
data.__reduce__
data.__reduce_ex__
data.__repr__
data.__setattr__
data.__sizeof__
data.__str__
data.__subclasshook__
data.__weakref__
data.airports
data.annual_precip
data.anscombe
data.barley
data.birdstrikes
data.budget
data.budgets
data.burtin
data.cars
data.climate
data.co2_concentration
data.countries
data.crimea
data.disasters
data.driving
data.earthquakes
data.ffox
data.flare
data.flare_dependencies
data.flights_10k
data.flights_200k
data.flights_20k
data.flights_2k
data.flights_3m
data.flights_5k
data.flights_airport
data.gapminder
data.gapminder_health_income
data.gimp
data.github
data.graticule
data.income
data.iowa_electricity
data.iris
data.jobs
data.la_riots

In [29]:
# Read the cars data directly from its source URL instead of going through the
# vega_datasets loader. Loaded this way, `Year` stays a plain object column
# (the loader used earlier returns it as datetime64), which is why explicit
# type annotations matter when a chart is fed from a URL.
# The URL is named once and reused; previously the bare `data.cars.url`
# expression was evaluated and silently discarded.
cars_url = data.cars.url
pd.read_json(cars_url).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406 entries, 0 to 405
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Name              406 non-null    object 
 1   Miles_per_Gallon  398 non-null    float64
 2   Cylinders         406 non-null    int64  
 3   Displacement      406 non-null    float64
 4   Horsepower        400 non-null    float64
 5   Weight_in_lbs     406 non-null    int64  
 6   Acceleration      406 non-null    float64
 7   Year              406 non-null    object 
 8   Origin            406 non-null    object 
dtypes: float64(4), int64(2), object(3)
memory usage: 28.7+ KB


### Single view

In [36]:
# Small tidy table of precipitation: one row per (city, month) pair,
# grouped by city with the months in the same order inside each group.
city_names = ['Seattle', 'New York', 'Chicago']
month_names = ['Apr', 'Aug', 'Dec']
precip_values = [2.68, 0.87, 5.31, 3.94, 4.13, 3.58, 3.62, 3.98, 2.56]

df = pd.DataFrame({
    # repeat every city once per month
    'city': [city for city in city_names for _month in month_names],
    # cycle through the months once per city
    'month': month_names * len(city_names),
    'precip': precip_values,
})

df

Unnamed: 0,city,month,precip
0,Seattle,Apr,2.68
1,Seattle,Aug,0.87
2,Seattle,Dec,5.31
3,New York,Apr,3.94
4,New York,Aug,4.13
5,New York,Dec,3.58
6,Chicago,Apr,3.62
7,Chicago,Aug,3.98
8,Chicago,Dec,2.56


In [40]:
# Create a chart object bound to the DataFrame; no mark or encoding is set yet.
chart = alt.Chart(df)

In [44]:
# Choose a point mark for the data; no encoding channels are specified yet.
alt.Chart(df).mark_point()

In [46]:
# Map the `city` column to the y channel only.
alt.Chart(df).mark_point().encode(
    y='city'
)

In [48]:
# Map `city` to y and additionally `precip` to x.
alt.Chart(df).mark_point().encode(
    y='city',
    x='precip'
)

In [57]:
# What happens if the data type is changed? The `:N` suffix forces `precip`
# to be treated as nominal (categorical) on the x channel.
alt.Chart(df).mark_point().encode(
    y='city',
    x='precip:N'
)

- Explicit annotation of data types is necessary when data is loaded from an external URL directly
  - `b:N` indicates a nominal type (unordered, categorical data)
  - `b:O` indicates an ordinal type (rank-ordered data)
  - `b:Q` indicates a quantitative type (numerical data with meaningful magnitudes)
  - `b:T` indicates a temporal type (date/time data)

📚 Q: How can the month names be given an explicit order so that they can be encoded as an ordinal type?

In [89]:
# Bar chart of the average precipitation (x) for each city (y);
# the aggregation is requested through the 'average(...)' shorthand.
alt.Chart(df).mark_bar().encode(
    x= 'average(precip)',
    y= 'city',
)

**Full list**: [aggregation functions](https://altair-viz.github.io/user_guide/encodings/index.html#aggregation-functions)

In [91]:
# Same aggregation with the channels swapped: city on x, average precipitation on y.
alt.Chart(df).mark_bar().encode(
    y= 'average(precip)',
    x= 'city',
)

In [98]:
# Build each channel definition first, then hand both to encode().
# x: precip on a logarithmic scale, with the axis titled 'Log Value'.
precip_log_x = alt.X(
    'precip',
    scale=alt.Scale(type='log'),
    axis=alt.Axis(title='Log Value'),
)
# y: city, with the axis titled 'Category'.
city_y = alt.Y('city', axis=alt.Axis(title='Category'))

alt.Chart(df).mark_point().encode(precip_log_x, city_y)

### Multiple views

In [108]:
# List the column names of the cars DataFrame.
cars.columns

Index(['Name', 'Miles_per_Gallon', 'Cylinders', 'Displacement', 'Horsepower',
       'Weight_in_lbs', 'Acceleration', 'Year', 'Origin'],
      dtype='object')

In [140]:
# Preview the first two rows of the cars DataFrame.
cars.head(2)

Unnamed: 0,Name,Miles_per_Gallon,Cylinders,Displacement,Horsepower,Weight_in_lbs,Acceleration,Year,Origin
0,chevrolet chevelle malibu,18.0,8,307.0,130.0,3504,12.0,1970-01-01,USA
1,buick skylark 320,15.0,8,350.0,165.0,3693,11.5,1970-01-01,USA


In [118]:
# Line chart of the average miles per gallon over the years,
# with one colored line per origin.
year_x = alt.X('Year')
avg_mpg_y = alt.Y('average(Miles_per_Gallon)')
origin_color = alt.Color('Origin')

alt.Chart(cars).mark_line().encode(year_x, avg_mpg_y, origin_color)

#### layering

In [142]:
# Line layer: average miles per gallon per year, colored by origin.
line = alt.Chart(cars).mark_line().encode(
    alt.X('Year'),
    alt.Y('average(Miles_per_Gallon)'),
    color='Origin'
)
# Circle layer built with the same x, y and color encodings as the line.
point = alt.Chart(cars).mark_circle().encode(
    alt.X('Year'),
    alt.Y('average(Miles_per_Gallon)'),
    color='Origin',
    # optional extra channel, currently disabled: size='Horsepower'
)

# The `+` operator layers the two charts in a single view.
line + point

In [144]:
# Define the line chart once and keep it in `mpg`.
mpg = alt.Chart(cars).mark_line().encode(
    alt.X('Year'),
    alt.Y('average(Miles_per_Gallon)'),
    color='Origin'
)

# Layer the line with a circle-mark version derived from the same chart,
# so the encode() call does not have to be written twice.
mpg + mpg.mark_circle()

In [170]:
# Same line chart built as a single mark: `point=True` is passed to mark_line()
# instead of layering a separate circle chart.
# `mpg` is redefined here and is reused by a later cell.
mpg = alt.Chart(cars).mark_line(point=True).encode(
    alt.X('Year'),
    alt.Y('average(Miles_per_Gallon)'),
    color='Origin'
)
mpg

#### concat chart

In [172]:
# Average horsepower per year by origin, in the same style as `mpg`.
hp = alt.Chart(cars).mark_line(point=True).encode(
    alt.X('Year'),
    alt.Y('average(Horsepower)'),
    color='Origin'
)
# Concatenate the two charts with the `|` operator.
# NOTE: relies on `mpg` defined in an earlier cell.
hp | mpg

### Interactive

In [179]:
# Scatterplot of horsepower against mileage, colored by origin;
# .interactive() makes the resulting chart interactive.
alt.Chart(cars).mark_point().encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
).interactive()

#### tooltip

In [182]:
# Same interactive scatterplot, with the `Name` and `Origin` fields
# added to the tooltip channel.
alt.Chart(cars).mark_point().encode(
    x='Horsepower',
    y='Miles_per_Gallon',
    color='Origin',
    tooltip=['Name','Origin']
).interactive()

In [241]:
# Linked views: an interval brush on the overview histogram drives the
# opacity of the marks in both the histogram and the detail scatterplot.

# create an interval selection over an x-axis encoding
brush = alt.selection_interval(encodings=['x'])

# determine opacity based on brush
opacity = alt.condition(brush, alt.value(0.9), alt.value(0.1))

# an overview histogram of cars per year
# add the interval brush to select cars over time
overview = alt.Chart(cars).mark_bar().encode(
    alt.X('Year:O', timeUnit='year', # extract year unit, treat as ordinal
      axis=alt.Axis(title=None, labelAngle=0) # no title, no label angle
    ),
    alt.Y('count()', title=None), # counts, no axis title
    opacity=opacity
).add_params(
    brush      # add interval brush selection to the chart
).properties(
    width=400, # set the chart width to 400 pixels
    height=50  # set the chart height to 50 pixels
)

# a detail scatterplot of horsepower vs. mileage
# modulate point opacity based on the brush selection
detail = alt.Chart(cars).mark_point().encode(
    alt.X('Horsepower'),
    alt.Y('Miles_per_Gallon'),
    # set opacity based on brush selection
    opacity=opacity
).properties(width=400) # set chart width to match the first chart

# vertically concatenate (vconcat) charts using the '&' operator
chart = overview & detail

# Export an HTML copy. Create the target folder first so the save does not
# fail with FileNotFoundError when the folder does not exist yet.
html_path = Path('VScodeProject/consol_chart.html')
html_path.parent.mkdir(parents=True, exist_ok=True)
chart.save(str(html_path))

# display the combined chart (built once and reused) as the cell output
chart


In [None]:
# Combine the two views again and write the result to 'test.json'.
# NOTE(review): Altair presumably picks the output format from the file
# extension, so this should contain the chart's JSON specification — confirm.
consol = overview & detail
consol.save('test.json')

#### Breakdown

In [191]:
# Breakdown, step 1: the plain histogram of cars per year, before any
# selection, opacity or sizing is attached.
year_x = alt.X(
    'Year:O',
    timeUnit='year',  # extract year unit, treat as ordinal
    axis=alt.Axis(title=None, labelAngle=0),  # no title, no label angle
)
car_count_y = alt.Y('count()', title=None)  # counts, no axis title

overview = alt.Chart(cars).mark_bar().encode(year_x, car_count_y)
overview

In [195]:
# Breakdown, step 2: the same histogram with the interval brush attached.

# interval selection restricted to the x encoding
brush = alt.selection_interval(encodings=['x'])

# opacity is 0.9 where the brush condition holds and 0.1 elsewhere
opacity = alt.condition(brush, alt.value(0.9), alt.value(0.1))

# channel definitions for the cars-per-year histogram
year_x = alt.X(
    'Year:O',
    timeUnit='year',  # extract year unit, treat as ordinal
    axis=alt.Axis(title=None, labelAngle=0),  # no title, no label angle
)
car_count_y = alt.Y('count()', title=None)  # counts, no axis title

histogram = alt.Chart(cars).mark_bar().encode(
    year_x,
    car_count_y,
    opacity=opacity,
)

# attach the brush, then size the chart to 400 x 50 pixels
overview = histogram.add_params(brush).properties(width=400, height=50)

overview

### JSON output

In [230]:
# Bar chart of the average precipitation per city, used to inspect the
# JSON specification that Altair generates.
chart = alt.Chart(df).mark_bar().encode(
    x='average(precip)',
    y='city',
)

# Serialize once and reuse the string for both the printout and `json_save`
# (previously to_json() was called twice on the same chart).
json_save = chart.to_json()
print(json_save)

{
  "$schema": "https://vega.github.io/schema/vega-lite/v5.20.1.json",
  "config": {
    "view": {
      "continuousHeight": 300,
      "continuousWidth": 300
    }
  },
  "data": {
    "name": "data-8e72c2f67818e64f2c6d729f1a903405"
  },
  "datasets": {
    "data-8e72c2f67818e64f2c6d729f1a903405": [
      {
        "city": "Seattle",
        "month": "Apr",
        "precip": 2.68
      },
      {
        "city": "Seattle",
        "month": "Aug",
        "precip": 0.87
      },
      {
        "city": "Seattle",
        "month": "Dec",
        "precip": 5.31
      },
      {
        "city": "New York",
        "month": "Apr",
        "precip": 3.94
      },
      {
        "city": "New York",
        "month": "Aug",
        "precip": 4.13
      },
      {
        "city": "New York",
        "month": "Dec",
        "precip": 3.58
      },
      {
        "city": "Chicago",
        "month": "Apr",
        "precip": 3.62
      },
      {
        "city": "Chicago",
        "month": "Aug",

In [200]:
# Shorthand form: aggregate, field and type are packed into a single string.
x = alt.X('average(precip):Q')
print(x.to_json())

{
  "aggregate": "average",
  "field": "precip",
  "type": "quantitative"
}


In [202]:
# Full-length form: the same channel definition spelled out with explicit
# keyword arguments for aggregate, field and type.
x = alt.X(aggregate='average', field='precip', type='quantitative')
print(x.to_json())

{
  "aggregate": "average",
  "field": "precip",
  "type": "quantitative"
}


### Publish Visualization

In [233]:
# Rebuild the average-precipitation bar chart and write it to 'testchart.html'
# (path is relative to the current working directory).
chart = alt.Chart(df).mark_bar().encode(
    x='average(precip)',
    y='city',
)
chart.save('testchart.html')