# Altair Tutorial

## Setup

Altair can be installed from the command line with `pip install altair`, or from a notebook cell by running:

`%pip install altair`

In [1]:
import altair as alt

In [2]:
import pandas as pd

## Reading in the Data

In [3]:
# Read the songs dataset from "songs.csv" and show its first five rows.
df_2000 = pd.read_csv(filepath_or_buffer="songs.csv")
df_2000.head(n=5)

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,Britney Spears,Oops!...I Did It Again,211160,False,2000,77,0.751,0.834,1,-5.444,0,0.0437,0.3,1.8e-05,0.355,0.894,95.053,pop
1,blink-182,All The Small Things,167066,False,1999,79,0.434,0.897,0,-4.918,1,0.0488,0.0103,0.0,0.612,0.684,148.726,"rock, pop"
2,Faith Hill,Breathe,250546,False,1999,66,0.529,0.496,7,-9.007,1,0.029,0.173,0.0,0.251,0.278,136.859,"pop, country"
3,Bon Jovi,It's My Life,224493,False,2000,78,0.551,0.913,0,-4.063,0,0.0466,0.0263,1.3e-05,0.347,0.544,119.992,"rock, metal"
4,*NSYNC,Bye Bye Bye,200560,False,2000,65,0.614,0.928,8,-4.806,0,0.0516,0.0408,0.00104,0.0845,0.879,172.656,pop


In [4]:
# Preview the songs whose danceability is above 0.95.
df_2000[df_2000["danceability"].gt(0.95)].head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
211,Christina Milian,When You Look At Me - Radio Edit,222546,False,2001,60,0.955,0.839,3,-3.399,0,0.127,0.0934,0.000175,0.099,0.825,108.955,"hip hop, pop, R&B"
225,Missy Elliott,4 My People (feat. Eve),289373,True,2001,49,0.969,0.701,1,-7.503,1,0.156,0.14,0.00161,0.201,0.905,121.392,"hip hop, pop, R&B"
237,Nelly,Hot In Herre,228240,True,2002,75,0.956,0.745,11,-4.753,0,0.12,0.206,0.0,0.0615,0.912,107.075,"hip hop, pop, R&B"
425,Kelis,Trick Me,206106,True,2003,63,0.97,0.72,1,-3.347,0,0.149,0.0369,0.000389,0.326,0.962,107.17,"pop, R&B, Dance/Electronic"
533,Sugababes,Push The Button,218093,False,2005,64,0.962,0.66,8,-5.096,1,0.061,0.0485,0.0,0.076,0.814,126.016,"pop, R&B"


In [5]:
# Two alternative ways to derive a single-genre column 'genre1' from the
# comma-separated 'genre' string. Both are left disabled in this cell.

# Option 1: keep only the first genre listed for each song.
# df_2000["genre1"] = df_2000["genre"].str.split(',', expand=True)[0]

# Option 2: one row per (song, genre) pair, with the single genre stored in 'genre1'.
# NOTE(review): 'genre' holds strings such as "rock, pop", and DataFrame.explode
# leaves scalar values unchanged, so as written this would presumably not
# duplicate any rows; the string likely needs splitting into a list first
# (e.g. .str.split(", ")) -- TODO confirm before enabling.
# df_2000.assign(genre1=df_2000["genre"]).explode("genre1")

## Charts & Marks

In [None]:
# Bind the data to a chart and choose a mark type; no encodings are set yet.
alt.Chart(df_2000).mark_circle()
# Other mark types to try in place of mark_circle():
# alt.Chart(df_2000).mark_point()
# alt.Chart(df_2000).mark_bar()
# alt.Chart(df_2000).mark_rect()

In [7]:
# Scatter plot of the full dataset: danceability on x, energy on y.
alt.Chart(df_2000).mark_circle().encode(
    alt.X("danceability"),
    alt.Y("energy"),
)

In [8]:
# Keep only the tracks by Kendrick Lamar, Drake, or J. Cole, then preview the subset.
df_big3 = df_2000.loc[
    df_2000["artist"].isin(["Kendrick Lamar", "Drake", "J. Cole"])
]
df_big3.head()

Unnamed: 0,artist,song,duration_ms,explicit,year,popularity,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
954,Drake,Forever,357706,True,2009,73,0.457,0.906,5,-2.278,0,0.342,0.249,0.0,0.182,0.54,104.02,"hip hop, pop, R&B"
986,Drake,Best I Ever Had,258760,True,2010,54,0.431,0.894,5,-2.673,0,0.33,0.0951,0.0,0.188,0.605,162.161,"hip hop, pop, R&B"
1071,Drake,Over,233560,True,2010,57,0.325,0.848,7,-5.611,1,0.279,0.0109,0.0,0.124,0.433,100.093,"hip hop, pop, R&B"
1081,Drake,Find Your Love,208946,False,2010,56,0.625,0.613,6,-6.005,0,0.173,0.0209,0.0,0.0286,0.738,96.033,"hip hop, pop, R&B"
1172,Drake,Headlines,235986,True,2011,74,0.636,0.566,6,-7.16,0,0.106,0.365,0.000353,0.0917,0.425,151.894,"hip hop, pop, R&B"


## Encodings

* Quantitative (Q): a continuous real-valued quantity
* Ordinal (O): a discrete ordered quantity
* Nominal (N): a discrete unordered category
* Temporal (T): a time or date value

In [9]:
# Year is marked ordinal via the ":O" shorthand; energy carries no explicit type.
alt.Chart(df_big3).mark_point().encode(
    alt.X("year:O"),
    alt.Y("energy"),
)

In [10]:
# Same chart, now with human-readable axis titles.
alt.Chart(df_big3).mark_point().encode(
    x=alt.X("year:O").title("Year"),
    y=alt.Y("energy").title("Song energy"),
)

In [11]:
# Add a chart title with a subtitle.
alt.Chart(
    df_big3,
    title=alt.Title(text='Energy of "Big 3" Tracks', subtitle="KL, Drake, and JCole"),
).mark_point().encode(
    x=alt.X("year:O").title("Year"),
    y=alt.Y("energy").title("Song energy"),
)

In [12]:
# Treat year as quantitative and pin the x scale to 2008-2020 (clamped).
alt.Chart(df_big3, title="Energy of Big 3 Tracks").mark_point().encode(
    x=alt.X("year:Q").title("Year").scale(domain=(2008, 2020), clamp=True),
    y=alt.Y("energy").title("Song energy"),
)

In [13]:
# Configure the x axis: request 5 ticks and format the labels with ".0d".
alt.Chart(df_big3, title="Energy of Big 3 Tracks").mark_point().encode(
    x=(
        alt.X("year:Q")
        .title("Year")
        .axis(tickCount=5, format=".0d")
        .scale(domain=(2008, 2020), clamp=True)
    ),
    y=alt.Y("energy").title("Song energy"),
)

In [14]:
# Style the marks themselves: size, colour, and opacity are set on mark_point().
alt.Chart(df_big3, title="Energy of Big 3 Tracks").mark_point(
    opacity=0.3, color="red", size=70
).encode(
    x=(
        alt.X("year:Q")
        .title("Year")
        .axis(tickCount=5, format=".0d")
        .scale(domain=(2008, 2020), clamp=True)
    ),
    y=alt.Y("energy").title("Song energy"),
)

In [15]:
# Colour each point by the 'artist' field instead of using a fixed mark colour.
alt.Chart(df_big3, title="Energy of Big 3 Tracks").mark_point(size=50).encode(
    x=(
        alt.X("year:Q")
        .title("Year")
        .axis(tickCount=5, format=".0d")
        .scale(domain=(2008, 2020), clamp=True)
    ),
    y=alt.Y("energy").title("Song energy"),
    color=alt.Color("artist"),
)

In [16]:
# Add a tooltip encoding on the 'song' field.
alt.Chart(df_big3, title="Energy of Big 3 Tracks").mark_point(size=50).encode(
    x=(
        alt.X("year:Q")
        .title("Year")
        .axis(tickCount=5, format=".0d")
        .scale(domain=(2008, 2020), clamp=True)
    ),
    y=alt.Y("energy").title("Song energy"),
    color=alt.Color("artist"),
    tooltip=alt.Tooltip("song"),
)

In [17]:
# Keep the chart in a variable so it can be written to disk.
big3_energy = (
    alt.Chart(df_big3, title="Energy of Big 3 Tracks")
    .mark_point(size=50)
    .encode(
        x=(
            alt.X("year:Q")
            .title("Year")
            .axis(tickCount=5, format=".0d")
            .scale(domain=(2008, 2020), clamp=True)
        ),
        y=alt.Y("energy").title("Song energy"),
        color=alt.Color("artist"),
    )
)

# Write the chart to "big3.html", passing the SVG renderer as an embed option.
big3_energy.save("big3.html", embed_options={"renderer": "svg"})

## Aggregation Functions

* mean
* median
* count
* distinct
* max/min
* sum

In [18]:
# Derive 'genre1': the first entry of each song's comma-separated 'genre'
# string (e.g. "rock, pop" -> "rock"). The bar charts that follow encode
# "genre1:N", so this column must exist on df_2000 when the notebook is run
# top to bottom on a fresh kernel. Plain column assignment always recomputes
# from 'genre', so re-running this cell gives the same result.
df_2000["genre1"] = df_2000["genre"].str.split(",", expand=True)[0]

In [19]:
# Bar chart counting rows per value of the 'genre1' field of df_2000.
alt.Chart(df_2000, title="Top Song Genres").mark_bar(size=30).encode(
    x=alt.X("genre1:N"),
    y=alt.Y("count()"),
).properties(height=400, width=500)

### Sorting and Customizing Axis Labels

In [20]:
# Bars sorted by descending y value ("-y"), with customised title font and x-axis labels.
alt.Chart(
    df_2000,
    title=alt.Title(text="What?", font="Wingdings"),
).mark_bar(size=30).encode(
    x=(
        alt.X("genre1:N")
        .sort("-y")
        .axis(
            labelAngle=-45,
            labelFont="Garamond",
            labelColor="blue",
            labelFontSize=14,
            title="????",
        )
    ),
    y=alt.Y("mean(danceability)"),
).properties(height=400, width=500)

### Histogram Example

In [21]:
# Bin tempo along x and plot the mean danceability of each bin as a bar.
alt.Chart(df_2000).mark_bar().encode(
    alt.X("tempo:Q").bin(),
    alt.Y("mean(danceability)"),
).properties(height=400, width=500)

### Binning and Multiple Encodings

In [22]:
# Bin both axes; circle size encodes the row count and colour the mean instrumentalness.
alt.Chart(df_2000).mark_circle().encode(
    x=alt.X("tempo:Q").scale(zero=False).bin(),
    y=alt.Y("loudness:Q").bin(),
    size=alt.Size("count()"),
    color=alt.Color("mean(instrumentalness)"),
)

### Modifying Color

In [23]:
# Colour by musical key (as a nominal field) using the "set3" colour scheme.
alt.Chart(df_big3).mark_circle().encode(
    x=alt.X("speechiness"),
    y=alt.Y("acousticness"),
    color=alt.Color("key:N", scale=alt.Scale(scheme="set3")),
    size=alt.Size("danceability"),
)

### Line Charts

In [24]:
# Line (with point markers) of mean song duration per year; the y scale is not forced to include zero.
alt.Chart(df_2000).mark_line(point=True).encode(
    x=alt.X("year:N"),
    y=alt.Y("mean(duration_ms):Q").scale(zero=False),
)