# Data Visualization with Altair

In [1]:
#install altair with pip
!pip install altair



In [2]:
#importing necessary dependencies
import pandas as pd
import altair as alt

In [3]:
#bring in the vega_datasets sample-data loader (`data`) and use it to load
#the iris dataset into the variable `iris` used by the following cells
from vega_datasets import data
iris = data.iris()

In [4]:
#show the first five rows of the loaded iris data
iris.head(n=5)

Unnamed: 0,sepalLength,sepalWidth,petalLength,petalWidth,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
#list the distinct values found in the species column
iris["species"].unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [6]:
#Check the datatype pandas assigned to each column
iris.dtypes

sepalLength    float64
sepalWidth     float64
petalLength    float64
petalWidth     float64
species         object
dtype: object

In [7]:
#flag, per column, whether at least one value is missing
iris.isnull().any(axis=0)

sepalLength    False
sepalWidth     False
petalLength    False
petalWidth     False
species        False
dtype: bool

In [8]:
#count the non-missing values in each column
#(count() reports present values, not missing ones)
iris.count()

sepalLength    150
sepalWidth     150
petalLength    150
petalWidth     150
species        150
dtype: int64

**Performing Visualizations**

In [9]:
#create a bar chart: number of records at each sepal length, colored by species
#(each encoding channel is passed as an explicit alt.X / alt.Y / alt.Color object)
alt.Chart(iris).mark_bar().encode(
    alt.X('sepalLength:Q'),
    alt.Y('count():Q'),
    alt.Color('species')
)

In [10]:
#another way to create the same bar chart: the encoding channels are given
#as keyword shorthand strings instead of alt.X / alt.Y / alt.Color objects
#(the mark must stay mark_bar for the two charts to really be the same)
alt.Chart(iris).mark_bar().encode(
    x='sepalLength:Q',
    y='count():Q',
    color='species',
)

In [11]:
#interactive linked view: an interval selection is dragged over the scatter plot
brush = alt.selection_interval()

#scatter plot of sepal length vs width; the color condition uses species
#where the brush applies and light gray otherwise
points = (
    alt.Chart(iris)
    .mark_point()
    .encode(
        alt.X('sepalLength:Q'),
        alt.Y('sepalWidth:Q'),
        color=alt.condition(brush, 'species:N', alt.value('lightgray')),
    )
    .add_selection(brush)
)

#record count per species, filtered by the brush selection
bars = (
    alt.Chart(iris)
    .mark_bar()
    .encode(
        alt.X('count(species):Q'),
        alt.Y('species:N'),
        alt.Color('species:N'),
    )
    .transform_filter(brush)
)

#combine both charts into one view (scatter plot first, then the bars)
points & bars

**Working on Our Own Data: Titanic Dataset**

In [12]:
#read the titanic dataset from the local CSV file and preview the first five rows
datat = pd.read_csv("titanic.csv")
datat.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [13]:
#flag, per column, whether at least one value is missing
datat.isnull().any(axis=0)

PassengerId    False
Survived       False
Pclass         False
Name           False
Sex            False
Age             True
SibSp          False
Parch          False
Ticket         False
Fare           False
Cabin           True
Embarked        True
dtype: bool

In [14]:
#count the non-missing values in each column to see how much data each one lacks
datat.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [15]:
#remove the Cabin column, the one with the most missing values
datat = datat.drop(columns=['Cabin'])

In [16]:
#Confirm the column was dropped: Cabin should no longer be listed in the counts
datat.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Embarked       889
dtype: int64

In [17]:
#discard every row that still contains at least one missing value
datat = datat.dropna(axis='index')

In [18]:
#confirm the cleanup: every column should now report the same non-missing count
datat.count()

PassengerId    712
Survived       712
Pclass         712
Name           712
Sex            712
Age            712
SibSp          712
Parch          712
Ticket         712
Fare           712
Embarked       712
dtype: int64

In [19]:
#bar chart of the cleaned data: passenger count per age, colored by Survived
age_counts = alt.Chart(datat).mark_bar().encode(
    x='Age:Q',
    y='count():Q',
    color=alt.Color('Survived:N', title='Survived Class'),
)
age_counts.properties(width=1000)

In [20]:
#making the Chart interactive with brush: an interval selection is dragged
#over the scatter plot and the bar chart is filtered by that selection
brush = alt.selection_interval()
#Age vs Fare scatter plot; the color condition uses Survived where the
#brush applies and light gray otherwise
points = alt.Chart(datat).mark_point().encode(
    x='Age',
    y='Fare',
    color=alt.condition(brush, 'Survived:N', alt.value('lightgray'))
).add_selection(
    brush
).properties(height=400,width=1000)
#number of brushed passengers per Survived value; the count is a number, so
#it is encoded as quantitative (:Q) rather than nominal to get a numeric axis
bars = alt.Chart(datat).mark_bar().encode(
    y='Survived:N',
    color='Survived:N',
    x='count(Survived):Q'
).transform_filter(
    brush
)
points & bars

In [21]:
#line chart of Fare against Age from the titanic data, one line per Survived value
alt.Chart(datat).mark_line().encode(
    alt.X('Age:Q'),
    alt.Y('Fare:Q'),
    alt.Color('Survived:N'),
)

**Creating a Stacked Bar Chart with Barley Data**

In [22]:
#load the barley dataset through the vega_datasets loader and preview its first rows
barley=data.barley()
barley.head()

Unnamed: 0,yield,variety,year,site
0,27.0,Manchuria,1931,University Farm
1,48.86667,Manchuria,1931,Waseca
2,27.43334,Manchuria,1931,Morris
3,39.93333,Manchuria,1931,Crookston
4,32.96667,Manchuria,1931,Grand Rapids


In [23]:
#stacked bar chart from the barley data: summed yield per variety,
#with each bar split by site through the color channel
alt.Chart(barley).mark_bar().encode(
    alt.X('sum(yield)'),
    alt.Y('variety'),
    alt.Color('site'),
)

**Saving the visualizations**

In [24]:
#Example: to save the stacked bar chart, first build it again
#and keep it in a variable so it can be written out afterwards
chart1 = (
    alt.Chart(barley)
    .mark_bar()
    .encode(x='sum(yield)', y='variety', color='site')
)



In [25]:
#charts can be saved as html or json; here the chart is written to an HTML file
#(per Altair's docs, save() infers the format from the file extension,
#so a .json filename would write the JSON spec instead)
chart1.save("stackedbar.html")