***This notebook is a part of chapter 3.5***

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sc
import matplotlib.pyplot as plt
import altair as alt
from sklearn.datasets import load_iris,load_wine,load_iris

# 0. Load data

In [None]:
# Load the iris dataset and put its feature matrix into a DataFrame
iris = load_iris()
df_iris = pd.DataFrame(iris['data'], columns=iris['feature_names'])

# Add the integer class label and a readable species name for it
df_iris['target'] = iris['target']
# Fixed spelling: the species is 'versicolor' (was 'vesicolor'), which
# otherwise shows up misspelled in every legend built from target_name
mapper_iris_target = {0:'setosa',1:'versicolor',2:'virginica'}
df_iris['target_name'] = df_iris['target'].map(mapper_iris_target) # map target code to name

In [None]:
# Preview the first rows, including the added target / target_name columns
df_iris.head()

# 1. Univariate visualization

In [None]:
# Single quantitative column (a pandas Series) used by all univariate plots below
df_iris_uni = df_iris['sepal length (cm)']

In [None]:
# Display the selected Series
df_iris_uni

## 1.1 Box plot

In [None]:
# matplotlib: box plot of the sepal-length Series
plt.boxplot(df_iris_uni)

In [None]:
# seaborn: box plot of the same Series, passed as the first positional argument
# NOTE(review): how seaborn interprets a positional Series differs between
# releases — confirm the result against the installed seaborn version
sns.boxplot(df_iris_uni)

In [None]:
# altair (author's note): its advantage is that it tries to prevent anything
# that could mislead the reader, e.g. a truncated graph.
# Series.to_frame() turns the Series into a one-column DataFrame for alt.Chart.
boxplot = (
    alt.Chart(df_iris_uni.to_frame())
    .mark_boxplot()
    .encode(y=alt.Y("sepal length (cm):Q"))
)
boxplot

## 1.2 Histogram plot

In [None]:
# matplotlib histogram with the default binning
plt.hist(df_iris_uni)

In [None]:
# matplotlib
# `bins` lists the bin edges explicitly; `rwidth=0.9` narrows each bar so
# neighbouring bars are separated by a small gap
plt.hist(df_iris_uni,bins=[4,4.5,5,5.5,6,6.5,7,7.5,8],density=True,rwidth=0.9)
# include x, y labels
plt.xlabel('sepal length (cm)')
# density=True normalizes the bars to a probability density, so the axis
# does not show counts (it was previously labelled 'count')
plt.ylabel('density')

In [None]:
# seaborn: histogram with the same explicit bin edges, plus a KDE curve (kde=True)
sns.displot(df_iris_uni,bins=[4,4.5,5,5.5,6,6.5,7,7.5,8],kde=True)

In [None]:
# altair (author's note): follows visualization theory more closely than the
# other libraries
barchart = (
    alt.Chart(pd.DataFrame(df_iris_uni))
    .mark_bar()
    .encode(
        x=alt.X("sepal length (cm):Q", bin=alt.Bin(maxbins=10)),  # upper limit on the number of bins
        y=alt.Y('count():Q', stack=None),
    )
)
barchart

In [None]:
# Same chart with Altair's interactive mode enabled
barchart.interactive() # interactive chart

# 2. Bivariate visualization

## 2.1 nominal & nominal


### Heatmap

In [None]:
# Load seaborn's "titanic" example dataset and preview the first rows
df_titanic = sns.load_dataset("titanic")
df_titanic.head()

In [None]:
# Keep the two nominal columns: passenger class and embarkation town
df_titanic_nom_nom = df_titanic[['class', 'embark_town']]

# Count each (class, embark_town) pair, then lay the towns out as columns
df_titanic_nom_nom_pivot = (
    df_titanic_nom_nom
    .value_counts()
    .to_frame('count')
    .pivot_table(values='count', index='class', columns='embark_town')
)
df_titanic_nom_nom_pivot

In [None]:
# Divide every column (embark_town) by its own total, giving the share of
# each class among the passengers of that town
df_titanic_nom_nom_pivot_pct = (
    df_titanic_nom_nom_pivot / df_titanic_nom_nom_pivot.sum(axis=0)
)
df_titanic_nom_nom_pivot_pct

In [None]:
# seaborn
sns.set(font_scale=1.25)
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(14,4)) # two panels side by side

# Left: raw counts, annotated as whole numbers (was ".03f", i.e. "127.000")
sns.heatmap(df_titanic_nom_nom_pivot, annot=True, fmt=".0f",ax=ax1)
# Right: per-town shares, annotated as percentages so the cells agree with
# the "Percent ..." title (was ".03f", which printed raw fractions)
sns.heatmap(df_titanic_nom_nom_pivot_pct, annot=True, fmt=".1%",ax=ax2)

# add title
ax1.set_title("Number of customer",fontsize=15)
ax2.set_title("Percent count of customer for each embark_town",fontsize=15)

In [None]:
# To build the heatmap with altair, the pivot table is first unstacked into
# long form: one row per (embark_town, class) pair with its count
df_titanic_nom_nom_pivot.unstack().to_frame('count').reset_index()

In [None]:
# altair
# Long-form counts: one row per (embark_town, class) pair
df_titanic_nom_nom_unstack = (
    df_titanic_nom_nom_pivot
    .unstack()
    .to_frame('count')
    .reset_index()
)

# Coloured cells. ":N" marks a nominal field, ":Q" a quantitative one
# (author's note: the suffix may not matter when the data type is already clear).
heatmap = (
    alt.Chart(df_titanic_nom_nom_unstack)
    .mark_rect()
    .encode(
        x=alt.X('embark_town:N'),
        y=alt.Y('class:N'),
        color=alt.Color('count:Q'),
    )
)

# Count printed in each cell; white text where count > 300, black otherwise
number = (
    alt.Chart(df_titanic_nom_nom_unstack)
    .mark_text(fontSize=20)
    .encode(
        x=alt.X('embark_town:N'),
        y=alt.Y('class:N'),
        text=alt.Text('count:Q'),
        color=alt.condition(
            alt.datum.count > 300,
            alt.value('white'),
            alt.value('black'),
        ),
    )
)

# Layer the text on top of the heatmap
(heatmap + number).properties(width=200, height=150)

## 2.2 nominal & quantitative




In [None]:
# One nominal column (class) and one quantitative column (fare)
df_titanic_nom_quan = df_titanic[['class', 'fare']]
df_titanic_nom_quan

### Barplot

In [None]:
# barplot input: mean fare per class, one row per class
df_titanic_nom_quan_bar = (
    df_titanic_nom_quan
    .groupby('class')[['fare']]
    .mean()
    .reset_index()
    .rename(columns={'fare': 'mean_fare'})
)
df_titanic_nom_quan_bar

In [None]:
sns.set(font_scale=1.25)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: vertical bars (class on x, mean fare as bar height)
ax1.bar(x='class', height='mean_fare', data=df_titanic_nom_quan_bar)
ax1.set_title('Mean of fare for each class', fontsize=20)
ax1.set(xlabel='class', ylabel='mean of fare')

# Right panel: horizontal bars (class on y, mean fare as bar width)
ax2.barh(y='class', width='mean_fare', data=df_titanic_nom_quan_bar)
ax2.set_title('Mean of fare for each class', fontsize=20)
ax2.set(xlabel='mean of fare', ylabel='class')

fig.tight_layout(pad=1.0)


In [None]:
sns.set(font_scale=1.25)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Left panel: default y axis
ax1.plot(['First', 'Second', 'Third'], df_titanic_nom_quan_bar['mean_fare'])
ax1.set_title('Mean of fare for each class', fontsize=20)
ax1.set(xlabel='class', ylabel='mean of fare')

# Right panel: same data, y axis forced to start at 0 to prevent a truncated graph
ax2.plot(['First', 'Second', 'Third'], df_titanic_nom_quan_bar['mean_fare'])
ax2.set_title('Mean of fare for each class', fontsize=20)
ax2.set(xlabel='class', ylabel='mean of fare')
ax2.set_ylim(0, 90)

fig.tight_layout(pad=1.0)


### About distribution plot

In [None]:
# KDE plot with one distribution per class, restricted to fares below 100.
# A single call with hue='class' replaces three copy-pasted calls that each
# filtered the frame down to one class. common_norm=False normalizes every
# curve on its own, as the separate per-class calls did.
sns.kdeplot(
    data=df_titanic_nom_quan.query('fare<100'),
    x='fare',
    hue='class',
    common_norm=False,
)

In [None]:
# boxplot: fare distribution per class, restricted to fares below 100
sns.boxplot(x='class',y='fare',data=df_titanic_nom_quan.query('fare<100'))
# unlike the KDE plot, a box plot can show outliers
# boxplot can show outlier

In [None]:
# Show the head of both datasets used in the comparison plots below
print('titanic dataframe')
display(df_titanic.head())
print('', '', sep='\n')  # two empty lines between the two previews
print('iris dataframe')
display(df_iris.head())

In [None]:
# stripplot: 2x2 grid, one row per dataset, without / with jittering
sns.set(font_scale=1.25)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(8, 8))

# Titanic: without jittering we can't see the density of the data
sns.stripplot(data=df_titanic_nom_quan, x='class', y='fare', jitter=False, ax=ax1)
ax1.set_title("Titanic without jittering", fontsize=15)
sns.stripplot(data=df_titanic_nom_quan, x='class', y='fare', jitter=True, ax=ax2)
ax2.set_title("Titanic with jittering", fontsize=15)

# Iris: a jittered strip plot works well when the data is not large
sns.stripplot(data=df_iris, x='target_name', y='sepal width (cm)', jitter=False, ax=ax3)
ax3.set_title("Iris without jittering", fontsize=15)
sns.stripplot(data=df_iris, x='target_name', y='sepal width (cm)', jitter=True, ax=ax4)
ax4.set_title("Iris with jittering", fontsize=15)

fig.tight_layout(pad=1.0)

In [None]:
# swarmplot: similar to stripplot, but it does not squeeze points closely
# together, so it is not good for describing large data
sns.set(font_scale=1.25)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

sns.swarmplot(data=df_titanic_nom_quan, x='class', y='fare', ax=ax1)
ax1.set_title("Titanic dataset", fontsize=15)

sns.swarmplot(data=df_iris, x='target_name', y='sepal width (cm)', ax=ax2)
ax2.set_title("Iris dataset", fontsize=15)


## 2.3 quantitative & quantitative

### Scatter plot

In [None]:
# The two quantitative columns compared in the scatter plots below
df_iris.loc[:,['sepal length (cm)','sepal width (cm)']]

In [None]:
# seaborn: sepal length vs. sepal width, with shorter axis labels
sns.scatterplot(
    data=df_iris,
    x='sepal length (cm)',
    y='sepal width (cm)',
)
plt.xlabel('sepal length')
plt.ylabel('sepal width')

In [None]:
# altair: the same scatter plot, 400x400 and interactive
scatter = (
    alt.Chart(df_iris)
    .mark_point()
    .encode(x='sepal length (cm)', y='sepal width (cm)')
)
scatter.properties(width=400, height=400).interactive()

# 3. Multivariate data visualization

## 3.1 Scatter plot

In [None]:
# Third variable (species) encoded as colour
sns.scatterplot(
    data=df_iris,
    x='sepal length (cm)',
    y='sepal width (cm)',
    hue='target_name',
)
plt.xlabel('sepal length')
plt.ylabel('sepal width')

In [None]:
# altair: the same plot, species encoded as colour
scatter = (
    alt.Chart(df_iris)
    .mark_point()
    .encode(
        x='sepal length (cm)',
        y='sepal width (cm)',
        color='target_name',
    )
)
scatter.properties(width=400, height=400)

In [None]:
# car dataset: column names are supplied here and passed to read_csv
headers = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location",
    "wheel_base", "length", "width", "height", "curb_weight",
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    "city_mpg", "highway_mpg", "price",
]
df_cars = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    header=None,    # do not take column names from the file
    names=headers,
    na_values="?",  # treat "?" as a missing value
)

In [None]:
# Keep only the four makes compared in the plots below
df_cars_tmp = df_cars.loc[
    df_cars['make'].isin(['isuzu', 'toyota', 'honda', 'volvo'])
]

In [None]:
# Preview the filtered car data
df_cars_tmp.head()

In [None]:
# Two variables: horsepower vs. wheel base
sns.set(font_scale=1)
sns.scatterplot(
    data=df_cars_tmp,
    x='horsepower',
    y='wheel_base',
    legend=False,
)

In [None]:
# Three variables: make added as colour
sns.set(font_scale=1)
sns.scatterplot(
    data=df_cars_tmp,
    x='horsepower',
    y='wheel_base',
    hue='make',
)
# Anchor the legend just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

In [None]:
# Four variables: make as colour, price as marker size
sns.set(font_scale=1)
sns.scatterplot(
    data=df_cars_tmp,
    x='horsepower',
    y='wheel_base',
    hue='make',
    size='price',
)
# Anchor the legend just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)


In [None]:
# The three previous scatter plots side by side
sns.set(font_scale=1.25)
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(21, 7))

# position only
sns.scatterplot(data=df_cars_tmp, x='horsepower', y='wheel_base', ax=ax1)
# position + colour
sns.scatterplot(data=df_cars_tmp, x='horsepower', y='wheel_base',
                hue='make', ax=ax2)
# position + colour + marker size
sns.scatterplot(data=df_cars_tmp, x='horsepower', y='wheel_base',
                hue='make', size='price', ax=ax3)

ax1.set_title('2 dimensions')
ax2.set_title('3 dimensions')
ax3.set_title('4 dimensions')

In [None]:
# altair: four encoded variables plus a tooltip listing extra columns
scatter = (
    alt.Chart(df_cars_tmp)
    .mark_point()
    .encode(
        x='horsepower',
        y='wheel_base',
        color='make',
        size='price',
        tooltip=['horsepower', 'wheel_base', 'fuel_type', 'num_doors', 'make', 'price'],
    )
)
scatter.properties(width=400, height=400).interactive()

## 3.2 Stacked & multiple bar chart

In [None]:
# preprocess: number of passengers per embark_town, split by sex.
# The columns are renamed by their 'male'/'female' label rather than by
# position: after pivoting they are ordered female, male, so the previous
# positional assignment ['embark_town','count_male','count_female'] swapped
# the two counts.
df_titanic_stack_bar = (
    df_titanic.loc[:, ['sex', 'embark_town']]
    .value_counts()
    .unstack('sex')
    .rename(columns={'male': 'count_male', 'female': 'count_female'})
    .loc[:, ['count_male', 'count_female']]  # keep the column order later cells expect
    .rename_axis(columns=None)
    .reset_index()
)
df_titanic_stack_bar

In [None]:
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(14,6))

# Left panel: stacked bars (pandas does not stack by default)
df_titanic_stack_bar.set_index('embark_town').plot.bar(stacked=True,rot=0,ax=ax1)
# Set the legend on the axes object itself: plt.legend() targets the
# current axes only, which left this panel with the raw column names.
# Labels follow the column order (count_male, count_female).
ax1.legend(['Men','Women'])

# Right panel: grouped (side-by-side) bars
df_titanic_stack_bar.set_index('embark_town').plot.bar(rot=0,ax=ax2)
ax2.legend(['Men','Women'])

In [None]:
# altair
# Both charts count rows with the 'count()' shorthand, the same form the
# histogram cell uses (replaces the legacy 'count(*)' spelling).

# Stacked bars: one bar per embark_town, split by sex through colour
stack_bar_chart = alt.Chart(df_titanic).mark_bar().encode(
    x='embark_town',
    y='count():Q',
    color='sex:N'
).properties(
    width=400,
    height=400
)

# Grouped bars: one small panel per embark_town (column facet), one bar per
# sex inside it; the x axis title and labels are hidden, the colour legend
# identifies the sex
multiple_bar_chart = alt.Chart(df_titanic).mark_bar().encode(
    x=alt.X('sex:O',axis=alt.Axis(title=None, labels=False)),
    y='count():Q',
    color='sex:N',
    column=alt.Column('embark_town:N',header=alt.Header(titleOrient='bottom', labelOrient='bottom'))
).properties(
    width=100,
    height=400
)

In [None]:
# Place the two charts side by side with a shared y scale and colour scale.
# Author's note: altair will not exclude null values here.
(stack_bar_chart|multiple_bar_chart).resolve_scale(y='shared',color='shared')

## 3.3 Small multiples

In [None]:
# Small multiples are good for a quick scan of the data.
# The integer 'target' column is dropped; 'target_name' drives the colour.
sns.pairplot(df_iris.drop(columns='target'),hue='target_name')

In [None]:
# pairplot using altair: one 150x150 scatter panel for every combination of
# the four measurements (repeated over rows and columns), coloured by species
(
    alt.Chart(df_iris)
    .mark_circle()
    .encode(
        x=alt.X(alt.repeat("column"), type='quantitative'),
        y=alt.Y(alt.repeat("row"), type='quantitative'),
        color='target_name:N',
    )
    .properties(width=150, height=150)
    .repeat(
        row=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'],
        column=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'],
    )
    .interactive()
)

# Visual variable separability

In [None]:
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 7))

# Left: species encoded by marker style. Choosing the wrong visual variable
# can cause confusion and make the plot hard to understand.
sns.scatterplot(
    data=df_iris,
    x='sepal length (cm)',
    y='sepal width (cm)',
    style='target_name',
    ax=ax1,
)

# Right: species encoded by colour
sns.scatterplot(
    data=df_iris,
    x='sepal length (cm)',
    y='sepal width (cm)',
    hue='target_name',
    ax=ax2,
)


The right figure is easier to understand than the left one: it encodes the species with color (hue), while the left one encodes it with marker shape.