# Visualization Curriculum

---
* Author:  [Yuttapong Mahasittiwat](mailto:khala1391@gmail.com)
* Technologist | Data Modeler | Data Analyst
* [YouTube](https://www.youtube.com/khala1391)
* [LinkedIn](https://www.linkedin.com/in/yuttapong-m/)
---

Data source: [Kaggle – World Population Dataset](https://www.kaggle.com/datasets/iamsouravbanerjee/world-population-dataset)

In [118]:
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import altair as alt
# Report the version of each library imported above, one line per library.
for label, module in (
    ("pandas", pd),
    ("numpy", np),
    ("matplotlib", mpl),
    ("seaborn", sns),
    ("altair", alt),
):
    print(f"{label} version :", module.__version__)

pandas version : 2.2.1
numpy version : 1.26.4
matplotlib version : 3.8.4
seaborn version : 0.13.2
altair version : 5.4.0


In [119]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning, message="the convert_dtype parameter is deprecated")

In [120]:
# Load the world-population CSV (path is relative to the working directory)
# and print a summary of its columns, non-null counts and dtypes.
df = pd.read_csv(filepath_or_buffer='data/world_population.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 234 entries, 0 to 233
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Rank                         234 non-null    int64  
 1   CCA3                         234 non-null    object 
 2   Country/Territory            234 non-null    object 
 3   Capital                      234 non-null    object 
 4   Continent                    234 non-null    object 
 5   2022 Population              234 non-null    int64  
 6   2020 Population              234 non-null    int64  
 7   2015 Population              234 non-null    int64  
 8   2010 Population              234 non-null    int64  
 9   2000 Population              234 non-null    int64  
 10  1990 Population              234 non-null    int64  
 11  1980 Population              234 non-null    int64  
 12  1970 Population              234 non-null    int64  
 13  Area (km²)          

In [121]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,Rank,CCA3,Country/Territory,Capital,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km²),Density (per km²),Growth Rate,World Population Percentage
0,36,AFG,Afghanistan,Kabul,Asia,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230,63.0587,1.0257,0.52
1,138,ALB,Albania,Tirana,Europe,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748,98.8702,0.9957,0.04
2,34,DZA,Algeria,Algiers,Africa,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741,18.8531,1.0164,0.56
3,213,ASM,American Samoa,Pago Pago,Oceania,44273,46189,51368,54849,58230,47818,32886,27075,199,222.4774,0.9831,0.0
4,203,AND,Andorra,Andorra la Vella,Europe,79824,77700,71746,71519,66097,53569,35611,19860,468,170.5641,1.01,0.0


In [122]:
# Show the column labels; they are referenced by name in the drop/rename step that follows.
df.columns

Index(['Rank', 'CCA3', 'Country/Territory', 'Capital', 'Continent',
       '2022 Population', '2020 Population', '2015 Population',
       '2010 Population', '2000 Population', '1990 Population',
       '1980 Population', '1970 Population', 'Area (km²)', 'Density (per km²)',
       'Growth Rate', 'World Population Percentage'],
      dtype='object')

In [123]:
# Discard the columns that are not used further down, then shorten the two
# remaining labels that contain "/" or "²" so they are easier to type.
df = (
    df
    .drop(columns=[
        'Rank',
        'CCA3',
        'Capital',
        'Density (per km²)',
        'Growth Rate',
        'World Population Percentage',
    ])
    .rename(columns={
        'Country/Territory': 'Country',
        'Area (km²)': 'Area (km2)',
    })
)
df.head()

Unnamed: 0,Country,Continent,2022 Population,2020 Population,2015 Population,2010 Population,2000 Population,1990 Population,1980 Population,1970 Population,Area (km2)
0,Afghanistan,Asia,41128771,38972230,33753499,28189672,19542982,10694796,12486631,10752971,652230
1,Albania,Europe,2842321,2866849,2882481,2913399,3182021,3295066,2941651,2324731,28748
2,Algeria,Africa,44903225,43451666,39543154,35856344,30774621,25518074,18739378,13795915,2381741
3,American Samoa,Oceania,44273,46189,51368,54849,58230,47818,32886,27075,199
4,Andorra,Europe,79824,77700,71746,71519,66097,53569,35611,19860,468


In [124]:
# Reshape wide -> long: each "<year> Population" column becomes rows with the
# column label in `Year` and its value in `Population`.
df = pd.melt(
    df,
    id_vars=['Country', 'Continent', 'Area (km2)'],
    value_vars=[
        f'{year} Population'
        for year in (2022, 2020, 2015, 2010, 2000, 1990, 1980, 1970)
    ],
    var_name='Year',
    value_name='Population',
)

In [125]:
# Keep only the leading digits of labels such as "2022 Population".
# expand=False makes str.extract return a Series; the default (expand=True)
# returns a one-column DataFrame, which only works as a column value through
# implicit coercion. Bracket assignment is used instead of `df.Year = ...`
# because attribute assignment silently creates a plain attribute (not a
# column) whenever the name does not already exist as a column.
df['Year'] = df['Year'].str.extract(r'(\d+)', expand=False)

In [126]:
# The extracted years are still strings; store them as 32-bit integers,
# then print the frame summary to confirm the dtypes.
df = df.assign(Year=df['Year'].astype('int32'))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1872 entries, 0 to 1871
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Country     1872 non-null   object
 1   Continent   1872 non-null   object
 2   Area (km2)  1872 non-null   int64 
 3   Year        1872 non-null   int32 
 4   Population  1872 non-null   int64 
dtypes: int32(1), int64(2), object(2)
memory usage: 65.9+ KB


In [127]:
# Row counts per category for each grouping column.
gr = ['Continent', 'Year']

for column_name in gr:
    counts = df[column_name].value_counts()
    print(counts)

Continent
Africa           456
Asia             400
Europe           400
North America    320
Oceania          184
South America    112
Name: count, dtype: int64
Year
2022    234
2020    234
2015    234
2010    234
2000    234
1990    234
1980    234
1970    234
Name: count, dtype: int64


In [128]:
# Line chart: average Population per Year (ordinal axis), one line per Continent.
(
    alt.Chart(df)
    .mark_line()
    .encode(
        x=alt.X('Year:O'),
        y=alt.Y('average(Population):Q'),
        color=alt.Color('Continent:N'),
    )
    .properties(width=400, height=200)
)

In [129]:
# Small multiples: one 100x100 panel per Continent, plotting area on x
# against the average of Population on y.
(
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('Area (km2):Q'),
        y=alt.Y('average(Population):Q'),
        color=alt.Color('Continent:N'),
        column=alt.Column('Continent:N'),
    )
    .properties(width=100, height=100)
)

In [130]:
# Interval selection: points inside the selection are coloured by Continent,
# all other points are drawn in gray.
selection = alt.selection_interval()

(
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('Population:Q'),
        y=alt.Y('Area (km2):Q'),
        color=alt.condition(selection, 'Continent:N', alt.value('gray')),
    )
    .properties(width=400, height=200)
    .add_params(selection)
)

In [131]:
# Linked views. `brush` is an interval selection restricted to the x channel;
# both charts below use the same opacity condition driven by it
# (0.9 when the condition holds, 0.1 otherwise).
brush = alt.selection_interval(encodings=['x'])
opacity = alt.condition(brush, alt.value(0.9), alt.value(0.1))

# Overview bar chart of average Population per Year; the brush is attached here.
bar_yr = (
    alt.Chart(df)
    .mark_bar()
    .encode(
        x=alt.X('Year:O', axis=alt.Axis(title=None, labelAngle=0)),
        y=alt.Y('average(Population):Q', axis=alt.Axis(title=None, labelAngle=0)),
        opacity=opacity,
    )
    .add_params(brush)
    .properties(width=400, height=100)
)

# Detail scatter of Population vs. area, coloured by Continent.
detail = (
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('Population:Q'),
        y=alt.Y('Area (km2):Q'),
        color=alt.Color('Continent:N'),
        opacity=opacity,
    )
    .properties(width=400, height=200)
)

# Stack the overview above the detail view.
alt.vconcat(bar_yr, detail)

In [132]:
# Scatter of Population vs. area for a single Year, chosen with a range slider.
select_year = alt.selection_point(
    name='select',
    fields=['Year'],
    value=1970,
    # step=1 rather than 5: a range input can only land on min + k*step, so a
    # step of 5 starting at 1970 stops at 2020 and can never select Year 2022.
    # The Year values in the data are irregularly spaced (..., 2010, 2015,
    # 2020, 2022), so slider positions with no matching Year show an empty chart.
    bind=alt.binding_range(min=1970, max=2022, step=1),
)

(
    alt.Chart(df)
    .mark_circle()
    .encode(
        # Fixed axis domains, so the scales do not depend on the rows left after filtering.
        alt.X('Population:Q', scale=alt.Scale(domain=[0, 1600000000])),
        alt.Y('Area (km2):Q', scale=alt.Scale(domain=[0, 20000000])),
        alt.Color('Continent:N'),
    )
    .properties(width=400, height=200)
    .add_params(select_year)
    # Keep only the rows whose Year matches the slider value.
    .transform_filter(select_year)
)

In [133]:
# Distinct Year values in ascending order (sorted() already returns a list).
sorted(df['Year'].unique())

[1970, 1980, 1990, 2000, 2010, 2015, 2020, 2022]

In [134]:
# Same scatter as the slider version, but the Year is picked with radio
# buttons whose options are the distinct Year values present in the data.
select_year = alt.selection_point(
    name='select',
    fields=['Year'],
    value=1970,
    bind=alt.binding_radio(options=sorted(df['Year'].unique())),
)

(
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('Population:Q', scale=alt.Scale(domain=[0, 1600000000])),
        y=alt.Y('Area (km2):Q', scale=alt.Scale(domain=[0, 20000000])),
        color=alt.Color('Continent:N'),
    )
    .properties(width=400, height=200)
    .add_params(select_year)
    .transform_filter(select_year)
)

In [152]:
# Dropdown variant: the Year is chosen from a select box listing the distinct
# Year values in the data, and the chart is filtered to that Year.
select_year = alt.selection_point(
    name='select',
    fields=['Year'],
    value=1970,
    bind=alt.binding_select(options=sorted(df['Year'].unique())),
)

(
    alt.Chart(df)
    .mark_circle()
    .encode(
        x=alt.X('Population:Q', scale=alt.Scale(domain=[0, 1600000000])),
        y=alt.Y('Area (km2):Q', scale=alt.Scale(domain=[0, 20000000])),
        color=alt.Color('Continent:N'),
    )
    .properties(width=400, height=200)
    .add_params(select_year)
    .transform_filter(select_year)
)

In [91]:
alt.selection*?

alt.selection_interval
alt.selection_point

In [93]:
alt.binding*?

alt.binding
alt.binding_checkbox
alt.binding_radio
alt.binding_range
alt.binding_select