# Animated Scatterplot 

An animated scatterplot similar to the one by Hans Rosling (see [his Box-by-Box talk](https://www.youtube.com/watch?v=fTznEIZRkLg)).

In [None]:
!python -m pip install pandas matplotlib imageio seaborn openpyxl

In [None]:
# all imports in one place and on top
from pathlib import Path

import imageio
import pandas as pd
import seaborn as sns
from matplotlib import colors
from matplotlib import pyplot as plt

In [None]:
# create output path (pure Python, so no POSIX shell is required);
# like `mkdir -p`, this is a no-op when the directory already exists
Path('frames').mkdir(parents=True, exist_ok=True)

In [None]:
# directory the per-year frame images are written to and read back from
PATH = 'frames'

In [None]:
# read data
# the first column of each indicator table is used as the row index
fert = pd.read_csv('gapminder_total_fertility.csv', index_col=0)
life = pd.read_excel('gapminder_lifeexpectancy.xlsx', index_col=0)
pop = pd.read_excel('gapminder_population.xlsx', index_col=0)
# the continent lookup is semicolon-separated and keeps the default index
cont = pd.read_csv('continents.csv', sep=';')

In [None]:
# peek at the first five rows of the population table
pop.head(5)

In [None]:
# (rows, columns) of every loaded table, in loading order
tuple(table.shape for table in (fert, life, pop, cont))

In [None]:
# the year labels of the fertility table become integers
fert.columns = [int(label) for label in fert.columns]

In [None]:
# give the index of every indicator table the same name
for table in (fert, pop, life):
    table.index.name = 'country'

### Simple Statistics

In [None]:
# total of the 2010 column of the population table
pop[2010].sum()  # sum over all countries (data not 100% clean!)

In [None]:
# ten largest values of the 2010 population column
top10 = pop[2010].sort_values(ascending=False).head(10)
# express them in whole millions
top10mil = top10 // 1_000_000
# plot the scaled series so the bars match the "[millions]" axis label
top10mil.plot.bar()
plt.title('most populous countries in 2010')
plt.ylabel('population[millions]')

In [None]:
# line plot of the 'Romania' row of the population table
romania = pop.loc['Romania']
romania.plot()
plt.title('population of Romania')

In [None]:
# histogram (20 bins) of the 2010 column of the fertility table
fertility_2010 = fert[2010]
fertility_2010.hist(bins=20)
plt.title('distribution of children per woman in 2010')

### Convert to long format

In [None]:
# wide -> long: the index becomes a 'country' column and every remaining
# column label ends up in 'year', with its cell value in 'fertility_rate'
fert_long = fert.reset_index().melt(
    id_vars='country',
    var_name='year',
    value_name='fertility_rate',
)
fert_long.head(3)  # check what the result looks like

In [None]:
# same wide -> long reshaping for the population table
pop_long = pop.reset_index().melt(
    id_vars='country',
    var_name='year',
    value_name='population',
)

In [None]:
# same wide -> long reshaping for the life-expectancy table
life_long = life.reset_index().melt(
    id_vars='country',
    var_name='year',
    value_name='life_exp',
)

### Merge

In [None]:
# join the three long tables on the (country, year) pair ...
merge_keys = ['country', 'year']
df = fert_long.merge(pop_long, on=merge_keys).merge(life_long, on=merge_keys)
# ... then attach the continent table by country, keeping unmatched rows
df = df.merge(cont, on='country', how='outer')
df.iloc[100:105]  # manually found spot without NaNs

In [None]:
 df.shape # all countries x years

In [None]:
# remove empty values - we only want complete data
# (in place: every row with a NaN in any column is dropped from df)
df.dropna(inplace=True)

### Plotting

In [None]:
# teaser: every remaining row of df in a single scatterplot, coloured by continent
sns.scatterplot(
    x='life_exp',
    y='fertility_rate',
    hue='continent',
    data=df,
)

In [None]:
# min-max normalise the population over the whole table to [0, 1] ...
pop_min = df['population'].min()
pop_max = df['population'].max()
df['population_scaled'] = (df['population'] - pop_min) / (pop_max - pop_min)
# ... then stretch that to the range [1, 10000]
df['population_scaled'] = (10000 - 1) * df['population_scaled'] + 1

In [None]:
def create_scatterplot(df, year):
    """Draw a higher-quality life-expectancy vs. fertility scatterplot for one year.

    A new 12x8 inch figure is opened and left as the current pyplot figure;
    the function returns nothing and neither saves nor closes the figure.

    Parameters
    ----------
    df : DataFrame
        Must provide the columns 'year', 'life_exp', 'fertility_rate',
        'continent', 'population' and 'population_scaled'.
    year
        Value compared against the 'year' column to select the rows to plot.
    """
    subdf = df.loc[df['year'] == year]

    plt.figure(figsize=(12, 8))  # <-- reset figure, crashes without this line
    sns.scatterplot(
        data=subdf,
        x='life_exp',
        y='fertility_rate',
        hue='continent',
        size='population',
        # for correct global scaling across all years: the marker-size range
        # is taken from the scaled population of this year's rows
        sizes=(subdf['population_scaled'].min(), subdf['population_scaled'].max()),
        legend=False,
        alpha=0.8,
    )
    # fixed axis limits, identical for every year that is plotted
    plt.axis([0, 85, 0, 9.5])
    plt.title(f"{year}", loc='left', fontsize=16)
    plt.xlabel('life expectancy [years]', fontsize=16)
    # "woman" (singular), matching the histogram title used elsewhere in this file
    plt.ylabel('fertility [children per woman]', fontsize=16)
    sns.despine()

In [None]:
# preview the plot for a single year
create_scatterplot(df, 1960)

In [None]:
# render one PNG per year into the PATH directory
for year in range(1950, 2016):  # add ,10 for bigger steps -> faster when debugging
    print(year, end=', ')  # progress / debug info
    create_scatterplot(df, year)
    frame_file = f'{PATH}/frame_{year}.png'
    plt.savefig(frame_file, dpi=150)
    plt.close()  # do not display plot in output

In [None]:
# read the frames back in year order and write them out as one animated GIF
frame_files = [f'{PATH}/frame_{year}.png' for year in range(1950, 2016)]
images = [imageio.imread(frame_file) for frame_file in frame_files]

imageio.mimsave('animated_scatterplot.gif', images, fps=10)

![Animated scatterplot of fertility rate vs. life expectancy](animated_scatterplot.gif)