# 1. Swedish demographic data (*)

Go to Swedish-language wikipedia page Sveriges demografi.

[https://sv.wikipedia.org/wiki/Sveriges_demografi]


In [1]:
import pandas as pd

#a) Read in the table under "Befolkningsstatistik sedan 1900" into a DataFrame
# https://sv.wikipedia.org/wiki/Sveriges_demografi

link = "https://sv.wikipedia.org/wiki/Sveriges_demografi"
swedenpop = pd.read_html(link, 
    match="Befolkningsförändringar",
    thousands=" ",
    decimal=","
    )[0]

# Chooses swedish standards for thousands (whitespace)
# and , as decimal divider
# NOTE if " " doesn't work, use "\xa0"

# Change name on year column
swedenpop = swedenpop.rename({"Unnamed: 0":"Årtal"}, axis="columns")

# All columns:
# 'Årtal', 'Folkmängd', 'Födda', 'Döda', 'Befolkningsförändringar',
#       'Nativiteten (per 1000)', 'Dödstalen (per 1000)',
#       'Befolkningsförändringar (per 1000)', 'Total fertilitet']
swedenpop


Unnamed: 0,Årtal,Folkmängd,Födda,Döda,Befolkningsförändringar,Nativiteten (per 1000),Dödstalen (per 1000),Befolkningsförändringar (per 1000),Total fertilitet
0,1900,5117000,138139,86146,51993,27.0,16.8,10.2,4.02
1,1901,5156000,139370,82772,56598,27.0,16.1,11.0,4.04
2,1902,5187000,137364,79722,57642,26.5,15.4,11.1,3.95
3,1903,5210000,133896,78610,55286,25.7,15.1,10.6,3.82
4,1904,5241000,134952,80152,54800,25.7,15.3,10.5,3.83
...,...,...,...,...,...,...,...,...,...
116,2016,9995000,117425,90982,26443,11.8,9.2,2.6,1.85
117,2017,10120000,115416,91972,23444,11.4,9.1,2.3,1.79
118,2018,10230000,115832,92185,23647,11.3,9.0,2.3,1.75
119,2019,10327589,114523,88766,28727,11.1,8.7,2.4,1.70


In [2]:
import plotly_express as px
import matplotlib.pyplot as plt

#b) Choose to do some EDA (exploratory data analysis) on this dataset. And draw some relevant graphs.

# Plot a lot of plots
#
# fig1: population
fig1 = px.line(swedenpop, 
    x="Årtal", 
    y="Folkmängd", 
    title="Demography 1900 to 2020", 
    range_y=[0,11e6])
fig1.show()


# fig2: born and dead, and difference
fig2 = px.line(swedenpop, 
    x="Årtal", 
    y=["Födda","Döda","Befolkningsförändringar"])
fig2.show()    

# fig3: nativity, differences deathnumber per 1000
fig3 = px.line(swedenpop, 
    x="Årtal", 
    y=["Nativiteten (per 1000)","Dödstalen (per 1000)","Befolkningsförändringar (per 1000)"])
fig3.show()    

# fig4: fertility
fig4 = px.line(swedenpop, 
    x="Årtal", 
    y="Total fertilitet",
    range_y=[0,5])
fig4.show()    


In [3]:
# Plot a few more inte-connected plots
#
fig1 = px.line(swedenpop, 
    x="Folkmängd", 
    y=["Födda","Döda","Befolkningsförändringar"], 
    title="Demography 1900 to 2020")
fig1.show()

fig2 = px.line(swedenpop, 
    x="Folkmängd", 
    y=["Nativiteten (per 1000)","Dödstalen (per 1000)","Befolkningsförändringar (per 1000)"])
fig2.show()


In [4]:
# c) Now we want to go backwards in time (before 1900) to see how population has changed
#  in Sweden. Read in the table under history and keep the data of "Folkmängd" from 1570-1865.

link = "https://sv.wikipedia.org/wiki/Sveriges_demografi"
swedenpopold = pd.read_html(link, 
    match="Vid utgången av år",
    thousands="\xa0",
    decimal=",",
    )[0].drop([7,8,9,10], axis="rows"
    ).drop(["Årlig tillväxt"], axis="columns"
    ).rename({"Vid utgången av år":"Årtal"}, axis="columns")
# Chooses swedish standards for thousands (whitespace)
# and , as decimal divider
# NOTE if " " doesn't work, use "\xa0"
# removes data after 1865 (already included above)
# removes columns under "Årlig tillväxt", not used here
# renames column to year

# Rename the double column names:
#MultiIndex([(    'Årtal',     'Årtal'),
#            ('Folkmängd', 'Folkmängd')],
swedenpopold = swedenpopold.set_axis(["Årtal", "Folkmängd"], axis=1, inplace=False)
swedenpopold


dropping on a non-lexsorted multi-index without a level parameter may impact performance.



Unnamed: 0,Årtal,Folkmängd
0,1570,900000
1,1650,1225000
2,1700,1485000
3,1720,1350000
4,1755,1878000
5,1815,2465000
6,1865,4099000


In [5]:
 # d) Now concatenate this with the table from 1900 so that you have population data 
 # from 1570 to 2020. Note that you may need to clean the data in order for it to fit 
 # properly. Also you may be able to do this in several ways.

swedenpoptot = pd.concat([swedenpopold,swedenpop[["Årtal","Folkmängd"]]]).astype(int)
swedenpoptot.head(10)


Unnamed: 0,Årtal,Folkmängd
0,1570,900000
1,1650,1225000
2,1700,1485000
3,1720,1350000
4,1755,1878000
5,1815,2465000
6,1865,4099000
0,1900,5117000
1,1901,5156000
2,1902,5187000


In [6]:
#e) Draw a graph of population data from 1570-2020.
px.line(
    swedenpoptot, 
    x="Årtal", y="Folkmängd", 
    markers=True, 
    range_y=[0,11e6],
    title="Sweden's population per year (million people)"
)



# 2. Denmark demographic data (*)

Go to the Danish-language wikipedia page Danmarks demografi. https://da.wikipedia.org/wiki/Danmarks_demografi

  a) Read in the table under "Demografiske data" into a DataFrame (*)

  b) Clean the data and draw a graph of population against year from 1769-2020. (**)

In [28]:
# Get Denmark's data
keyword = "pr. 1. januar"
link = "https://da.wikipedia.org/wiki/Danmarks_demografi"

# Extract top half of the table
denmarktop = pd.read_html(
    link,
    match=keyword,
    thousands=".",
    decimal=",",
    skiprows=2
)[0][["1200","1.000.000[5]"]].rename(
    {"1200":"År","1.000.000[5]":"Folkmängd"},
    axis="columns"
)

# Extract bottom half of table
denmarkbot = pd.read_html(
    link,
    match=keyword,
    thousands=".",
    decimal=",",
    skiprows=2
)[0][["Unnamed: 2","Unnamed: 3"]].rename(
    {"Unnamed: 2":"År","Unnamed: 3":"Folkmängd"},
    axis="columns"
)

# Concatenate top and bot:
denmarkpop = pd.concat(
    [denmarktop,denmarkbot], axis="rows"
).reset_index(
).drop("index", axis="columns")

# Check middle
denmarkpop.tail(10)



Unnamed: 0,År,Folkmängd
54,1998,5294860
55,1999,5313577
56,2000,5330020
57,2001,5349212
58,2002,5368354
59,2003,5383507
60,2005,5411405
61,2010,5534738
62,2015,5659715
63,2020,5822763


In [29]:
# Plot Denmarks populations
px.line(
    denmarkpop, 
    x="År", y="Folkmängd", 
    title="Denmark's population per year (million people)",
    markers=True,
    range_y=[0,6e6]
)

In [66]:
# Extract Norway's data
link = "https://sv.wikipedia.org/wiki/Norges_demografi"
keyword = "(x 1000)"

# Extract Norway data, and remove all but year and population
norwaypop = pd.read_html(
    link,
    match=keyword,
    thousands=" ",
    decimal=","
)[0].rename(
    {
        "Unnamed: 0":"Årtal",
        "Befolkning i tusentals (x 1000)":"Folkmängd Norge"
    }
    , axis="columns"
)

norwaypop = norwaypop.drop(
    norwaypop.columns[2:], 
    axis="columns"
)
norwaypop

Unnamed: 0,Årtal,Folkmängd Norge
0,1900,2231
1,1901,2255
2,1902,2276
3,1903,2288
4,1904,2298
...,...,...
108,2008,4768
109,2009,4829
110,2010,4889
111,2011,4953


In [67]:
# Load fertility from english wikipage
link = "https://en.wikipedia.org/wiki/Demographics_of_Norway"
keyword = "Total fertility"

norwayeng = pd.read_html(
    link,
    match=keyword,
    thousands=",",
    decimal="."
)[0]

# Drop all columns but the last one where the fertility rate is
# Rename the last column also
norwayeng = norwayeng.drop(
    norwayeng.columns[1:-1], axis="columns"
).rename(
    {norwayeng.columns[-1]:"Total fertilitet Norge"}, axis="columns"
)

# Merge with previous norway data
# Remove old Årtal, and change new Unnamed to Årtal
norwaypop = pd.concat(
    [norwaypop,norwayeng], axis="columns"
).drop(
    "Årtal", axis="columns"
).rename(
    {"Unnamed: 0":"Årtal"}, axis="columns"
)


Unnamed: 0,Folkmängd Norge,Årtal,Total fertilitet Norge
0,2231.0,1900,4.40
1,2255.0,1901,4.37
2,2276.0,1902,4.26
3,2288.0,1903,4.16
4,2298.0,1904,4.07
...,...,...,...
117,,2017,1.62
118,,2018,1.56
119,,2019,1.53
120,,2020,1.48


In [72]:
norwaypop.iloc[100:113]

Unnamed: 0,Folkmängd Norge,Årtal,Total fertilitet Norge
100,4491.0,2000,1.85
101,4514.0,2001,1.78
102,4538.0,2002,1.75
103,4565.0,2003,1.8
104,4592.0,2004,1.83
105,4623.0,2005,1.84
106,4661.0,2006,1.9
107,4709.0,2007,1.9
108,4768.0,2008,1.96
109,4829.0,2009,1.98


In [68]:
# Plot Norway population
px.line(
    norwaypop, 
    x="Årtal", y="Folkmängd Norge", 
    markers=True, 
    title="Norway's population per year (1000s of people)"
)

# 4. Merge Sweden-Norway (*)

Create a population graph and a fertility graph showing Sweden and Norway.

In [73]:
# I will do even more and merge Sweden, Denmark and Norway into one plot

#norwaypop["Årtal","Folkmängd Norge"]
#denmarkpop["År","Folkmängd"] .drop(index)
#swedenpoptot["Årtal","Folkmängd"]

# Change name of år, add country name to folkmängd
# put years as indeces

# Denmark
dkpop = denmarkpop.rename(
    {"Folkmängd":"Folkmängd Danmark"}, axis="columns"
).set_index("År")
dkpop = dkpop.astype(int)

# Norway
nopop = norwaypop.iloc[:113].drop(
    "Total fertilitet Norge", axis="columns"
).set_index("Årtal")

# Change norwayspopulation from per 1000 to per indivudal 
nopop["Folkmängd Norge"] = nopop["Folkmängd Norge"]*1000
# And change to integers
nopop = nopop.astype(int)

# Sweden
sepop = swedenpoptot.rename(
    {"Folkmängd":"Folkmängd Sverige"}, axis="columns"
).set_index("Årtal")
# And change to integers
sepop = sepop.astype(int)

# Marge all three
scandinaviapop = pd.concat([sepop,nopop,dkpop], axis="columns")

scandinaviapop


Unnamed: 0,Folkmängd Sverige,Folkmängd Norge,Folkmängd Danmark
1570,900000.0,,
1650,1225000.0,,
1700,1485000.0,,
1720,1350000.0,,
1755,1878000.0,,
...,...,...,...
2016,9995000.0,,
2017,10120000.0,,
2018,10230000.0,,
2019,10327589.0,,


In [74]:
scandinaviapop.head(20)

Unnamed: 0,Folkmängd Sverige,Folkmängd Norge,Folkmängd Danmark
1570,900000.0,,
1650,1225000.0,,
1700,1485000.0,,
1720,1350000.0,,
1755,1878000.0,,
1769,,,797584.0
1787,,,841806.0
1801,,,929001.0
1815,2465000.0,,
1834,,,1230964.0


In [75]:
# Plot all
px.line(scandinaviapop,
    markers=True,
    range_y=[0,11e6]
)


In [76]:
norwaypop

Unnamed: 0,Folkmängd Norge,Årtal,Total fertilitet Norge
0,2231.0,1900,4.40
1,2255.0,1901,4.37
2,2276.0,1902,4.26
3,2288.0,1903,4.16
4,2298.0,1904,4.07
...,...,...,...
117,,2017,1.62
118,,2018,1.56
119,,2019,1.53
120,,2020,1.48


In [79]:
norwaypop

Unnamed: 0,Folkmängd Norge,Årtal,Total fertilitet Norge
0,2231.0,1900,4.40
1,2255.0,1901,4.37
2,2276.0,1902,4.26
3,2288.0,1903,4.16
4,2298.0,1904,4.07
...,...,...,...
117,,2017,1.62
118,,2018,1.56
119,,2019,1.53
120,,2020,1.48


In [80]:
# Extract sweden's fertility
sefert = swedenpop.set_index(
    "Årtal"
).drop(
    swedenpop.columns[1:-1], axis="columns"
).rename(
    {swedenpop.columns[-1]:"Fertilitet i SE"}, axis="columns"
)

# Extract Norway's fertility
nofert = norwaypop.set_index(
    "Årtal"
).drop(
    "Folkmängd Norge", axis="columns"
).rename(
    {norwaypop.columns[-1]:"Fertilitet i NO"}, axis="columns"
)

# Merge Sweden and Norways fertility rates
senofert = pd.concat(
    [sefert,nofert], 
    axis="columns"
)

# Plot these fertility rates
px.line(senofert)



In [82]:
# Population graph with only the 1900s
px.line(
    scandinaviapop.iloc[-121:],
    range_y=[0,11e6],
    markers=True
)
