# Productivity Forecast

Labour productivity is defined as real gross domestic product (GDP) per hour worked. This captures the use of labour inputs better than just output per employee, with labour input defined as total hours worked by all persons involved. The data are derived as average hours worked multiplied by the corresponding and consistent measure of employment for each particular country. Forecast is based on an assessment of the economic climate in individual countries and the world economy, using a combination of model-based analyses and expert judgement. This indicator is measured as an index with 2010=1.


### Setup

In [69]:
import pandas as pd
import altair as alt

In [70]:

#alt.data_transformers.disable_max_rows()

## Data

Data Import

#### Dataset Labour productivity forecast 2019 - 2024

In [71]:
LINK = '/Users/Lea/Desktop/dst-projekt/laborproductivity_20002024.csv'

df = pd.read_csv(LINK)

In [72]:
df
# falls 50 Zeilen werden alle angezeigt 
#df.head() für begrenzte Anzeigen

Unnamed: 0,LOCATION,INDICATOR,SUBJECT,MEASURE,FREQUENCY,TIME,Value,Flag Codes
0,DEU,GDPHRWKDFORECAST,TOT,IDX2015,A,2000,0.911951,
1,DEU,GDPHRWKDFORECAST,TOT,IDX2015,A,2001,0.931053,
2,DEU,GDPHRWKDFORECAST,TOT,IDX2015,A,2002,0.934060,
3,DEU,GDPHRWKDFORECAST,TOT,IDX2015,A,2003,0.937559,
4,DEU,GDPHRWKDFORECAST,TOT,IDX2015,A,2004,0.941154,
...,...,...,...,...,...,...,...,...
70,EA17,GDPHRWKDFORECAST,TOT,IDX2015,A,2020,0.967096,
71,EA17,GDPHRWKDFORECAST,TOT,IDX2015,A,2021,1.008287,
72,EA17,GDPHRWKDFORECAST,TOT,IDX2015,A,2022,1.019841,
73,EA17,GDPHRWKDFORECAST,TOT,IDX2015,A,2023,1.012843,


In [73]:
df.info()

# Objekt kann immer alles sein
# float = numerisch stetig

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   LOCATION    75 non-null     object 
 1   INDICATOR   75 non-null     object 
 2   SUBJECT     75 non-null     object 
 3   MEASURE     75 non-null     object 
 4   FREQUENCY   75 non-null     object 
 5   TIME        75 non-null     int64  
 6   Value       75 non-null     float64
 7   Flag Codes  0 non-null      float64
dtypes: float64(2), int64(1), object(5)
memory usage: 4.8+ KB


### Eliminate Clutter

In [74]:
df_drop = ['FREQUENCY', 'Flag Codes', 'INDICATOR', 'MEASURE']

for column in df_drop:
    if column in df.columns:
        df.drop(column, axis=1, inplace=True)


In [75]:
df

Unnamed: 0,LOCATION,SUBJECT,TIME,Value
0,DEU,TOT,2000,0.911951
1,DEU,TOT,2001,0.931053
2,DEU,TOT,2002,0.934060
3,DEU,TOT,2003,0.937559
4,DEU,TOT,2004,0.941154
...,...,...,...,...
70,EA17,TOT,2020,0.967096
71,EA17,TOT,2021,1.008287
72,EA17,TOT,2022,1.019841
73,EA17,TOT,2023,1.012843


In [76]:
list_cat = ['LOCATION', 'SUBJECT']

for i in list_cat:
    df[i] = df[i].astype('category')

In [77]:
df.rename(columns={'Value': 'VALUE'}, inplace=True)

In [78]:
# Time as Year
df['TIME'] = pd.to_datetime(df['TIME'], format='%Y').dt.year

In [83]:
# NUR GANZZAHLIGE VALUE EINTRÄGE
#df['VALUE'] = df['VALUE'].astype(int)
df['VALUE'] = df['VALUE'].round(2)


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75 entries, 0 to 74
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   LOCATION  75 non-null     category
 1   SUBJECT   75 non-null     category
 2   TIME      75 non-null     int32   
 3   VALUE     75 non-null     float64 
dtypes: category(2), float64(1), int32(1)
memory usage: 1.4 KB


Focus on selected countries and eliminate double listing

In [85]:
df3_selectedlocations = df[df['LOCATION'].isin(['EU17', 'DEU', 'GBR'])]

df3_selectedlocations

Unnamed: 0,LOCATION,SUBJECT,TIME,VALUE
0,DEU,TOT,2000,0.91
1,DEU,TOT,2001,0.93
2,DEU,TOT,2002,0.93
3,DEU,TOT,2003,0.94
4,DEU,TOT,2004,0.94
5,DEU,TOT,2005,0.95
6,DEU,TOT,2006,0.98
7,DEU,TOT,2007,1.0
8,DEU,TOT,2008,0.99
9,DEU,TOT,2009,0.93


In [82]:
df3_selectedlocations.count()

LOCATION    50
SUBJECT     50
TIME        50
VALUE       50
dtype: int64

### Data Explo

In [89]:
df3_selectedlocations.describe()

Unnamed: 0,TIME,VALUE
count,50.0,50.0
mean,2012.0,0.9822
std,7.284314,0.039605
min,2000.0,0.89
25%,2006.0,0.95
50%,2012.0,0.99
75%,2018.0,1.01
max,2024.0,1.05


### Data Viz

In [90]:
#Color Scale

colors = alt.Scale(
    range=['#003f5c','#58508d','#bc5090','#ff6361','#ffa600']
)
colors


Scale({
  range: ['#003f5c', '#58508d', '#bc5090', '#ff6361', '#ffa600']
})

Line Chart Viz

In [101]:
colors_linechart3 = alt.Scale(
    range=['#58508d','#ff6361','#ffa600']
)
colors_linechart3

Scale({
  range: ['#58508d', '#ff6361', '#ffa600']
})

In [204]:
linechart3 = alt.Chart(df3_selectedlocations).mark_line().encode(
    x=alt.X('TIME:O', title='Jahr').axis(
        titleAnchor='start',
        labelAngle= -0,
        ),
    y=alt.Y('VALUE').scale(domain=(0.8,1.25)).axis(
        title='GDP per hour worked',
        titleAnchor='end',
        grid= False,
        ),
    strokeWidth=alt.value(2), 
    color=alt.Color('LOCATION', scale=colors_linechart3),
    tooltip=['LOCATION']
).properties(
    title='Labor Productivity Forecast'
)


In [205]:
location_list = df3_selectedlocations['LOCATION'].tolist()

linechart_labels = alt.Chart(df3_selectedlocations).mark_text(align='left', dx=3).encode(
    alt.X('TIME:O', aggregate='max'),
    alt.Y('VALUE:Q', aggregate={'argmax': 'VALUE'}),
    alt.Text('LOCATION'),
    alt.Color('LOCATION:N', legend=None, scale=alt.Scale(domain=location_list,type='ordinal')), 
).properties(
    width=800,
    height=500,    
)

In [206]:
linechart3_final = alt.layer(linechart3, linechart_labels).configure_view(
    strokeWidth=0
).configure_title(
    fontSize=20,
    anchor='start',
    fontWeight='bold',
).configure_axis(
    labelFontSize = 11,
    titleFontSize = 12,
    titleFontWeight= 'normal',
    titleColor='grey'
).configure_text(
    fontWeight='bold',
    fontSize = 12
)

linechart3_final