## Summary of Changes:
Function Definitions: Created functions for repetitive tasks to improve readability and maintainability.

Loop Structures: Utilized loops for repetitive tasks to reduce code redundancy.

Descriptive Variable Names: Improved variable names for better clarity.

Plotting Function: Encapsulated plotting code in a function for better reuse.

This refactored code should be easier to read, maintain, and extend for additional countries or tasks.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gmean

# Set the working directory
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where this drive mapping exists. Consider making it configurable.
os.chdir("Z:\\File folders\\Teaching\\Reproducible Research\\2023\\Repository\\RRcourse2023\\6. Coding and documentation")

# Load the O*NET task data (os.path.join keeps the relative path portable
# across operating systems instead of hard-coding "\\" separators)
task_data = pd.read_csv(os.path.join("Data", "onet_tasks.csv"))

# Load the Eurostat employment data
# One sheet per 1-digit ISCO group: "ISCO1" ... "ISCO9"
isco_sheets = [f"ISCO{digit}" for digit in range(1, 10)]
# Passing the list of sheet names makes pandas open the workbook once and
# return a dict {sheet name: DataFrame}, instead of re-reading the file for
# every sheet.
isco_data = pd.read_excel(
    os.path.join("Data", "Eurostat_employment_isco.xlsx"),
    sheet_name=isco_sheets,
)

# Calculate worker totals for each country
def calculate_total_workers(isco_data, country):
    """Add up the `country` column across every table in `isco_data`.

    Args:
        isco_data: Mapping whose values can be indexed with `country`
            (DataFrames in this script).
        country: Name of the column to add up.

    Returns:
        The element-wise sum of the selected columns. The accumulation starts
        from 0, so an empty mapping yields the integer 0.
    """
    running_total = 0
    for frame in isco_data.values():
        running_total = running_total + frame[country]
    return running_total

# Total workers per country, summed over all ISCO sheets. Computed before the
# 'ISCO' column is added below; it only reads the country columns.
total_workers = {
    country: calculate_total_workers(isco_data, country)
    for country in ("Belgium", "Spain", "Poland")
}

# Merge ISCO data and add occupation category column
# Sheets are numbered from 1 in the order they appear in isco_sheets.
for i, sheet in enumerate(isco_sheets, start=1):
    isco_data[sheet]['ISCO'] = i

all_data = pd.concat(isco_data.values(), ignore_index=True)

# Add total workers and share columns
for country in total_workers:
    # all_data stacks one copy of each sheet, so the per-sheet totals have to
    # be repeated once per stacked sheet. Use the actual number of sheets
    # rather than a hard-coded 9 so the two cannot drift apart.
    all_data[f"total_{country}"] = pd.concat(
        [total_workers[country]] * len(isco_data), ignore_index=True
    )
    all_data[f"share_{country}"] = all_data[country] / all_data[f"total_{country}"]

# Process task data
# The leading character of each isco08 code becomes the 1-digit group.
task_data["isco08_1dig"] = task_data["isco08"].astype(str).str.slice(stop=1).astype(int)
# Average every column within each 1-digit group, then discard the averaged
# isco08 code column.
aggdata = task_data.groupby("isco08_1dig").mean().drop("isco08", axis=1)

# Merge all_data with aggdata
# Left join: every employment row is kept and receives the task means of its
# ISCO group.
combined = all_data.merge(aggdata, how="left", left_on="ISCO", right_on="isco08_1dig")

# Standardize task values
def standardize_tasks(combined, task, country):
    """Add a share-weighted standardized version of one task column.

    The weighted mean and weighted standard deviation of `combined[task]` are
    computed with `combined[f"share_{country}"]` as weights, and the
    standardized values are stored in `combined[f"std_{country}_{task}"]`.

    Args:
        combined: DataFrame holding the task column and the share column;
            modified in place.
        task: Name of the task column to standardize.
        country: Country whose share column supplies the weights.

    Returns:
        None.
    """
    values = combined[task]
    weights = combined[f"share_{country}"]

    weighted_mean = np.average(values, weights=weights)
    deviations = values - weighted_mean
    weighted_variance = np.average(deviations ** 2, weights=weights)

    combined[f"std_{country}_{task}"] = deviations / np.sqrt(weighted_variance)

tasks = [
    "t_4A2a4",
    "t_4A2b2",
    "t_4A4a1",
]

# Standardize every task for every country (task-major order, so the new
# columns are appended task by task).
for task_name in tasks:
    for country_name in total_workers:
        standardize_tasks(combined, task_name, country_name)

# Calculate Non-routine cognitive analytical (NRCA) scores
def calculate_NRCA(combined, country, task_names=None):
    """Add the raw NRCA score column for one country.

    The score is the row-wise sum of the standardized task columns
    `std_{country}_{task}` and is stored in `combined[f"{country}_NRCA"]`.

    Args:
        combined: DataFrame holding the standardized task columns; modified
            in place.
        country: Country whose standardized task columns are summed.
        task_names: Task identifiers to include. Defaults to the module-level
            `tasks` list, which keeps existing two-argument calls working.

    Returns:
        None.
    """
    if task_names is None:
        # Fall back to the module-level task list, resolved at call time.
        task_names = tasks
    combined[f"{country}_NRCA"] = sum(
        combined[f"std_{country}_{task}"] for task in task_names
    )

# Build the raw NRCA score column for each country in turn.
for country_name in total_workers:
    calculate_NRCA(combined, country_name)

# Standardize NRCA scores
def standardize_NRCA(combined, country):
    """Add a share-weighted standardized NRCA column for one country.

    Reads `combined[f"{country}_NRCA"]`, weights it by
    `combined[f"share_{country}"]`, and writes the standardized scores to
    `combined[f"std_{country}_NRCA"]`.

    Args:
        combined: DataFrame holding the NRCA and share columns; modified in
            place.
        country: Country whose NRCA scores are standardized.

    Returns:
        None.
    """
    score_col = f"{country}_NRCA"
    scores = combined[score_col]
    shares = combined[f"share_{country}"]

    mu = np.average(scores, weights=shares)
    sigma = np.sqrt(np.average((scores - mu) ** 2, weights=shares))

    combined[f"std_{score_col}"] = (scores - mu) / sigma

# Standardize the NRCA score of each country using its own employment shares.
for country_name in total_workers:
    standardize_NRCA(combined, country_name)

# Calculate country-level mean NRCA over time
def calculate_country_NRCA_over_time(combined, country):
    """Aggregate a country's share-weighted standardized NRCA by TIME.

    Stores the product of `std_{country}_NRCA` and `share_{country}` in
    `combined[f"multip_{country}_NRCA"]` (in place), then sums that column
    within each value of the "TIME" column.

    Args:
        combined: DataFrame with "TIME", the standardized NRCA column and the
            share column for `country`; gains the `multip_` column.
        country: Country to aggregate.

    Returns:
        DataFrame with columns "TIME" and `multip_{country}_NRCA`, one row
        per distinct TIME value.
    """
    weighted_col = f"multip_{country}_NRCA"
    combined[weighted_col] = combined[f"std_{country}_NRCA"] * combined[f"share_{country}"]

    per_period = combined.groupby("TIME")[weighted_col].sum()
    return per_period.reset_index()

# One aggregated TIME series per country, keyed by country name.
agg_NRCA = {}
for country in total_workers:
    agg_NRCA[country] = calculate_country_NRCA_over_time(combined, country)

# Plot NRCA over time
def plot_NRCA_over_time(agg_data, country):
    """Draw and show a line chart of a country's aggregated NRCA series.

    Args:
        agg_data: Table with a "TIME" column and a `multip_{country}_NRCA`
            column.
        country: Country name, used to select the value column and in the
            chart title.

    Returns:
        None. The figure is displayed with `plt.show()`.
    """
    periods = agg_data["TIME"]
    plt.plot(periods, agg_data[f"multip_{country}_NRCA"])

    # Label only every third observation on the x axis.
    tick_positions = range(0, len(agg_data), 3)
    tick_labels = periods[::3]
    plt.xticks(tick_positions, tick_labels)

    plt.title(f"NRCA Over Time in {country}")
    plt.xlabel("Time")
    plt.ylabel("NRCA")
    plt.show()

# Show one chart per country, in the order the countries were defined.
for country_name in total_workers:
    country_series = agg_NRCA[country_name]
    plot_NRCA_over_time(country_series, country_name)
