In [30]:
import pandas as pd
import os
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc
from datetime import datetime
from scipy import sparse
import xlrd
from stats_can import StatsCan

In [31]:
# Option d'affichage
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)

In [32]:
# Options d'affchage pour les graphiques
rc('font', **{'family': 'serif', 'serif': ['Palatino']})
rc('text', usetex=True)
palette = ['#002855', '#26d07c', '#ff585d', '#f3d03e', '#0072ce', '#eb6fbd', '#00aec7', '#888b8d']

# Value added

### 1947 à 1996

In [33]:
# 1947 à 1996
value_added_1 = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "value_added_1947.xlsx"
)

df_v1 = pd.read_excel(value_added_1, header = 5)

# Changer le nom de colonne
df_v1.columns.values[1] = "industry"
df_v1["industry"] = df_v1["industry"].str.strip()

# Dictionnaire python pour ajotuer les NAICS
naics = {
    "Farms": "111-112",
    "Forestry, fishing, and related activities": "113-115",
    "Utilities": "221",
    "Construction": "23",
    "Wood products": "321",
    "Nonmetallic mineral products": "327",
    "Primary metals": "331",
    "Fabricated metal products": "332",
    "Machinery": "333",
    "Computer and electronic products": "334",
    "Electrical equipment, appliances, and components": "335",
    "Motor vehicles, bodies and trailers, and parts": "336",
    "Furniture and related products": "337",
    "Miscellaneous manufacturing": "339",
    "Food and beverage and tobacco products": "311-312",
    "Textile mills and textile product mills": "313-314",
    "Apparel and leather and allied products": "315-316",
    "Paper products": "322",
    "Printing and related support activities": "323",
    "Petroleum and coal products": "324",
    "Chemical products": "325",
    "Plastics and rubber products": "326",
    "Wholesale trade": "41",
    "Retail trade": "44-45",
    "Transportation and warehousing": "48-49",
    "Information": "51",
    "Finance, insurance, real estate, rental, and leasing": "52-53",
    "Professional, scientific, and technical services": "54",
    "Administrative and waste management services": "56",
    "Health care and social assistance": "62",
    "Hospitals": "622",
    "Arts, entertainment, and recreation": "71",
    "Accommodation and food services": "72",
    "Other services, except government": "81"
}

# Ajouter les NAICS
df_v1["naics"] = df_v1["industry"].map(naics)


# Enlever les industries et sous industries inutiles
# ATTENTION : différences avec l'analyse industrielle canadienne
#   1. 113 et 114 sont aggrégé et deviennent : 113-114
#   2. Idem pour 311-312
#   3. Il n'y a pas l'industrie 115 dans les donneés américaines
df_v1 = df_v1.dropna(subset=["naics"])

# Remplacer les NaN par 0
df_v1 = df_v1.replace('---', 0)

# Enlever hospital de 622 (hospital) de 62
cols = [col for col in df_v1.columns if col not in ["naics", "industry"]]

# S'assurer que ces colonnes sont numériques (convertit si besoin)
for col in cols:
    df_v1[col] = pd.to_numeric(df_v1[col], errors='coerce')

if not ((df_v1["naics"] == "62").any() and (df_v1["naics"] == "622").any()):
    raise ValueError("La ligne avec naics '62' ou '622' est manquante dans le DataFrame.")


ligne_62 = df_v1[df_v1["naics"] == "62"][cols].values[0]
ligne_622 = df_v1[df_v1["naics"] == "622"][cols].values[0]
nouvelle_ligne_valeurs = ligne_62 - ligne_622
nouvelle_ligne = pd.DataFrame([nouvelle_ligne_valeurs], columns=cols)
df_v1 = df_v1[~df_v1["naics"].isin(["62", "622"])]
nouvelle_ligne.insert(0, "naics", "62")  # Ajouter la colonne 'naics' en première position
nouvelle_ligne.insert(0, "industry", "Health care and social assistance (except hospitals)")

# Ajouter la nouvelle ligne 62 au DataFrame
top = df_v1.iloc[:29]
bottom = df_v1.iloc[29:]
df_v1 = pd.concat([top, nouvelle_ligne, bottom], ignore_index=True)

# Mettre la colonne "naics" à la deuxième colonne
cols = list(df_v1.columns)
cols.insert(1, cols.pop(cols.index("naics")))
df_v1 = df_v1[cols]

# Reset l'index
df_v1 = df_v1.reset_index(drop=True)

# Drop les colonnes inutiles
df_v1 = df_v1.drop(columns = "Line")

  warn("Workbook contains no default style, apply openpyxl's default")
  df_v1 = df_v1.replace('---', 0)


### 1996 à 2019

In [34]:
# 1947 à 1996
value_added_2 = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "value_added_1997.xlsx"
)


df_v2 = pd.read_excel(value_added_2, header = 5)

# Changer le nom de colonne
df_v2.columns.values[1] = "industry"
df_v2["industry"] = df_v2["industry"].str.strip()

# Ajouter les NAICS
df_v2["naics"] = df_v2["industry"].map(naics)


# Enlever le industries et sous industries inutiles
# ATTENTION : différences avec l'analyse canadienne
#   1. 113 et 114 sont aggrégé et deviennent : 113-114
#   2. Idem pour 311-312
#   3. Il n'y a pas l'industrie 115 dans les donneés américaines
df_v2 = df_v2.dropna(subset=["naics"])

# Remplacer les NaN par 0
df_v2 = df_v2.replace('---', 0)

# Enlever hospital de 622 (hospital) de 62
cols = [col for col in df_v2.columns if col not in ["naics", "industry"]]

# S'assurer que ces colonnes sont numériques (convertit si besoin)
for col in cols:
    df_v2[col] = pd.to_numeric(df_v2[col], errors='coerce')

if not ((df_v2["naics"] == "62").any() and (df_v2["naics"] == "622").any()):
    raise ValueError("La ligne avec naics '62' ou '622' est manquante dans le DataFrame.")


ligne_62 = df_v2[df_v2["naics"] == "62"][cols].values[0]
ligne_622 = df_v2[df_v2["naics"] == "622"][cols].values[0]
nouvelle_ligne_valeurs = ligne_62 - ligne_622
nouvelle_ligne = pd.DataFrame([nouvelle_ligne_valeurs], columns=cols)
df_v2 = df_v2[~df_v2["naics"].isin(["62", "622"])]
nouvelle_ligne.insert(0, "naics", "62")  # Ajouter la colonne 'naics' en première position
nouvelle_ligne.insert(0, "industry", "Health care and social assistance (except hospitals)")

# Ajouter la nouvelle ligne 62 au DataFrame
top = df_v2.iloc[:29]
bottom = df_v2.iloc[29:]
df_v2 = pd.concat([top, nouvelle_ligne, bottom], ignore_index=True)

# Mettre la colonne "naics" à la deuxième colonne
cols = list(df_v2.columns)
cols.insert(1, cols.pop(cols.index("naics")))
df_v2 = df_v2[cols]

# Reset l'index
df_v2 = df_v2.reset_index(drop=True)

# Drop les colonnes inutiles
df_v2 = df_v2.drop(columns = "Line")


### Dataframe final (1947 à 2019)

In [35]:
# Supprimer les colonnes identiques ('industry' et 'naics') dans df_v2
df_v2 = df_v2.drop(columns=["naics", "industry"])

# Fusionner horizontalement les deux DataFrames
df_value_added = pd.concat([df_v1, df_v2], axis=1)

# Transformer en format longue
id_vars = ["naics", "industry"]
value_vars = [col for col in df_value_added.columns if col not in id_vars]
df_value_added = pd.melt(
    df_value_added,
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="year",
    value_name="va"
)

In [36]:
df_value_added

Unnamed: 0,naics,industry,year,va
0,111-112,Farms,1947,19.1
1,113-115,"Forestry, fishing, and related activities",1947,0.9
2,221,Utilities,1947,3.5
3,23,Construction,1947,8.9
4,321,Wood products,1947,1.7
...,...,...,...,...
2404,56,Administrative and waste management services,2019,670.2
2405,62,Health care and social assistance (except hospitals),2019,1088.9
2406,71,"Arts, entertainment, and recreation",2019,237.1
2407,72,Accommodation and food services,2019,685.0


# Gross ouptut

### 1947 à 1996

In [37]:
# 1947 à 1996
gross_output_1 = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "gross output 1947-1996.xlsx"
)

df_g1 = pd.read_excel(gross_output_1, header = 5)

# Changer le nom de colonne
df_g1.columns.values[1] = "industry"
df_g1["industry"] = df_g1["industry"].str.strip()

# Ajouter les NAICS
df_g1["naics"] = df_g1["industry"].map(naics)


# Enlever le industries et sous industries inutiles
# ATTENTION : différences avec l'analyse canadienne
#   1. 113 et 114 sont aggrégé et deviennent : 113-114
#   2. Idem pour 311-312
#   3. Il n'y a pas l'industrie 115 dans les donneés américaines
df_g1 = df_g1.dropna(subset=["naics"])

# Remplacer les NaN par 0
df_g1 = df_g1.replace('---', 0)

# Enlever hospital de 622 (hospital) de 62
cols = [col for col in df_g1.columns if col not in ["naics", "industry"]]

# S'assurer que ces colonnes sont numériques (convertit si besoin)
for col in cols:
    df_g1[col] = pd.to_numeric(df_g1[col], errors='coerce')

if not ((df_g1["naics"] == "62").any() and (df_g1["naics"] == "622").any()):
    raise ValueError("La ligne avec naics '62' ou '622' est manquante dans le DataFrame.")


ligne_62 = df_g1[df_g1["naics"] == "62"][cols].values[0]
ligne_622 = df_g1[df_g1["naics"] == "622"][cols].values[0]
nouvelle_ligne_valeurs = ligne_62 - ligne_622
nouvelle_ligne = pd.DataFrame([nouvelle_ligne_valeurs], columns=cols)
df_g1 = df_g1[~df_g1["naics"].isin(["62", "622"])]
nouvelle_ligne.insert(0, "naics", "62")  # Ajouter la colonne 'naics' en première position
nouvelle_ligne.insert(0, "industry", "Health care and social assistance (except hospitals)")

# Ajouter la nouvelle ligne 62 au DataFrame
top = df_g1.iloc[:29]
bottom = df_g1.iloc[29:]
df_g1 = pd.concat([top, nouvelle_ligne, bottom], ignore_index=True)

# Mettre la colonne "naics" à la deuxième colonne
cols = list(df_g1.columns)
cols.insert(1, cols.pop(cols.index("naics")))
df_g1 = df_g1[cols]

# Reset l'index
df_g1 = df_g1.reset_index(drop=True)

# Drop les colonnes inutiles
df_g1 = df_g1.drop(columns = "Line")

  warn("Workbook contains no default style, apply openpyxl's default")


### 1997 à 2019

In [38]:
# 1947 à 1996
gross_output_2 = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "gross output 1997-2019.xlsx"
)


df_g2 = pd.read_excel(gross_output_2, header = 5)

# Changer le nom de colonne
df_g2.columns.values[1] = "industry"
df_g2["industry"] = df_g2["industry"].str.strip()

# Ajouter les NAICS
df_g2["naics"] = df_g2["industry"].map(naics)


# Enlever le industries et sous industries inutiles
# ATTENTION : différences avec l'analyse canadienne
#   1. 113 et 114 sont aggrégé et deviennent : 113-114
#   2. Idem pour 311-312
#   3. Il n'y a pas l'industrie 115 dans les donneés américaines
df_g2 = df_g2.dropna(subset=["naics"])

# Remplacer les NaN par 0
df_g2 = df_g2.replace('---', 0)

# Enlever hospital de 622 (hospital) de 62
cols = [col for col in df_g2.columns if col not in ["naics", "industry"]]

# S'assurer que ces colonnes sont numériques (convertit si besoin)
for col in cols:
    df_g2[col] = pd.to_numeric(df_g2[col], errors='coerce')

if not ((df_g2["naics"] == "62").any() and (df_g2["naics"] == "622").any()):
    raise ValueError("La ligne avec naics '62' ou '622' est manquante dans le DataFrame.")


ligne_62 = df_g2[df_g2["naics"] == "62"][cols].values[0]
ligne_622 = df_g2[df_g2["naics"] == "622"][cols].values[0]
nouvelle_ligne_valeurs = ligne_62 - ligne_622
nouvelle_ligne = pd.DataFrame([nouvelle_ligne_valeurs], columns=cols)
df_g2 = df_g2[~df_g2["naics"].isin(["62", "622"])]
nouvelle_ligne.insert(0, "naics", "62")  # Ajouter la colonne 'naics' en première position
nouvelle_ligne.insert(0, "industry", "Health care and social assistance (except hospitals)")

# Ajouter la nouvelle ligne 62 au DataFrame
top = df_g2.iloc[:29]
bottom = df_g2.iloc[29:]
df_g2 = pd.concat([top, nouvelle_ligne, bottom], ignore_index=True)

# Mettre la colonne "naics" à la deuxième colonne
cols = list(df_g2.columns)
cols.insert(1, cols.pop(cols.index("naics")))
df_g2 = df_g2[cols]

# Reset l'index
df_g2 = df_g2.reset_index(drop=True)

# Drop les colonnes inutiles
df_g2 = df_g2.drop(columns = "Line")

  warn("Workbook contains no default style, apply openpyxl's default")
  df_g2 = df_g2.replace('---', 0)


### Dataframe final (1947 à 2019)

In [39]:
# Supprimer les colonnes identiques ('industry' et 'naics') dans df_g2
df_g2 = df_g2.drop(columns=["industry", "naics"])

# Fusionner horizontalement les deux DataFrames
df_gross = pd.concat([df_g1, df_g2], axis=1)

# Transformer en format longue
value_vars = [col for col in df_gross.columns if col not in id_vars]
df_gross = pd.melt(
    df_gross,
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="year",
    value_name="sales"
)

# Real value added

### 1947 à 1996

In [40]:
# 1947 à 1996
real_added_1 = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "real_1947.xlsx"
)

df_r1 = pd.read_excel(real_added_1, header = 5)

# Changer le nom de colonne
df_r1.columns.values[1] = "industry"
df_r1["industry"] = df_r1["industry"].str.strip()

# Ajouter les NAICS
df_r1["naics"] = df_r1["industry"].map(naics)


# Enlever le industries et sous industries inutiles
# ATTENTION : différences avec l'analyse canadienne
#   1. 113 et 114 sont aggrégé et deviennent : 113-114
#   2. Idem pour 311-312
#   3. Il n'y a pas l'industrie 115 dans les donneés américaines
df_r1 = df_r1.dropna(subset=["naics"])

# Remplacer les NaN par 0
df_r1 = df_r1.replace('---', 0)

# Enlever hospital de 622 (hospital) de 62
cols = [col for col in df_r1.columns if col not in ["naics", "industry"]]

# S'assurer que ces colonnes sont numériques (convertit si besoin)
for col in cols:
    df_r1[col] = pd.to_numeric(df_r1[col], errors='coerce')

if not ((df_r1["naics"] == "62").any() and (df_r1["naics"] == "622").any()):
    raise ValueError("La ligne avec naics '62' ou '622' est manquante dans le DataFrame.")


ligne_62 = df_r1[df_r1["naics"] == "62"][cols].values[0]
ligne_622 = df_r1[df_r1["naics"] == "622"][cols].values[0]
nouvelle_ligne_valeurs = ligne_62 - ligne_622
nouvelle_ligne = pd.DataFrame([nouvelle_ligne_valeurs], columns=cols)
df_r1 = df_r1[~df_r1["naics"].isin(["62", "622"])]
nouvelle_ligne.insert(0, "naics", "62")  # Ajouter la colonne 'naics' en première position
nouvelle_ligne.insert(0, "industry", "Health care and social assistance (except hospitals)")

# Ajouter la nouvelle ligne 62 au DataFrame
top = df_r1.iloc[:29]
bottom = df_r1.iloc[29:]
df_r1 = pd.concat([top, nouvelle_ligne, bottom], ignore_index=True)

# Mettre la colonne "naics" à la deuxième colonne
cols = list(df_r1.columns)
cols.insert(1, cols.pop(cols.index("naics")))
df_r1 = df_r1[cols]

# Reset l'index
df_r1 = df_r1.reset_index(drop=True)

# Drop les colonnes inutiles
df_r1 = df_r1.drop(columns = "Line")

  warn("Workbook contains no default style, apply openpyxl's default")
  df_r1 = df_r1.replace('---', 0)


### 1997 à 2019

In [41]:
# 1947 à 1996
real_added_2 = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "real_1997.xlsx"
)


df_r2 = pd.read_excel(real_added_2, header = 5)

# Changer le nom de colonne
df_r2.columns.values[1] = "industry"
df_r2["industry"] = df_r2["industry"].str.strip()

# Ajouter les NAICS
df_r2["naics"] = df_r2["industry"].map(naics)


# Enlever le industries et sous industries inutiles
# ATTENTION : différences avec l'analyse canadienne
#   1. 113 et 114 sont aggrégé et deviennent : 113-114
#   2. Idem pour 311-312
#   3. Il n'y a pas l'industrie 115 dans les donneés américaines
df_r2 = df_r2.dropna(subset=["naics"])

# Remplacer les NaN par 0
df_r2 = df_r2.replace('---', 0)

# Enlever hospital de 622 (hospital) de 62
cols = [col for col in df_r2.columns if col not in ["naics", "industry"]]

# S'assurer que ces colonnes sont numériques (convertit si besoin)
for col in cols:
    df_r2[col] = pd.to_numeric(df_r2[col], errors='coerce')

if not ((df_r2["naics"] == "62").any() and (df_r2["naics"] == "622").any()):
    raise ValueError("La ligne avec naics '62' ou '622' est manquante dans le DataFrame.")


ligne_62 = df_r2[df_r2["naics"] == "62"][cols].values[0]
ligne_622 = df_r2[df_r2["naics"] == "622"][cols].values[0]
nouvelle_ligne_valeurs = ligne_62 - ligne_622
nouvelle_ligne = pd.DataFrame([nouvelle_ligne_valeurs], columns=cols)
df_r2 = df_r2[~df_r2["naics"].isin(["62", "622"])]
nouvelle_ligne.insert(0, "naics", "62")  # Ajouter la colonne 'naics' en première position
nouvelle_ligne.insert(0, "industry", "Health care and social assistance (except hospitals)")

# Ajouter la nouvelle ligne 62 au DataFrame
top = df_r2.iloc[:29]
bottom = df_r2.iloc[29:]
df_r2 = pd.concat([top, nouvelle_ligne, bottom], ignore_index=True)

# Mettre la colonne "naics" à la deuxième colonne
cols = list(df_r2.columns)
cols.insert(1, cols.pop(cols.index("naics")))
df_r2 = df_r2[cols]

# Reset l'index
df_r2 = df_r2.reset_index(drop=True)

# Drop les colonnes inutiles
df_r2 = df_r2.drop(columns = "Line")

  warn("Workbook contains no default style, apply openpyxl's default")


### Dataframe final (1947 à 2019)

In [42]:
# Supprimer les colonnes identiques ('industry' et 'naics') dans df_r2
df_r2 = df_r2.drop(columns=["industry", "naics"])

# Fusionner horizontalement les deux DataFrames
df_real = pd.concat([df_r1, df_r2], axis=1)

# Transformer en format longue
value_vars = [col for col in df_real.columns if col not in id_vars]
df_real = pd.melt(
    df_real,
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="year",
    value_name="real_va"
)

# Dataframe value-added, gross output et real value-added

In [43]:
df = pd.merge(
    df_value_added,
    df_real,
    on=["naics", "industry", "year"],
    how="inner"
)

df = pd.merge(
    df,
    df_gross,
    on=["naics", "industry", "year"],
    how="inner"
)

# Productivité

In [44]:
fichier_tfp = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "tfp.xlsx"
)

df_tfp = pd.read_excel(fichier_tfp, sheet_name = "MachineReadable")

In [45]:
# Harmoniser avec les dataframes de value added
dic_tfp = {
    "22": "221",
    "3361-3363": "336",
    "42": "41",
    "44,45": "44-45",
    "622-623": "622"
}

df_tfp["NAICS"] = df_tfp["NAICS"].replace(dic_tfp)
df_tfp = df_tfp.rename(columns={
    "NAICS": "naics"
    })

# Les industries à garder
liste_valeurs = [
    "111-112",
    "113-115",
    "221",
    "23",
    "321",
    "327",
    "331",
    "332",
    "333",
    "334",
    "335",
    "336",
    "337",
    "339",
    "311-312",
    "313-314",
    "315-316",
    "322",
    "323",
    "324",
    "325",
    "326",
    "41",
    "44-45",
    "48-49",
    "51",
    "52-53",
    "54",
    "56",
    "62",
    "71",
    "72",
    "81"
]

# Garder ces industries seulement
df_tfp = df_tfp[df_tfp["naics"].isin(liste_valeurs)]

# Garder les variables utiles 
variables_a_garder = [
    "Total factor productivity",
    "Labor input",
    "Capital input",
    "Labor costs",
    "Hours worked",
    "Capital costs"
]

# Enlever les éléments inutiles du dataframe
df_tfp = df_tfp[df_tfp["Measure"].isin(variables_a_garder)]
df_tfp = df_tfp.drop(columns = ["Basis", "Units"])

# Enlever les NaN de la colonne "Value"
df_tfp = df_tfp[df_tfp["Value"] != "N.A."]

# Créer des colones pour nos variables (mesures) pertinentes
df_tfp = df_tfp.pivot_table(
    index=["naics", "Industry", "Year"],  # Identifiants uniques
    columns="Measure",                    # Valeurs qui deviendront des colonnes
    values="Value",                       # Valeurs à placer dans les nouvelles colonnes
    aggfunc="first"                       # Si doublons, prendre la première valeur
).reset_index()

# Renommer les colonnes
df_tfp = df_tfp.rename(columns={
    "Year": "year",
    "Industry": "industry",
    "Total factor productivity": "tfp",
    "Labor input": "labor",
    "Capital input": "capital",
    "Labor costs": "labor_cost",
    "Hours worked": "hours",
    "Capital costs": "capital_cost",
    })

# Harmoniser la colonnes "industry"
industry = {
    "Crop & animal production (Farms)": "Farms",
    "Primary metal products": "Primary metals",
    "Apparel and leather and applied products": "Apparel and leather and allied products",
    "Finance, insurance, real estate, and leasing": "Finance, insurance, real estate, rental, and leasing",
    "Health care and social assistance": "Health care and social assistance (except hospitals)"
}

df_tfp["industry"] = df_tfp["industry"].replace(industry)

# Fusionner les deux datagframe
df_tfp["year"] = df_tfp["year"].astype(int)
df["year"] = df["year"].astype(int)

df = df_tfp.merge(
    df,
    on=["naics", "year", "industry"],
    how="left"
)

# Garder les données avant 2020
df = df[df['year'] < 2020]

# Liste des colonnes à convertir en numérique
colonnes_numeriques = ['tfp', 'capital', 'labor', 'va', 'real_va', 'capital_cost', 'labor_cost']

# Appliquer la conversion avec gestion des erreurs
for col in colonnes_numeriques:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [46]:
for i in df["industry"].unique():
    print(i)

Farms
Forestry, fishing, and related activities
Utilities
Construction
Food and beverage and tobacco products
Textile mills and textile product mills
Apparel and leather and allied products
Wood products
Paper products
Printing and related support activities
Petroleum and coal products
Chemical products
Plastics and rubber products
Nonmetallic mineral products
Primary metals
Fabricated metal products
Machinery
Computer and electronic products
Electrical equipment, appliances, and components
Motor vehicles, bodies and trailers, and parts
Furniture and related products
Miscellaneous manufacturing
Wholesale trade
Retail trade
Transportation and warehousing
Information
Finance, insurance, real estate, rental, and leasing
Professional, scientific, and technical services
Administrative and waste management services
Health care and social assistance (except hospitals)
Arts, entertainment, and recreation
Accommodation and food services
Other services, except government


In [47]:
for i in df["naics"].unique():
    print(i)

111-112
113-115
221
23
311-312
313-314
315-316
321
322
323
324
325
326
327
331
332
333
334
335
336
337
339
41
44-45
48-49
51
52-53
54
56
62
71
72
81


In [48]:
df.columns

Index(['naics', 'industry', 'year', 'capital_cost', 'capital', 'hours',
       'labor_cost', 'labor', 'tfp', 'va', 'real_va', 'sales'],
      dtype='object')

# Dataframe utilisé pour calculer lamda

In [49]:
df_cl = df[["industry",
            "capital_cost",
            "va", 
            "sales",
            "labor_cost", 
            "year", 
            "naics"]]

In [50]:
df_cl

Unnamed: 0,industry,capital_cost,va,sales,labor_cost,year,naics
0,Farms,57.960,62.0,147.1,20.262,1987,111-112
1,Farms,53.826,61.4,153.7,20.873,1988,111-112
2,Farms,60.486,73.9,171.9,22.637,1989,111-112
3,Farms,59.841,77.8,179.9,24.064,1990,111-112
4,Farms,52.002,70.4,175.3,23.207,1991,111-112
...,...,...,...,...,...,...,...
1245,"Other services, except government",28.005,403.5,650.7,270.380,2015,81
1246,"Other services, except government",28.785,415.9,682.2,285.821,2016,81
1247,"Other services, except government",33.855,433.2,708.9,300.171,2017,81
1248,"Other services, except government",38.076,457.7,741.3,315.074,2018,81


# Préparer les tableaux de IO

### 1947 à 1962

### 1963 à 1996

In [51]:
# Dictionnaire pour mapper les industries des donneés de IO tables
naics_mapping = {
    "111CA": "111-112",
    "113FF": "113-115",
    "22": "221",
    "23": "23",
    "321": "321",
    "327": "327",
    "331": "331",
    "332": "332",
    "333": "333",
    "334": "334",
    "335": "335",
    "3361MV": "336",
    "3364OT": "336",
    "337": "337",
    "339": "339",
    "311FT": "311-312",
    "313TT": "313-314",
    "315AL": "315-316",
    "322": "322",
    "323": "323",
    "324": "324",
    "325": "325",
    "326": "326",
    "42": "41",
    "44RT": "44-45",
    "441": "44-45", # Pour données 1997-2019
    "445": "44-45", # Pour données 1997-2019
    "452": "44-45", # Pour données 1997-2019
    "4A0": "44-45", # Pour données 1997-2019
    "481": "48-49",
    "482": "48-49",
    "483": "48-49",
    "484": "48-49",
    "485": "48-49",
    "486": "48-49",
    "487OS": "48-49",
    "493": "48-49",
    "511": "51",
    "512": "51",
    "513": "51",
    "514": "51",
    "521CI": "52-53",
    "523": "52-53",
    "524": "52-53",
    "525": "52-53",
    "531": "52-53",
    "532RL": "52-53",
    "5411": "54",
    "5415": "54",
    "5412OP": "54",
    "561": "56",
    "562": "56",
    "621": "62",
    #"622HO": "622", # À enlever de 62 (hospital)
    "624": "62",
    "711AS": "71",
    "713": "71",
    "721": "72",
    "722": "72",
    "81": "81"
}

# Le chemin pour fichier excel
io_1963 = os.path.join(
    Path(os.getcwd()).parent,
    "Data",
    "IO 1963-1996.xlsx"
)

# Créer le Dataframe vide
df_63_96 = pd.DataFrame()

# Itération pour chaque année
for year in range(1963, 1996 + 1):
    df = pd.read_excel(io_1963, sheet_name = str(year), header = 6)

    # Enlever les colones inutles et renommer les colonnes
    df = df.drop(columns = ["Industry Description"])
    df = df.rename(columns={ "Code": "supply_naics_agg"})

    # Ajouter colonne année
    df["year"] = (year)

    df = df.melt(
        id_vars=["supply_naics_agg", "year"],
        var_name="use_naics_agg",
        value_name="value"
        )

    # Mapper les codes selon le dictionnaire
    df["supply_naics_agg"] = df["supply_naics_agg"].map(naics_mapping)
    df["use_naics_agg"] = df["use_naics_agg"].map(naics_mapping)

    # Supprimer lignes avec NaN (non mappées)
    df = df.dropna(subset=["supply_naics_agg", "use_naics_agg"])

    # Enlever les lignes où "value" est "..."
    df["value"] = df["value"].replace ("...", 0)

    # Additioner les industries aggrégées
    df = df.groupby(['supply_naics_agg', 'use_naics_agg', 'year'])['value'].sum().reset_index()

    # Mettre les colonnes dans la bonne ordre
    df= df[["supply_naics_agg", "use_naics_agg", "year", "value"]]

    # Include the capital and labor costs
    df_capital = df_cl.loc[df_cl['year'] == year, ['capital_cost', 'naics']].rename(columns={'naics': 'use_naics_agg'})
    df_capital['supply_naics_agg'] = 'capital'
    df_capital['capital_cost'] = df_capital['capital_cost'] * 1000
    df = pd.merge(df, df_capital, on=['use_naics_agg', 'supply_naics_agg'], how='left')
    df.loc[(df['supply_naics_agg'] == 'capital') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'value'] = df.loc[(df['supply_naics_agg'] == 'capital') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'capital_cost']
    df = df.drop(columns=['capital_cost'])
    df_labor = df_cl.loc[df_cl['year'] == year, ['labor_cost', 'naics']].rename(columns={'naics': 'use_naics_agg'})
    df_labor['supply_naics_agg'] = 'labor'
    df_labor['labor_cost'] = df_labor['labor_cost'] * 1000
    df = pd.merge(df, df_labor, on=['supply_naics_agg', 'use_naics_agg'], how='left')
    df.loc[(df['supply_naics_agg'] == 'labor') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'value'] = df.loc[(df['supply_naics_agg'] == 'labor') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'labor_cost']
    df = df.drop(columns=['labor_cost'])

    # Fill in the missing values with 0
    df.loc[df['value'].isna(), 'value'] = 0

    # Calculate the cost share of each industry
    df['cost_share'] = df.groupby('use_naics_agg')['value'].transform(lambda x: x / x.sum())
    df.loc[df['cost_share'].isna(), 'cost_share'] = 0

    # Sort the data frame by year, use_naics_agg, and supply_naics_agg
    df = df.sort_values(by=['use_naics_agg', 'supply_naics_agg'])

    # Append the data to the Dataframe
    df_63_96 = pd.concat([df_63_96, df], ignore_index=True)

# Sort the data frame by year, use_naics_agg, and supply_naics_agg
df_63_96 = df_63_96.sort_values(by=['year', 'use_naics_agg', 'supply_naics_agg'])

  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"] = df["value"].replace ("...", 0)
  df["value"]

### 1997 à 2019


In [52]:
# Créer le Dataframe vide
df_97_19 = pd.DataFrame()

# Itération pour les années 1997 à 2019
for year in range(1997, 2019 + 1):
    df = pd.read_excel(
        os.path.join(Path(os.getcwd()).parent, 
                     "Data",
                     "Table " + str(year) + ".xlsx"), 
                     header= 5
    )
    
    # Enlever les colonnes inutiles et renommer
    df = df.drop(columns = ["Industries/Industries"])
    df = df.rename(columns={ "Unnamed: 0": "supply_naics_agg"})
    df = df.drop(index = 0)
    
    # Ajouter colonne année
    df["year"] = (year)

    df = df.melt(
        id_vars=["supply_naics_agg", "year"],
        var_name="use_naics_agg",
        value_name="value"
        )

    # Mapper les codes selon le dictionnaire
    df["supply_naics_agg"] = df["supply_naics_agg"].map(naics_mapping)
    df["use_naics_agg"] = df["use_naics_agg"].map(naics_mapping)

    # Supprimer lignes avec NaN (non mappées)
    df = df.dropna(subset=["supply_naics_agg", "use_naics_agg"])

    # Enlever les lignes où "value" est "..."
    df["value"] = df["value"].replace ("...", 0)

    # Additioner les industries aggrégées
    df = df.groupby(['supply_naics_agg', 'use_naics_agg', 'year'])['value'].sum().reset_index()

    # Mettre les colonnes dans la bonne ordre
    df= df[["supply_naics_agg", "use_naics_agg", "year", "value"]]

    # Include the capital and labor costs
    df_capital = df_cl.loc[df_cl['year'] == year, ['capital_cost', 'naics']].rename(columns={'naics': 'use_naics_agg'})
    df_capital['supply_naics_agg'] = 'capital'
    df_capital['capital_cost'] = df_capital['capital_cost'] * 1000
    df = pd.merge(df, df_capital, on=['use_naics_agg', 'supply_naics_agg'], how='left')
    df.loc[(df['supply_naics_agg'] == 'capital') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'value'] = df.loc[(df['supply_naics_agg'] == 'capital') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'capital_cost']
    df = df.drop(columns=['capital_cost'])
    df_labor = df_cl.loc[df_cl['year'] == year, ['labor_cost', 'naics']].rename(columns={'naics': 'use_naics_agg'})
    df_labor['supply_naics_agg'] = 'labor'
    df_labor['labor_cost'] = df_labor['labor_cost'] * 1000
    df = pd.merge(df, df_labor, on=['supply_naics_agg', 'use_naics_agg'], how='left')
    df.loc[(df['supply_naics_agg'] == 'labor') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'value'] = df.loc[(df['supply_naics_agg'] == 'labor') & ~df['use_naics_agg'].isin(['capital', 'labor']), 'labor_cost']
    df = df.drop(columns=['labor_cost'])

    # Fill in the missing values with 0
    df.loc[df['value'].isna(), 'value'] = 0

    # Calculate the cost share of each industry
    df['cost_share'] = df.groupby('use_naics_agg')['value'].transform(lambda x: x / x.sum())
    df.loc[df['cost_share'].isna(), 'cost_share'] = 0

    # Sort the data frame by year, use_naics_agg, and supply_naics_agg
    df = df.sort_values(by=['use_naics_agg', 'supply_naics_agg'])

    # Append the data to the Dataframe
    df_97_19 = pd.concat([df_97_19, df], ignore_index=True)

# Sort the data frame by year, use_naics_agg, and supply_naics_agg
df_97_19 = df_97_19.sort_values(by=['year', 'use_naics_agg', 'supply_naics_agg'])

  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn("Workbook contains no default style, apply openpyxl's default")
  warn

# Append the I-O tables across all years and calculate the lambda's

In [None]:
# Concatenate the data frames
df = pd.concat([df_63_96, df_97_19], ignore_index=True)
df = pd.merge(df, df_cl[['year', 'naics', 'sales']].rename(columns={'naics': 'use_naics_agg'}), how='left', on=['year', 'use_naics_agg'])
df = df.sort_values(by=['year', 'use_naics_agg', 'supply_naics_agg'])


Unnamed: 0,supply_naics_agg,use_naics_agg,year,value,cost_share,sales
0,111-112,111-112,1963,37237,1.0,
1,113-115,111-112,1963,0,0.0,
2,221,111-112,1963,0,0.0,
3,23,111-112,1963,0,0.0,
4,311-312,111-112,1963,0,0.0,
...,...,...,...,...,...,...
63157,56,81,2019,0.054673,0.036508,776.8
63158,62,81,2019,0.000136,0.000091,776.8
63159,71,81,2019,0.003772,0.002519,776.8
63160,72,81,2019,0.010519,0.007024,776.8


In [55]:
# Create the cost-based IO matrices for each year and calculate the lambda's
df_lambda = pd.DataFrame([(year, naics) for naics in df_cl['naics'].unique() for year in df_cl['year'].unique()], columns=['year', 'naics']).sort_values(by=['year', 'naics'])
df_lambda['lambda'] = np.nan
df_lambda['lambda_k'] = np.nan
df_lambda['lambda_l'] = np.nan
df_lambda['wedge'] = np.nan

In [None]:
# Puique la matrice pour certaines années n'est pas inversible, on essaie de diagnostiquer le problème avec l'année 1987.
df_year = df[df['year'] == 1987]
df_year['revenue_share'] = df_year['value'] / df_year['sales']
df_year.loc[df_year['revenue_share'].isna(), 'revenue_share'] = 0
df_year_cost = df_year.pivot(index='use_naics_agg', columns='supply_naics_agg', values='cost_share')
df_year_revenue = df_year.pivot(index='use_naics_agg', columns='supply_naics_agg', values='revenue_share')
naics_list = df_year_cost.index.tolist()

# 1. Vérifier s’il y a des lignes ou colonnes nulles
print("Somme des lignes (cost share) :")
print(df_year_cost.sum(axis=1).sort_values())

print("Somme des colonnes (cost share) :")
print(df_year_cost.sum(axis=0).sort_values())

# 2. Voir les valeurs de Omega_tilde
print("Matrice Omega_tilde (dense) :")
print(df_year_cost.round(3))

# 3. Vérifie les NaN ou objets
print("Types de colonnes :")
print(df_year_cost.dtypes)

print("Nombre de valeurs NaN :")
print(df_year_cost.isna().sum().sum())



Somme des lignes (cost share) :
use_naics_agg
23         1.0
322        1.0
339        1.0
313-314    1.0
54         1.0
325        1.0
337        1.0
41         1.0
44-45      1.0
48-49      1.0
111-112    1.0
336        1.0
52-53      1.0
56         1.0
62         1.0
71         1.0
51         1.0
335        1.0
333        1.0
72         1.0
332        1.0
327        1.0
326        1.0
324        1.0
323        1.0
321        1.0
315-316    1.0
311-312    1.0
221        1.0
113-115    1.0
334        1.0
81         1.0
331        1.0
dtype: object
Somme des colonnes (cost share) :
supply_naics_agg
54          0.67241
81         0.801074
113-115    0.852657
71          0.87035
23         0.935504
335        0.962224
311-312    0.968575
339        0.977059
326        0.981994
337        0.992615
48-49      0.994907
315-316    0.997581
327        0.997701
321        0.998426
313-314    1.002283
56         1.003507
62         1.003593
331         1.00486
221        1.009336
333        1.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_year['revenue_share'] = df_year['value'] / df_year['sales']


In [None]:
Omega_tilde = sparse.csr_matrix(df_year_cost.astype(float).values)
Omega = sparse.csr_matrix(df_year_revenue.astype(float).values)
b = df_cl.loc[df_cl['year'] == 1987, ['naics', 'va']].sort_values(by=['naics'])['va'].values
b = b / b.sum()
b = np.append(b, [0, 0])
lambda_tilde = np.matmul(b.transpose(), np.linalg.inv(np.eye(Omega_tilde.shape[0]) - Omega_tilde))
numerator = np.sum(np.matmul(Omega_tilde[:-2, :-2].todense(), Omega[:-2, :-2].todense()), axis=1)
denominator = np.sum(np.matmul(Omega[:-2, :-2].todense(), Omega[:-2, :-2].todense()), axis=1)
wedge = numerator / denominator
d_lambda = dict([(naics_list[i], lambda_tilde[0, i]) for i in range(len(naics_list) - 2)])
d_wedge = dict([(naics_list[i], wedge.flatten()[0, i]) for i in range(len(naics_list) - 2)])
df_lambda.loc[df_lambda['year'] == 1987, 'lambda'] = df_lambda[df_lambda['year'] == 1987]['naics'].map(d_lambda)
df_lambda.loc[df_lambda['year'] == 1987, 'lambda_k'] = lambda_tilde[0, -2]
df_lambda.loc[df_lambda['year'] == 1987, 'lambda_l'] = lambda_tilde[0, -1]
df_lambda.loc[df_lambda['year'] == 1987, 'wedge'] = df_lambda[df_lambda['year'] == 1987]['naics'].map(d_wedge)


In [None]:
print("Dimensions Omega_tilde :", Omega_tilde.shape)
print("b shape :", b.shape)
print("Matrice à inverser :")
print(np.eye(Omega_tilde.shape[0]) - Omega_tilde.toarray())

In [None]:
# Trouver les années où leur matrice n'est pas inversible
for year in df_lambda['year'].unique():
    try:
        print(f"Année en cours : {year}")
        df_year = df[df['year'] == year].copy()

        if year < 1997:
            df_year['revenue_share'] = df_year['value'] / df_year['sales']
        else:
            df_year['revenue_share'] = df_year['value'] / (1000 * df_year['sales'])

        df_year.loc[df_year['revenue_share'].isna(), 'revenue_share'] = 0
        df_year_cost = df_year.pivot(index='use_naics_agg', columns='supply_naics_agg', values='cost_share')
        df_year_revenue = df_year.pivot(index='use_naics_agg', columns='supply_naics_agg', values='revenue_share')
        naics_list = df_year_cost.index.tolist()

        Omega_tilde = sparse.csr_matrix(df_year_cost.astype(float).values)
        Omega = sparse.csr_matrix(df_year_revenue.astype(float).values)

        b = df_cl.loc[df_cl['year'] == year, ['naics', 'va']].sort_values(by=['naics'])['va'].values
        b = b / b.sum()
        b = np.append(b, [0, 0])

        A = np.eye(Omega_tilde.shape[0]) - Omega_tilde.toarray()
        A_inv = np.linalg.inv(A)  # ← ici l'erreur potentielle

        lambda_tilde = np.matmul(b.transpose(), A_inv)
        numerator = np.sum(np.matmul(Omega_tilde[:-2, :-2].todense(), Omega[:-2, :-2].todense()), axis=1)
        denominator = np.sum(np.matmul(Omega[:-2, :-2].todense(), Omega[:-2, :-2].todense()), axis=1)
        wedge = numerator / denominator

        d_lambda = dict([(naics_list[i], lambda_tilde[0, i]) for i in range(len(naics_list) - 2)])
        d_wedge = dict([(naics_list[i], wedge.flatten()[0, i]) for i in range(len(naics_list) - 2)])
        df_lambda.loc[df_lambda['year'] == year, 'lambda'] = df_lambda[df_lambda['year'] == year]['naics'].map(d_lambda)
        df_lambda.loc[df_lambda['year'] == year, 'lambda_k'] = lambda_tilde[0, -2]
        df_lambda.loc[df_lambda['year'] == year, 'lambda_l'] = lambda_tilde[0, -1]
        df_lambda.loc[df_lambda['year'] == year, 'wedge'] = df_lambda[df_lambda['year'] == year]['naics'].map(d_wedge)

    except np.linalg.LinAlgError:
        print(f"⚠️ Matrice non inversible pour l’année {year}")


Année en cours : 1987
⚠️ Matrice non inversible pour l’année 1987
Année en cours : 1988
⚠️ Matrice non inversible pour l’année 1988
Année en cours : 1989
⚠️ Matrice non inversible pour l’année 1989
Année en cours : 1990
⚠️ Matrice non inversible pour l’année 1990
Année en cours : 1991
⚠️ Matrice non inversible pour l’année 1991
Année en cours : 1992
⚠️ Matrice non inversible pour l’année 1992
Année en cours : 1993


ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 33 is different from 35)

In [None]:
for year in df_lambda['year'].unique():
    df_year = df[df['year'] == year]
    df_year['revenue_share'] = df_year['value'] / df_year['sales']
    df_year.loc[df_year['revenue_share'].isna(), 'revenue_share'] = 0
    df_year_cost = df_year.pivot(index='use_naics_agg', columns='supply_naics_agg', values='cost_share')
    df_year_revenue = df_year.pivot(index='use_naics_agg', columns='supply_naics_agg', values='revenue_share')
    naics_list = df_year_cost.index.tolist()
    Omega_tilde = sparse.csr_matrix(df_year_cost.astype(float).values)
    Omega = sparse.csr_matrix(df_year_revenue.astype(float).values)
    b = df_cl.loc[df_cl['year'] == year, ['naics', 'va']].sort_values(by=['naics'])['va'].values
    b = b / b.sum()
    b = np.append(b, [0, 0])
    lambda_tilde = np.matmul(b.transpose(), np.linalg.inv(np.eye(Omega_tilde.shape[0]) - Omega_tilde))
    numerator = np.sum(np.matmul(Omega_tilde[:-2, :-2].todense(), Omega[:-2, :-2].todense()), axis=1)
    denominator = np.sum(np.matmul(Omega[:-2, :-2].todense(), Omega[:-2, :-2].todense()), axis=1)
    wedge = numerator / denominator
    d_lambda = dict([(naics_list[i], lambda_tilde[0, i]) for i in range(len(naics_list) - 2)])
    d_wedge = dict([(naics_list[i], wedge.flatten()[0, i]) for i in range(len(naics_list) - 2)])
    df_lambda.loc[df_lambda['year'] == year, 'lambda'] = df_lambda[df_lambda['year'] == year]['naics'].map(d_lambda)
    df_lambda.loc[df_lambda['year'] == year, 'lambda_k'] = lambda_tilde[0, -2]
    df_lambda.loc[df_lambda['year'] == year, 'lambda_l'] = lambda_tilde[0, -1]
    df_lambda.loc[df_lambda['year'] == year, 'wedge'] = df_lambda[df_lambda['year'] == year]['naics'].map(d_wedge)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_year['revenue_share'] = df_year['value'] / df_year['sales']


LinAlgError: Singular matrix

In [None]:
# Take the average of successive years
df_lambda['lambda'] = df_lambda.groupby('naics')['lambda'].transform(lambda x: x.rolling(2).mean())
df_lambda['lambda_k'] = df_lambda.groupby('naics')['lambda_k'].transform(lambda x: x.rolling(2).mean())
df_lambda['lambda_l'] = df_lambda.groupby('naics')['lambda_l'].transform(lambda x: x.rolling(2).mean())

# Calcul des variables de la productivité

In [None]:
# Rescale the variables to 1987=100
df['tfp'] = df['tfp'] / df.loc[df['year'] == 1987, 'tfp'].values[0] * 100
df['real_va'] = df['real_va'] / df.loc[df['year'] == 1987, 'real_va'].values[0] * 100
df['capital'] = df['capital'] / df.loc[df['year'] == 1987, 'capital'].values[0] * 100
df['labor'] = df['labor'] / df.loc[df['year'] == 1987, 'labor'].values[0] * 100

KeyError: 'tfp'

In [None]:
# Initialiser la colonne tfp_adj avec des NaN
df['tfp_adj'] = np.nan

# Définir la valeur de départ pour 1987
df.loc[df['year'] == 1987, 'tfp_adj'] = 100

# Boucler sur chaque industrie
for industry in df['industry'].unique():
    for year in range(1988, 2020):
        # Extraire les valeurs pour l’année courante et l’année précédente
        current = df[(df['year'] == year) & (df['industry'] == industry)]
        previous = df[(df['year'] == year - 1) & (df['industry'] == industry)]

        # Sauter si une des deux années est manquante
        if current.empty or previous.empty:
            continue

        # Calcul du poids du capital (alpha)
        cap_cost_now = current['capital_cost'].iloc[0]
        lab_cost_now = current['labor_cost'].iloc[0]
        cap_cost_prev = previous['capital_cost'].iloc[0]
        lab_cost_prev = previous['labor_cost'].iloc[0]

        alpha_now = cap_cost_now / (cap_cost_now + lab_cost_now)
        alpha_prev = cap_cost_prev / (cap_cost_prev + lab_cost_prev)
        alpha = 0.5 * (alpha_now + alpha_prev)

        # Valeurs nécessaires pour la formule
        tfp_prev = previous['tfp_adj'].iloc[0]
        real_va_now = current['real_va'].iloc[0]
        real_va_prev = previous['real_va'].iloc[0]
        capital_now = current['capital'].iloc[0]
        capital_prev = previous['capital'].iloc[0]
        labor_now = current['labor'].iloc[0]
        labor_prev = previous['labor'].iloc[0]

        # Calcul du TFP ajusté
        growth_va = np.log(real_va_now / real_va_prev)
        growth_cap = np.log((capital_now / real_va_now) / (capital_prev / real_va_prev))
        growth_lab = np.log(labor_now / labor_prev)

        tfp_adj = tfp_prev * np.exp(growth_va - (alpha / (1 - alpha)) * growth_cap - growth_lab)

        # Assigner le résultat
        df.loc[(df['year'] == year) & (df['industry'] == industry), 'tfp_adj'] = tfp_adj


  growth_va = np.log(real_va_now / real_va_prev)
  growth_cap = np.log((capital_now / real_va_now) / (capital_prev / real_va_prev))
  growth_va = np.log(real_va_now / real_va_prev)
  growth_cap = np.log((capital_now / real_va_now) / (capital_prev / real_va_prev))
  growth_cap = np.log((capital_now / real_va_now) / (capital_prev / real_va_prev))
  growth_va = np.log(real_va_now / real_va_prev)
  growth_cap = np.log((capital_now / real_va_now) / (capital_prev / real_va_prev))
  growth_cap = np.log((capital_now / real_va_now) / (capital_prev / real_va_prev))


In [None]:
# Calculate the share of value-added of each industry within year
df['va_agg'] = df.groupby('year')['va'].transform('sum')
df['b'] = df['va'] / df['va_agg']
df['b'] = df.groupby('industry')['b'].transform(lambda x: x.rolling(2).mean())
df = df.drop(columns=['va_agg'])

# Calculate the share of value-added of each industry for years 1987 and 2000
df = pd.merge(df, df.loc[df['year'] == 1988, ['industry', 'b']].rename(columns={'b': 'b_1987'}), on='industry', how='left')
df = pd.merge(df, df.loc[df['year'] == 2000, ['industry', 'b']].rename(columns={'b': 'b_2000'}), on='industry', how='left')

# Calculate the log difference of TFP, capital, and labor within each industry
df['tfp_growth'] = df.groupby('industry')['tfp'].transform(lambda x: np.log(x).diff())
df['tfp_adj_growth'] = df.groupby('industry')['tfp'].transform(lambda x: np.log(x).diff())
df['capital_growth'] = df.groupby('industry')['capital'].transform(lambda x: np.log(x).diff())
df['labor_growth'] = df.groupby('industry')['labor'].transform(lambda x: np.log(x).diff())

# Calculate the industry-level output elasticities of capital and labor
df['alpha_k'] = df['capital_cost'] / (df['capital_cost'] + df['labor_cost'])
df['alpha_k'] = df.groupby('industry')['alpha_k'].transform(lambda x: x.rolling(2).mean())
df['alpha_l'] = df['labor_cost'] / (df['capital_cost'] + df['labor_cost'])
df['alpha_l'] = df.groupby('industry')['alpha_l'].transform(lambda x: x.rolling(2).mean())

# Calculate the share of total labor and capital costs of each industry within year
df['capital_cost_agg'] = df.groupby('year')['capital_cost'].transform('sum')
df['omega_k'] = df['capital_cost'] / df['capital_cost_agg']
df['omega_k'] = df.groupby('industry')['omega_k'].transform(lambda x: x.rolling(2).mean())
df['labor_cost_agg'] = df.groupby('year')['labor_cost'].transform('sum')
df['omega_l'] = df['labor_cost'] / df['labor_cost_agg']
df['omega_l'] = df.groupby('industry')['omega_l'].transform(lambda x: x.rolling(2).mean())
df = df.drop(columns=['capital_cost_agg', 'labor_cost_agg'])