In [1]:
import pandas as pd
import numpy as np

# =========================
# CONFIGURATION
# =========================
INPUT_CSV = "Global Economy Indicators.csv"
OUTPUT_CSV = "Global_Economy_Cleaned.csv"

# Raw CSV header -> short column name. The keys keep their surrounding spaces
# on purpose: they are matched verbatim against the headers of the raw file.
COLUMN_MAP = dict([
    (" Country ", "country"),
    (" Year ", "year"),
    (" Gross Domestic Product (GDP) ", "gdp"),
    (" Gross National Income(GNI) in USD ", "gni"),
    (" Population ", "pop"),
    (" Currency ", "currency"),
    (" IMF based exchange rate ", "exchange"),
    (" Agriculture, hunting, forestry, fishing (ISIC A-B) ", "agriculture"),
    (" Manufacturing (ISIC D) ", "manufacturing"),
    (" Construction (ISIC F) ", "construction"),
    (" Wholesale, retail trade, restaurants and hotels (ISIC G-H) ", "trade"),
    (" Transport, storage and communication (ISIC I) ", "transport"),
    (" Other Activities (ISIC J-P) ", "other"),
    (" Exports of goods and services ", "exports"),
    (" Imports of goods and services ", "imports"),
])

# Columns converted to numbers later on: every mapped column except the two
# text ones, in COLUMN_MAP order.
NUMERIC_COLS = [
    name for name in COLUMN_MAP.values()
    if name not in ("country", "currency")
]

# =========================
# LOAD THE RAW CSV
# =========================
print("Chargement du fichier CSV brut...")
df = pd.read_csv(INPUT_CSV)

# =========================
# RENAME COLUMNS
# =========================
print("Nettoyage des noms de colonnes...")
# Match headers on their stripped form so the mapping works whether or not the
# CSV pads its column names with spaces (the COLUMN_MAP keys are padded).
df.columns = df.columns.astype(str).str.strip()
rename_map = {raw.strip(): clean for raw, clean in COLUMN_MAP.items()}

# Fail early and name the missing headers; a plain rename would silently do
# nothing and the column selection below would raise a less helpful KeyError.
missing = [raw for raw in rename_map if raw not in df.columns]
if missing:
    raise KeyError(f"Missing expected columns in {INPUT_CSV}: {missing}")

# Rename, then drop every column that is not part of the mapping
# (kept columns follow COLUMN_MAP order).
df = df.rename(columns=rename_map)[list(rename_map.values())]

# =========================
# DATA CLEANING
# =========================

# Trim whitespace in the text columns. Missing values stay missing: a plain
# astype(str) turns NaN into the literal string "nan", which would then be
# sorted and exported as if it were a real country / currency.
for col in ("country", "currency"):
    df[col] = df[col].astype(str).str.strip().where(df[col].notna())

# Safe numeric conversion: anything unparseable becomes NaN.
df[NUMERIC_COLS] = df[NUMERIC_COLS].apply(pd.to_numeric, errors="coerce")

# Replace NaN with 0 (the author's stated "economic logic").
# NOTE(review): this also zero-fills population and exchange rate; rows whose
# gdp or year end up at 0 are removed by the two filters below.
df[NUMERIC_COLS] = df[NUMERIC_COLS].fillna(0)

# Drop rows without a positive GDP and report how many were removed.
initial_len = len(df)
df = df[df["gdp"] > 0]
print(f"Lignes supprimées (PIB nul): {initial_len - len(df)}")

# Drop rows whose year falls outside the 1960-2025 range.
df = df[(df["year"] >= 1960) & (df["year"] <= 2025)]

# =========================
# FINAL SORT
# =========================
# Order rows by country, then by year, and renumber the index from 0.
df = df.sort_values(["country", "year"], ignore_index=True)

# =========================
# WRITE THE FINAL CSV
# =========================
df.to_csv(OUTPUT_CSV, encoding="utf-8", index=False)

# Emit the same three report lines in a single call.
print(
    "Prétraitement terminé",
    f"Fichier généré : {OUTPUT_CSV}",
    f"Lignes finales : {len(df)}",
    sep="\n",
)


Chargement du fichier CSV brut...
Nettoyage des noms de colonnes...
Lignes supprimées (PIB nul): 0
Prétraitement terminé
Fichier généré : Global_Economy_Cleaned.csv
Lignes finales : 10512
