In [None]:
# --------------------------------------------------
# IMF_WEO dataset EDA
# This script:
# 1. loads cleaned IMF data
# 2. prints summary stats
# 3. makes basic plots for Week 5 deliverable
# --------------------------------------------------

In [6]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# make plots look clean
sns.set_theme(style='whitegrid')

In [7]:
# 1. load data

# change this path if your file is somewhere else
csv_path = "/Users/lexieee/Desktop/IMF_clean.csv"

df = pd.read_csv(csv_path)

print("=== BASIC CHECK ===")
print("Shape (rows, cols):", df.shape)
print("\nHead:")
print(df.head())
print("\nColumn dtypes:")
print(df.dtypes)
print("\nUnique countries:", df["Country"].nunique())
print(df["Country"].unique())
print("\nUnique indicators:", df["Indicator"].unique())
print("\nYear range:", df["Year"].min(), "to", df["Year"].max())

print("\nMissing values per column:")
print(df.isna().sum())

# drop duplicate rows if any
dupes = df.duplicated().sum()
print("\nDuplicate rows:", dupes)
if dupes > 0:
    df = df.drop_duplicates()
    print("Dropped duplicates. New shape:", df.shape)

# simple numeric summary of the Value column
print("\n=== VALUE SUMMARY (overall) ===")
print(df["Value"].describe())

# summary by Indicator
print("\n=== VALUE SUMMARY BY INDICATOR ===")
summary_by_indicator = (
    df.groupby("Indicator")["Value"]
      .describe()
      .round(3)
)
print(summary_by_indicator)


=== BASIC CHECK ===
Shape (rows, cols): (306, 4)

Head:
  Country                 Indicator  Year   Value
0  Canada  CPI inflation (% change)  1980  10.183
1  Canada  CPI inflation (% change)  1981  12.462
2  Canada  CPI inflation (% change)  1982  10.803
3  Canada  CPI inflation (% change)  1983   5.816
4  Canada  CPI inflation (% change)  1984   4.339

Column dtypes:
Country       object
Indicator     object
Year           int64
Value        float64
dtype: object

Unique countries: 3
['Canada' 'Mexico' 'United States']

Unique indicators: ['CPI inflation (% change)' 'GDP (constant prices, % change)']

Year range: 1980 to 2030

Missing values per column:
Country      0
Indicator    0
Year         0
Value        0
dtype: int64

Duplicate rows: 0

=== VALUE SUMMARY (overall) ===
count    306.000000
mean       5.583065
std       14.100971
min       -8.354000
25%        1.882250
50%        2.794500
75%        4.209250
max      131.959000
Name: Value, dtype: float64

=== VALUE SUMMARY BY I

In [12]:
import os
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style='whitegrid')

out_dir = "/Users/lexieee/Desktop/figs"

In [None]:

# ------- CPI inflation -------
inflation_name = "CPI inflation (% change)"
inflation_df = df[df["Indicator"] == inflation_name].copy()
inflation_df = inflation_df.sort_values(["Country", "Year"])

plt.figure(figsize=(9,4))
for country in inflation_df["Country"].unique():
    sub = inflation_df[inflation_df["Country"] == country]
    plt.plot(
        sub["Year"],
        sub["Value"],
        marker="o",
        label=country
    )

plt.title("CPI inflation (% change) over time")
plt.xlabel("Year")
plt.ylabel("Inflation rate (%)")
plt.legend()
plt.tight_layout()

fig_path_infl = os.path.join(out_dir, "inflation_over_time.png")
plt.savefig(fig_path_infl, dpi=300)
print("Saved plot:", fig_path_infl)

plt.close()

# ------- GDP growth  -------
gdp_name = "GDP (constant prices, % change)"
gdp_df = df[df["Indicator"] == gdp_name].copy()
gdp_df = gdp_df.sort_values(["Country", "Year"])

plt.figure(figsize=(9,4))
for country in gdp_df["Country"].unique():
    sub = gdp_df[gdp_df["Country"] == country]
    plt.plot(
        sub["Year"],
        sub["Value"],
        marker="o",
        label=country
    )

plt.title("Real GDP growth (% change) over time")
plt.xlabel("Year")
plt.ylabel("GDP growth (%)")
plt.legend()
plt.tight_layout()

fig_path_gdp = os.path.join(out_dir, "gdp_growth_over_time.png")
plt.savefig(fig_path_gdp, dpi=300)
print("Saved plot:", fig_path_gdp)

plt.close()

# ------- boxplot: inflation -------
plt.figure(figsize=(7,4))
sns.boxplot(
    data=inflation_df,
    x="Country",
    y="Value"
)
plt.title("Inflation rate distribution by country")
plt.xlabel("Country")
plt.ylabel("CPI inflation (% change)")
plt.tight_layout()

fig_path_infl_box = os.path.join(out_dir, "inflation_boxplot.png")
plt.savefig(fig_path_infl_box, dpi=300)
print("Saved plot:", fig_path_infl_box)

plt.close()

# ------- boxplot: GDP growth -------
plt.figure(figsize=(7,4))
sns.boxplot(
    data=gdp_df,
    x="Country",
    y="Value"
)
plt.title("GDP growth distribution by country")
plt.xlabel("Country")
plt.ylabel("GDP growth (% change)")
plt.tight_layout()

fig_path_gdp_box = os.path.join(out_dir, "gdp_boxplot.png")
plt.savefig(fig_path_gdp_box, dpi=300)
print("Saved plot:", fig_path_gdp_box)

plt.close()

# ------- scatter: Inflation vs GDP growth -------
wide = (
    df.pivot_table(
        index=["Country", "Year"],
        columns="Indicator",
        values="Value"
    )
    .reset_index()
)

wide = wide.rename(columns={
    "CPI inflation (% change)": "Inflation",
    "GDP (constant prices, % change)": "GDP_Growth"
})

plt.figure(figsize=(6,5))
sns.scatterplot(
    data=wide,
    x="Inflation",
    y="GDP_Growth",
    hue="Country"
)
plt.title("GDP growth vs Inflation")
plt.xlabel("Inflation rate (%)")
plt.ylabel("GDP growth (%)")
plt.tight_layout()

fig_path_scatter = os.path.join(out_dir, "inflation_vs_gdp_scatter.png")
plt.savefig(fig_path_scatter, dpi=300)
print("Saved plot:", fig_path_scatter)

plt.close()

Saved plot: /Users/lexieee/Desktop/figs/inflation_over_time.png
Saved plot: /Users/lexieee/Desktop/figs/gdp_growth_over_time.png
Saved plot: /Users/lexieee/Desktop/figs/inflation_boxplot.png
Saved plot: /Users/lexieee/Desktop/figs/gdp_boxplot.png
Saved plot: /Users/lexieee/Desktop/figs/inflation_vs_gdp_scatter.png


In [None]:
# 5. small text summary to report
print("\n=== QUICK TEXT SUMMARY ===")
print("- We have data for", df['Country'].nunique(), "countries from",
      df['Year'].min(), "to", df['Year'].max(), "years.")
print("- Indicators included:", list(df['Indicator'].unique()))
print("- We looked at missing data and duplicates. After dropping dupes, shape is", df.shape)
print("- We plotted inflation and GDP growth over time for each country.")
print("- We compared distributions with boxplots to see which country is more volatile.")
print("- We also checked correlation between inflation and GDP growth by country.")
print("  (See printed correlation values above.)")

print("\nAll plots saved as PNG in the same folder as this script.")
print("Done.")


=== QUICK TEXT SUMMARY ===
- We have data for 3 countries from 1980 to 2030 years.
- Indicators included: ['CPI inflation (% change)', 'GDP (constant prices, % change)']
- We looked at missing data and duplicates. After dropping dupes, shape is (306, 4)
- We plotted inflation and GDP growth over time for each country.
- We compared distributions with boxplots to see which country is more volatile.
- We also checked correlation between inflation and GDP growth by country.
  (See printed correlation values above.)

All plots saved as PNG in the same folder as this script.
Done.
