## Setup
Import the required libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Task 1: Load the Data

Load the two CSV files into pandas DataFrames:
- `df_gdp` - GDP per capita data
- `df_gini` - Gini index data

In [None]:
# Read both CSVs from the parent folder of this notebook.
gdp_csv_path = "../gdp-per-capita-maddison-2020.csv"
gini_csv_path = "../economic-inequality-gini-index.csv"

df_gdp = pd.read_csv(gdp_csv_path)
df_gini = pd.read_csv(gini_csv_path)

# Google Colab alternative: point pd.read_csv at the GitHub raw URLs instead.
# gdp_url = "https://raw.githubusercontent.com/YOUR_REPO/gdp-per-capita-maddison-2020.csv"
# gini_url = "https://raw.githubusercontent.com/YOUR_REPO/economic-inequality-gini-index.csv"
# df_gdp = pd.read_csv(gdp_url)
# df_gini = pd.read_csv(gini_url)

In [None]:
# Show the GDP table's dimensions, then its first five rows.
print("GDP Dataset:", f"Shape: {df_gdp.shape}", sep="\n")
df_gdp.head(5)

In [None]:
# Show the Gini table's dimensions, then its first five rows.
print("Gini Dataset:", f"Shape: {df_gini.shape}", sep="\n")
df_gini.head(5)

## Task 2: Clean the GDP Dataset

Drop columns you do not need (like the annotations column).

In [None]:
# List the GDP column names so we can decide which ones to drop.
gdp_column_names = list(df_gdp.columns)
print("GDP columns:", gdp_column_names)

In [None]:
# Drop the annotations column.
# errors="ignore" keeps this cell safe to re-run: once the column is gone
# (or if the CSV never had it) a plain drop() would raise KeyError.
df_gdp = df_gdp.drop(columns=["417485-annotations"], errors="ignore")

print("GDP columns after cleaning:", df_gdp.columns.tolist())

## Task 3: Convert Year to Datetime

Make sure the `Year` column has a datetime dtype in both datasets.

In [None]:
def _year_to_datetime(df, first_year=1800, last_year=2100):
    """Return *df* restricted to valid years, with ``Year`` as datetime.

    Rows whose ``Year`` lies outside ``first_year``..``last_year`` (inclusive)
    are dropped, then the remaining integer years are parsed with the ``%Y``
    format. If ``Year`` is already a datetime column (for example because
    this cell was run before), *df* is returned unchanged, so re-running the
    cell does not fail when comparing datetimes against integer bounds.
    """
    if pd.api.types.is_datetime64_any_dtype(df["Year"]):
        return df

    in_range = df["Year"].between(first_year, last_year)
    # .copy() yields an independent frame, so the column assignment below
    # does not trigger pandas' SettingWithCopyWarning.
    converted = df.loc[in_range].copy()
    converted["Year"] = pd.to_datetime(converted["Year"], format="%Y")
    return converted


# Filter out invalid years (must be between 1800-2100), then convert Year
# to datetime in both datasets.
df_gdp = _year_to_datetime(df_gdp)
df_gini = _year_to_datetime(df_gini)

print(f"GDP Year dtype: {df_gdp['Year'].dtype}")
print(f"Gini Year dtype: {df_gini['Year'].dtype}")

## Task 4: Clean the Gini Dataset

Drop columns you do not need and filter to relevant years.

In [None]:
# List the Gini column names before cleaning.
gini_column_names = list(df_gini.columns)
print("Gini columns:", gini_column_names)

In [None]:
# Keep only observations dated after the 1980 cutoff (better data quality).
year_cutoff = "1980"

gdp_is_recent = df_gdp["Year"] > year_cutoff
gini_is_recent = df_gini["Year"] > year_cutoff

df_gdp = df_gdp.loc[gdp_is_recent]
df_gini = df_gini.loc[gini_is_recent]

print(f"Rows in GDP after filtering Year > 1980: {len(df_gdp)}")
print(f"Rows in Gini after filtering Year > 1980: {len(df_gini)}")

## Task 5: Merge the Datasets

Merge the GDP and Gini datasets on `Entity` and `Year`.

In [None]:
# Inner join: keep only (Entity, Year) pairs present in both tables.
join_keys = ["Entity", "Year"]
df_merged = df_gdp.merge(df_gini, how="inner", on=join_keys)

print(f"Rows after merging: {len(df_merged)}")
df_merged.head(5)

## Task 6: Rename Columns

Rename columns to be shorter and more consistent.

In [None]:
# Shorter, consistent column names; the merge suffixed the shared Code
# column as Code_x / Code_y, so keep the first and discard the second.
short_names = {
    "GDP per capita": "GDP",
    "Gini index": "Gini_index",
    "Code_x": "Code",
}
df_merged = df_merged.rename(columns=short_names)

# errors="ignore" makes the drop a no-op when Code_y is not present.
df_merged = df_merged.drop(columns=["Code_y"], errors="ignore")

print("Final columns:", df_merged.columns.tolist())
df_merged.head(5)

## Task 7: Calculate Correlation

Create a DataFrame with just the numeric columns and calculate the correlation.

In [None]:
# Keep the two numeric columns and discard rows missing either value.
numeric_columns = ["GDP", "Gini_index"]
df_numeric = df_merged.loc[:, numeric_columns].dropna(how="any")

print(f"Rows with complete data: {len(df_numeric)}")

In [None]:
# Pearson correlation (the pandas default) between GDP and the Gini index.
gdp_series = df_numeric["GDP"]
gini_series = df_numeric["Gini_index"]
correlation = gdp_series.corr(gini_series, method="pearson")

print(f"Correlation between GDP and Gini index: {correlation:.4f}")

In [None]:
# Interpretation: turn the coefficient into a strength label (weak below 0.3,
# moderate below 0.7, strong otherwise) and a direction label.
strength = abs(correlation)

if pd.isna(correlation):
    # The coefficient can be NaN (e.g. too few rows or a constant column).
    # Every comparison with NaN is False, so without this guard the result
    # would be reported as a "strong negative" correlation.
    strength_desc = "non-computable"
    direction = "(NaN)"
else:
    if correlation > 0:
        direction = "positive"
    elif correlation < 0:
        direction = "negative"
    else:
        # Exactly zero is neither positive nor negative.
        direction = "neutral"

    if strength < 0.3:
        strength_desc = "weak"
    elif strength < 0.7:
        strength_desc = "moderate"
    else:
        strength_desc = "strong"

print(f"Interpretation: There is a {strength_desc} {direction} correlation between GDP per capita and inequality.")

## Optional: Scatter Plot

Visualize the relationship between GDP and Gini index.

In [None]:
# Scatter of GDP against Gini, drawn through the object-oriented interface.
gdp_values = df_numeric["GDP"]
gini_values = df_numeric["Gini_index"]

scatter_fig = plt.figure(figsize=(10, 6))
scatter_ax = scatter_fig.gca()
scatter_ax.scatter(gdp_values, gini_values, alpha=0.5, s=10)
scatter_ax.set_xlabel("GDP per capita")
scatter_ax.set_ylabel("Gini Index")
scatter_ax.set_title(f"GDP per Capita vs Gini Index\nCorrelation: {correlation:.4f}")
scatter_ax.grid(True, alpha=0.3)

# Degree-1 least-squares fit, drawn between the smallest and largest GDP.
trend = np.poly1d(np.polyfit(gdp_values, gini_values, 1))
gdp_extent = [gdp_values.min(), gdp_values.max()]
scatter_ax.plot(gdp_extent, trend(gdp_extent), "r--", linewidth=2, label="Trend line")
scatter_ax.legend()

scatter_fig.tight_layout()
plt.show()

## Optional: Time Series for One Country

Pick one country and plot how GDP and Gini change over time.

In [None]:
# Country to inspect -- edit this value to explore a different one.
country = "United States"

# Rows for that country in chronological order.
is_selected = df_merged["Entity"] == country
df_country = df_merged.loc[is_selected].sort_values(by="Year")

print(f"Data points for {country}: {len(df_country)}")
df_country.head(5)

In [None]:
# Dual-axis time series: GDP on the left y-axis, Gini on the right one.
def _draw_series(axis, column, label, colour, marker):
    """Plot one df_country column against Year and colour its y-axis to match."""
    axis.set_ylabel(label, color=colour)
    axis.plot(df_country["Year"], df_country[column], color=colour, marker=marker, markersize=3)
    axis.tick_params(axis="y", labelcolor=colour)


fig, ax1 = plt.subplots(figsize=(12, 6))
ax1.set_xlabel("Year")
_draw_series(ax1, "GDP", "GDP per capita", "tab:blue", "o")

# The second y-axis shares the same x-axis (Year).
ax2 = ax1.twinx()
_draw_series(ax2, "Gini_index", "Gini Index", "tab:red", "s")

plt.title(f"GDP per Capita and Gini Index Over Time: {country}")
fig.tight_layout()
plt.show()

## Summary

Answer the questions from the "done" checklist:

In [None]:
# Collect the headline numbers, then print them as one report.
divider = "=" * 50
summary_lines = [
    divider,
    "SUMMARY",
    divider,
    f"Rows after filtering Year > 1980: {len(df_gdp)} (GDP), {len(df_gini)} (Gini)",
    f"Rows after merging: {len(df_merged)}",
    f"Correlation: {correlation:.4f}",
    f"\nConclusion: There is a {strength_desc} {direction} relationship between GDP per capita and inequality.",
]
print("\n".join(summary_lines))