<a href="https://colab.research.google.com/github/jarekwan/jarwan_projekt/blob/main/Projekt2_brudnopis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
POZNIEJ 1SZA SUROWA WERSJA BRUDNOPIS, POWYZEJ BEDZIE PRAWDZIWA ROBOTA

In [1]:
# ============================================================
# PROJEKT: Analiza, modelowanie i prognozowanie danych ekonomicznych
# Dane: World Bank Open Data (GDP per capita, Inflation, Unemployment)
# ============================================================

!pip install wbdata statsmodels seaborn scikit-learn pydantic --quiet

import pandas as pd
import numpy as np
import wbdata
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import re
from pydantic import BaseModel, Field, ValidationError
from typing import Annotated
from scipy import stats
from statsmodels.tsa.stattools import adfuller, acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# ==========================================
# 1. Pobranie i przygotowanie danych
# ==========================================

start_date = datetime.datetime(2000, 1, 1)
end_date = datetime.datetime(2023, 1, 1)

countries = ["PL", "DE", "FR", "US"]

indicators = {
    "NY.GDP.PCAP.CD": "gdp_per_capita",    # GDP per capita
    "FP.CPI.TOTL.ZG": "inflation",         # Inflation (%)
    "SL.UEM.TOTL.ZS": "unemployment"       # Unemployment (%)
}

# Pobranie danych
df = wbdata.get_dataframe(indicators, country=countries, convert_date=True)

# Filtrowanie zakresu dat
df = df[(df.index >= start_date) & (df.index <= end_date)].reset_index()

print("✅ Dane pobrane z World Bank API")
print(df.head())

# Serializacja
df.to_csv("economic_data.csv", index=False)
df.to_json("economic_data.json", orient="records", indent=2, date_format="iso")

# Walidacja (Pydantic + regex)
DateStr = Annotated[str, Field(pattern=r"^\d{4}-\d{2}-\d{2}$")]

class EconomicData(BaseModel):
    country: str
    date: datetime.date
    gdp_per_capita: float | None
    inflation: float | None
    unemployment: float | None

try:
    for _, row in df.head(5).iterrows():
        EconomicData(
            country=row["country"],
            date=row["date"],
            gdp_per_capita=row["gdp_per_capita"],
            inflation=row["inflation"],
            unemployment=row["unemployment"]
        )
    print("✅ Walidacja przykładowych rekordów OK")
except ValidationError as e:
    print("❌ Błąd walidacji:", e)

# Czyszczenie braków
print("\nBraki danych:")
print(df.isna().sum())
df = df.dropna()
print("✅ Po usunięciu braków:", df.shape)

# ==========================================
# 2. Symulacje i rozkłady (NumPy + Matplotlib)
# ==========================================

samples_normal = np.random.normal(0, 1, 1000)
samples_uniform = np.random.uniform(-1, 1, 1000)
samples_expo = np.random.exponential(1, 1000)

plt.figure(figsize=(12,4))
plt.subplot(1,3,1); sns.histplot(samples_normal, kde=True); plt.title("Normalny")
plt.subplot(1,3,2); sns.histplot(samples_uniform, kde=True); plt.title("Jednorodny")
plt.subplot(1,3,3); sns.histplot(samples_expo, kde=True); plt.title("Wykładniczy")
plt.show()

# Central Limit Theorem
means = [np.mean(np.random.normal(0, 1, 50)) for _ in range(1000)]
sns.histplot(means, kde=True)
plt.title("Central Limit Theorem (średnie z próbek)")
plt.show()

# ==========================================
# 3. Eksploracyjna analiza danych (EDA)
# ==========================================

print(df.describe())

# Standaryzacja
df["gdp_z"] = (df["gdp_per_capita"] - df["gdp_per_capita"].mean()) / df["gdp_per_capita"].std()

# Heatmapa korelacji
plt.figure(figsize=(6,4))
sns.heatmap(df[["gdp_per_capita","inflation","unemployment"]].corr(), annot=True, cmap="coolwarm")
plt.title("Korelacja zmiennych")
plt.show()

# Meshgrid (GDP vs Inflation)
x = np.linspace(df["gdp_per_capita"].min(), df["gdp_per_capita"].max(), 50)
y = np.linspace(df["inflation"].min(), df["inflation"].max(), 50)
X, Y = np.meshgrid(x, y)
Z = np.sin(X/10000) + np.cos(Y)

plt.contourf(X, Y, Z, cmap="viridis")
plt.xlabel("GDP per capita"); plt.ylabel("Inflacja")
plt.title("Meshgrid + Contour Plot")
plt.show()

# ==========================================
# 4. Statystyka klasyczna
# ==========================================

stat, p = stats.shapiro(df["gdp_per_capita"])
print("Shapiro-Wilk test p-value:", p)

ci_low, ci_high = stats.norm.interval(0.95,
    loc=df["gdp_per_capita"].mean(),
    scale=df["gdp_per_capita"].std()/np.sqrt(len(df)))
print("95% CI dla GDP:", ci_low, ci_high)

boot_means = [np.mean(np.random.choice(df["gdp_per_capita"], size=len(df), replace=True)) for _ in range(1000)]
sns.histplot(boot_means, kde=True)
plt.title("Bootstrap średniej GDP per capita")
plt.show()

# ==========================================
# 5. Regresja liniowa i diagnostyka modeli
# ==========================================

X = df[["inflation","unemployment"]]
y = df["gdp_per_capita"]

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

lr = LinearRegression().fit(X_train,y_train)
y_pred = lr.predict(X_test)

print("OLS R²:", r2_score(y_test,y_pred))

plt.scatter(y_test,y_pred)
plt.xlabel("Rzeczywiste"); plt.ylabel("Prognozowane")
plt.title("OLS: Rzeczywiste vs Prognozowane")
plt.show()

# Regularyzacja
for model in [Ridge(alpha=1.0), Lasso(alpha=0.01), ElasticNet(alpha=0.01)]:
    model.fit(X_train,y_train)
    print(model.__class__.__name__, "R²:", model.score(X_test,y_test))

# ==========================================
# 6. Szeregi czasowe (time series)
# ==========================================

df_ts = df[df["country"]=="PL"].set_index("date")
ts = df_ts["gdp_per_capita"]

adf_result = adfuller(ts.dropna())
print("ADF test p-value:", adf_result[1])

decomp = seasonal_decompose(ts, model="additive", period=5)
decomp.plot()
plt.show()

model = ARIMA(ts, order=(1,1,1)).fit()
forecast = model.forecast(steps=5)
print("Prognoza ARIMA:", forecast)

acf_vals = acf(ts.dropna(), nlags=20)
pacf_vals = pacf(ts.dropna(), nlags=20)

plt.bar(range(len(acf_vals)), acf_vals); plt.title("ACF"); plt.show()
plt.bar(range(len(pacf_vals)), pacf_vals); plt.title("PACF"); plt.show()

# ==========================================
# 7. Prezentacja wyników
# ==========================================

df.to_csv("final_results.csv", index=False)
print("📁 Wyniki zapisane do final_results.csv")


TypeError: got an unexpected keyword argument 'convert_date'