In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sfma import Data, Variable, SplineVariable, SplineSpecs, SplineUniformPrior
from sfma.model import SFMAModel

In [None]:
# Load the gdp-le table; the path is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="../data/gdp-le.csv")

In [None]:
# Keep only rows where both `gdp` and `le` are present.  A single `dropna`
# builds a proper 1-D row mask for both columns, and `.copy()` yields an
# independent frame so the column assignments below do not trigger
# SettingWithCopyWarning on a filtered slice.
df = df.dropna(subset=["gdp", "le"]).copy()

# Constant weight for every row; this column is passed to `Data` as `col_weights`.
df["weights"] = 10000.

# Log-transformed columns; these are the names later bound to `y_var` / `x_var`.
df["log.le"] = np.log(df["le"])
df["log.gdp"] = np.log(df["gdp"])

# Order rows by GDP.  Reassigning (instead of `inplace=True`) keeps the cell
# free of in-place mutation of a frame other cells hold a reference to.
df = df.sort_values("gdp")

In [None]:
# Exploratory view of `le` against `gdp`: raw scale on the left panel,
# natural-log scale on the right panel.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
ax_raw, ax_log = ax

# Pass 1-D Series (not one-column DataFrames) so scatter receives flat x/y data.
ax_raw.scatter(df["gdp"], df["le"], alpha=0.3)
ax_raw.set(xlabel="gdp", ylabel="le", title="Raw scale")

ax_log.scatter(np.log(df["gdp"]), np.log(df["le"]), alpha=0.3)
# Trailing semicolon suppresses the repr of the last expression in the cell output.
ax_log.set(xlabel="log(gdp)", ylabel="log(le)", title="Log scale");

In [None]:
# Names of the covariate (x) and observation (y) columns used by the cells below.
x_var, y_var = "log.gdp", "log.le"

In [None]:
# Wrap the frame in an sfma `Data` object, telling it which column plays which role:
# `y_var` is the observation column, `x_var` the single covariate, and
# "weights" the per-row weight column.
data = Data(df=df, col_obs=y_var, col_covs=[x_var], col_weights="weights")

In [None]:
# Two uniform (bound) priors attached to the spline, each with size=20.
#   order=1, bounds [0, inf)  -- presumably constrains the first derivative to be
#                                non-negative, i.e. an increasing curve (TODO confirm)
#   order=2, bounds (-inf, 0] -- presumably constrains the second derivative to be
#                                non-positive, i.e. a concave curve (TODO confirm)
priors = [
    SplineUniformPrior(order=1, lb=0.0, ub=np.inf, size=20),
    SplineUniformPrior(order=2, lb=-np.inf, ub=0.0, size=20),
]

# Degree-2 spline with 5 evenly spaced knots given on a relative [0, 1] scale
# ("rel_domain") and both tails flagged as linear.
spline_specs = SplineSpecs(
    knots=np.linspace(0.0, 1.0, 5),
    knots_type="rel_domain",
    degree=2,
    l_linear=True,
    r_linear=True,
)

# The model has a single variable: the spline over `x_var`.
variables = [SplineVariable(x_var, spline_specs=spline_specs, priors=priors)]

In [None]:
# Build the SFMA model from the `data` object and the `variables` list defined in the earlier cells.
model = SFMAModel(data, variables)

In [None]:
# Fit the model with trimming enabled and verbose trimming output.
# outlier_pct=0.02 presumably means 2% of observations are trimmed (TODO confirm);
# the eta options request the "bounded" method with bounds [0.0, 1.0].
model.fit(
    outlier_pct=0.02,
    trim_verbose=True,
    eta_options=dict(method="bounded", bounds=[0.0, 1.0]),
)

In [None]:
# Prediction grid: 100 evenly spaced points spanning the observed range of `x_var`.
x_grid = np.linspace(df[x_var].min(), df[x_var].max(), 100)
df_pred = pd.DataFrame({x_var: x_grid})

# Store the model's prediction for each grid point next to the grid itself.
df_pred["pred"] = model.predict(df_pred)

In [None]:
# Overlay the fitted curve on the data and mark the rows whose trim weight is
# exactly zero.  Labels and a legend are set so the three layers (all points,
# zero-weight points, fitted curve) can be told apart without reading the code.
fig, ax = plt.subplots()

# Boolean mask of rows with zero trim weight.
# NOTE(review): assumes `data.trim_weights` is ordered like the rows of `df` -- confirm.
outlier_indices = data.trim_weights == 0.0

ax.scatter(df[x_var], df[y_var],
           color="gray", edgecolor="none", alpha=0.2, label="observations")
ax.scatter(df[x_var][outlier_indices], df[y_var][outlier_indices],
           color="red", marker="x", alpha=0.2, label="trim weight == 0")
ax.plot(df_pred[x_var], df_pred["pred"], color="red", label="model prediction")

ax.set(xlabel=x_var, ylabel=y_var)
# Trailing semicolon suppresses the Legend repr in the cell output.
ax.legend();