In [1]:
import duckdb
import polars as pl
import numpy as np
#import chainladder as cl



## Final Polars version

In [102]:
def calculate_cl_factors(
    amount: str = "paid_claims_usd",
    development: str = "development_lag",
    origin="accident_date",
) -> pl.Expr:
    # list concatinated, not list in list
    sort_cols = [origin, development]
    over_cols = [origin]
    # Calculate  Chain-Ladder Factors, simple and volume-weighted with Polars
    return (
        pl.col(amount).sort_by(sort_cols).pct_change().over(over_cols).fill_null(0) + 1
    )


def calculate_cdf(ldf: str = "ldf_incurred", development: str = "development_lag"):
    cdf = pl.col(ldf).sort_by(development, descending=True).shift(1,fill_value=1).cum_prod()
    return cdf.reverse()


def calculate_factors(df, index, development, origin):
    dataframes = []

    for df_lob in df.partition_by("line_of_business"):
        df_factors = (
            df_lob
            .with_columns(
                cl_factor_incurred=calculate_cl_factors(amount="incurred_claims_usd"),
                cl_factor_paid=calculate_cl_factors(amount="paid_claims_usd"),
            )
            .group_by([index, development])
            .agg(
                ldf_incurred=pl.col("cl_factor_incurred").mean(),
                ldf_paid=pl.col("cl_factor_paid").mean(),
            )
            .sort(development)
            .with_columns(
                cdf_incurred=calculate_cdf("ldf_incurred"),
                cdf_paid=calculate_cdf("ldf_paid"),
            )
        )
        dataframes.append(df_factors)
    return pl.concat(dataframes)


with duckdb.connect("./data/db_dev.duckdb") as con:
    df = con.execute(
        "SELECT line_of_business,accident_date, accident_year, development_year, development_lag, incurred_claims_usd, paid_claims_usd FROM report_ibnr"
    ).pl()

dev_years = df.select(pl.col("development_year").unique().sort()).to_series()
df_dev = pl.concat(
    [
        (
            df.with_columns(
                cl_factor_incurred=calculate_cl_factors(amount="incurred_claims_usd"),
                cl_factor_paid=calculate_cl_factors(amount="paid_claims_usd"),
            )
            .filter(pl.col("line_of_business") == "Workers' compensation insurance")
            .filter(pl.col("development_year") <= dev_year)
            .pipe(
                calculate_factors, index="line_of_business", development="development_lag", origin="accident_date"
            )
        )
        for dev_year in [1997]
    ]
)
df_dev.sort(["line_of_business","development_lag"])

line_of_business,development_lag,ldf_incurred,ldf_paid,cdf_incurred,cdf_paid
str,i64,f64,f64,f64,f64
"""Workers' compe…",0,1.0,1.0,0.967007,4.122531
"""Workers' compe…",1,1.023213,2.206679,0.94507,1.868206
"""Workers' compe…",2,0.975516,1.316287,0.96879,1.4193
"""Workers' compe…",3,0.981517,1.149941,0.987033,1.234237
"""Workers' compe…",4,0.996586,1.081418,0.990414,1.141314
"""Workers' compe…",5,0.996526,1.046461,0.993867,1.090642
"""Workers' compe…",6,0.997014,1.032269,0.996843,1.056548
"""Workers' compe…",7,1.000317,1.02525,0.996527,1.030528
"""Workers' compe…",8,1.000575,1.020144,0.995955,1.010179
"""Workers' compe…",9,0.995955,1.010179,1.0,1.0


In [None]:
df.partition_by

In [76]:
df_dev.with_columns(cdf = calculate_cdf(ldf="ldf_incurred"))

  cdf = pl.col(ldf).sort_by([index, development], descending=True).cumprod().over(index)


line_of_business,development_lag,ldf_incurred,ldf_paid,dev_year,cdf
str,i64,f64,f64,i32,f64
"""Workers' compe…",0,1.0,1.0,1997,0.967007
"""Workers' compe…",4,0.996586,1.081418,1997,0.967007
"""Workers' compe…",7,1.000317,1.02525,1997,0.94507
"""Workers' compe…",1,1.023213,2.206679,1997,0.96879
"""Workers' compe…",2,0.975516,1.316287,1997,0.987033
"""Workers' compe…",3,0.981517,1.149941,1997,0.990414
"""Workers' compe…",5,0.996526,1.046461,1997,0.993867
"""Workers' compe…",6,0.997014,1.032269,1997,0.996843
"""Workers' compe…",8,1.000575,1.020144,1997,0.996527
"""Workers' compe…",9,0.995955,1.010179,1997,0.995955


In [83]:
pl.concat([df_group.select(pl.col("cdf_incurred").reverse()) for df_group in df_dev.partition_by("line_of_business")])

ColumnNotFoundError: cdf_incurred

Error originated just after this operation:
DF ["line_of_business", "development_lag", "ldf_incurred", "ldf_paid"]; PROJECT */5 COLUMNS; SELECTION: "None"

In [16]:
df_list = df_dev.group_by("line_of_business").agg(pl.col("development_lag"),
                                                  pl.col("ldf_incurred").sort_by("development_lag").map_elements(cdf_numpy).alias("cdf_incurred"),
                                                  pl.col("ldf_paid").sort_by("development_lag").map_elements(cdf_numpy).alias("cdf_paid"),
                                                  pl.col("ldf_incurred").sort_by("development_lag"),
                                                  ).explode(["development_lag","ldf_incurred","ldf_paid","cdf_incurred","cdf_paid"])




ComputeError: IndexError: invalid index to scalar variable.

In [5]:
def cdf_numpy(ldf):
    ldf=ldf.to_numpy()[0]
    cdf = ldf[::-1].cumprod()[::-1]
    return cdf

In [28]:
df_list = df_dev.group_by("line_of_business").agg(pl.col("ldf_incurred")).with_columns(cdf_numpy(pl.col("ldf_incurred")))


AttributeError: 'Expr' object has no attribute 'to_numpy'

In [22]:
df_list

line_of_business,ldf_incurred
str,list[f64]
"""Workers' compe…","[1.0, 0.996586, … 0.995955]"


In [24]:
df_list.group_by("line_of_business").agg(pl.col("ldf_incurred").map_elements(cdf_numpy).alias("cdf"))



line_of_business,cdf
str,object
"""Workers' compe…",[0.96700734 0.96700734 0.97031966 0.97370229 0.97339376 0.97283483  0.950765 0.97462782 0.99298141 0.99595524]


In [145]:
df_list.get_column("cl_factor_incurred").to_numpy()[0]

array([1.0232127 , 0.98151668, 0.99658636, 0.99652601, 1.00031696,
       1.        , 0.97551597, 0.9970141 , 1.00057454, 0.99595524])

## Chainladder package


In [67]:
!pip install chainladder

Defaulting to user installation because normal site-packages is not writeable
Collecting chainladder
  Downloading chainladder-0.8.18-py3-none-any.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25hCollecting scikit-learn>=0.23
  Downloading scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.1/12.1 MB[0m [31m43.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting matplotlib
  Downloading matplotlib-3.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m49.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting sparse>=0.9
  Downloading sparse-0.15.1-py2.py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116

In [103]:
import chainladder as cl
test = cl.load_sample('clrd').groupby('LOB').sum()
test

  if isinstance(self.groups.dtypes.index, pd.MultiIndex):
  index = pd.DataFrame(self.groups.dtypes.index)


Unnamed: 0,Triangle Summary
Valuation:,1997-12
Grain:,OYDY
Shape:,"(6, 6, 10, 10)"
Index:,[LOB]
Columns:,"[IncurLoss, CumPaidLoss, BulkLoss, EarnedPremDIR, EarnedPremCeded, EarnedPremNet]"


In [135]:
dev = cl.Development(average="simple").fit_transform(test["IncurLoss"])
ibnr = cl.Chainladder().fit(dev["IncurLoss"])
ibnr.ultimate_.to_frame().T

  xp.nansum(w * x * y, axis) - xp.nansum(x * w, axis) * xp.nanmean(y, axis)
  intercept = xp.nanmean(y, axis) - slope * xp.nanmean(x, axis)


LOB,comauto,medmal,othliab,ppauto,prodliab,wkcomp
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1988,629146.0,236376.0,328473.0,8723062.0,123424.0,1356500.0
1989,679231.398324,241107.011377,367994.093378,9855793.0,127828.161046,1451549.0
1990,728839.971396,270345.278304,396925.632756,10786180.0,142673.097646,1568354.0
1991,730441.283895,300337.404311,485515.583144,10803840.0,126807.537539,1635776.0
1992,761066.208582,313435.895437,498677.539336,11725710.0,102629.397212,1594492.0
1993,816279.244374,330108.675136,563184.66028,12604040.0,129251.908274,1534061.0
1994,888304.311678,355154.143799,638800.138275,13413680.0,148422.276704,1548814.0
1995,895392.01555,372686.756583,601429.407051,13493240.0,141974.376783,1564075.0
1996,872985.851884,364739.366542,680283.721933,13184950.0,136922.012916,1474465.0
1997,890434.440272,346423.102362,679600.097413,12990520.0,127422.559491,1452841.0


In [136]:
ibnr.full_triangle_.loc["wkcomp"]

Unnamed: 0,12,24,36,48,60,72,84,96,108,120,132,9999
1988,1273279,1343238,1356530,1351429,1379353,1377612,1363000,1362205,1362009,1356500,1356500,1356500
1989,1383700,1441224,1433627,1472238,1473424,1460943,1458170,1455562,1457444,1451549,1451549,1451549
1990,1477245,1621324,1605337,1602578,1584751,1570110,1568607,1573819,1574723,1568354,1568354,1568354
1991,1629195,1694048,1686692,1661484,1639743,1638468,1640956,1641476,1642419,1635776,1635776,1635776
1992,1634231,1749947,1689300,1621422,1600524,1604332,1599542,1600049,1600968,1594492,1594492,1594492
1993,1719891,1714687,1643956,1556461,1548909,1543528,1538919,1539407,1540292,1534061,1534061,1534061
1994,1785215,1750992,1638790,1569161,1563804,1558372,1553719,1554211,1555104,1548814,1548814,1548814
1995,1750767,1692856,1614463,1584622,1579213,1573727,1569028,1569525,1570427,1564075,1564075,1564075
1996,1625977,1560165,1521966,1493835,1488735,1483564,1479134,1479603,1480453,1474465,1474465,1474465
1997,1502410,1537285,1499646,1471928,1466903,1461807,1457442,1457904,1458742,1452841,1452841,1452841


In [134]:
ibnr.cdf_.loc["wkcomp"]

Unnamed: 0,12-Ult,24-Ult,36-Ult,48-Ult,60-Ult,72-Ult,84-Ult,96-Ult,108-Ult,120-Ult,132-Ult
(All),0.9609,0.9418,0.9671,0.9867,0.9908,0.9943,0.997,0.9966,0.996,1.0,1.0


In [132]:
dev.cdf_.loc["wkcomp"]["IncurLoss"]*test.loc["wkcomp"]["IncurLoss"].latest_diagonal.val_to_dev()

Unnamed: 0,12,24,36,48,60,72,84,96,108,120
1988,,,,,,,,,,
1989,,,,,,,,,1451549.0,
1990,,,,,,,,1568354.0,,
1991,,,,,,,1635776.0,,,
1992,,,,,,1594492.0,,,,
1993,,,,,1534061.0,,,,,
1994,,,,1548814.0,,,,,,
1995,,,1564075.0,,,,,,,
1996,,1474465.0,,,,,,,,
1997,1452841.0,,,,,,,,,


In [118]:
test.loc["wkcomp"]["IncurLoss"].latest_diagonal

Unnamed: 0,1997
1988,1356500
1989,1457444
1990,1573819
1991,1640956
1992,1604332
1993,1548909
1994,1569161
1995,1614463
1996,1560165
1997,1502410


In [119]:
dev.cdf_.loc["wkcomp"]["IncurLoss"]

Unnamed: 0,12-Ult,24-Ult,36-Ult,48-Ult,60-Ult,72-Ult,84-Ult,96-Ult,108-Ult
(All),0.967,0.9451,0.9688,0.987,0.9904,0.9939,0.9968,0.9965,0.996


In [56]:
dev.ldf_.loc["wkcomp"].to_frame().sort_index(ascending=False).T["IncurLoss"]

development
96-108     1.000575
84-96      1.000317
72-84      0.997014
60-72      0.996526
48-60      0.996586
36-48      0.981517
24-36      0.975516
12-24      1.023213
108-120    0.995955
Name: IncurLoss, dtype: float64

In [112]:
import pandas as pd
fin_ibnr = ibnr.ibnr_.loc["wkcomp"].to_frame()
ldf_dev = dev.ldf_.loc["wkcomp"].to_frame().T["IncurLoss"].to_list()
ldf_dev.reverse()
cdf_dev = dev.cdf_.loc["wkcomp"].to_frame().T["IncurLoss"].to_list()

df_fin_ibnr= fin_ibnr.assign(
    ultimate = ibnr.ultimate_.loc["wkcomp"].to_frame(),
    incurred=test.loc["wkcomp"].latest_diagonal.to_frame().T["IncurLoss"].to_list()
    ).dropna().assign(ldf=ldf_dev, cdf=cdf_dev, test = lambda df_: df_["cdf"]*df_["incurred"])

In [123]:
ibnr.ultimate_.loc["wkcomp"].to_frame().values

array([[1356500.        ],
       [1451548.98829597],
       [1568391.15892512],
       [1635970.51017018],
       [1595120.00394135],
       [1534733.9580067 ],
       [1548271.72693472],
       [1561345.4992816 ],
       [1469441.33580175],
       [1443681.72281264]])

In [124]:
dev.cdf_.loc["wkcomp"].to_frame().T["IncurLoss"].to_list()

[0.967007339208989,
 0.9450697181687259,
 0.9687895933000701,
 0.9870332439944216,
 0.9904141567645723,
 0.9938668396459205,
 0.9968433179273872,
 0.996527459264213,
 0.9959552396496646]

In [115]:
ibnr.full_triangle_.loc["wkcomp"]

Unnamed: 0,12,24,36,48,60,72,84,96,108,120,132,9999
1988,1273279,1343238,1356530,1351429,1379353,1377612,1363000,1362205,1362009,1356500,1356500,1356500
1989,1383700,1441224,1433627,1472238,1473424,1460943,1458170,1455562,1457444,1451549,1451549,1451549
1990,1477245,1621324,1605337,1602578,1584751,1570110,1568607,1573819,1574761,1568391,1568391,1568391
1991,1629195,1694048,1686692,1661484,1639743,1638468,1640956,1641632,1642614,1635971,1635971,1635971
1992,1634231,1749947,1689300,1621422,1600524,1604332,1599981,1600640,1601598,1595120,1595120,1595120
1993,1719891,1714687,1643956,1556461,1548909,1543597,1539411,1540045,1540967,1534734,1534734,1534734
1994,1785215,1750992,1638790,1569161,1562572,1557213,1552990,1553630,1554560,1548272,1548272,1548272
1995,1750767,1692856,1614463,1582411,1575766,1570362,1566104,1566749,1567686,1561345,1561345,1561345
1996,1625977,1560165,1519432,1489267,1483013,1477928,1473919,1474527,1475409,1469441,1469441,1469441
1997,1502410,1532815,1492796,1463160,1457016,1452019,1448081,1448678,1449545,1443682,1443682,1443682


In [71]:
dev.ldf_.loc["wkcomp"].to_frame().T["IncurLoss"]

development
12-24      1.023213
24-36      0.975516
36-48      0.981517
48-60      0.996586
60-72      0.996526
72-84      0.997014
84-96      1.000317
96-108     1.000575
108-120    0.995955
Name: IncurLoss, dtype: float64

In [70]:
dev.cdf_.loc["wkcomp"].to_frame().T["IncurLoss"]

development
12-Ult     0.967007
24-Ult     0.945070
36-Ult     0.968790
48-Ult     0.987033
60-Ult     0.990414
72-Ult     0.993867
84-Ult     0.996843
96-Ult     0.996527
108-Ult    0.995955
Name: IncurLoss, dtype: float64

In [90]:
import chainladder as cl
import pandas as pd

triangle = cl.Triangle(
    df,
    origin="accident_date",
    development="development_date",
    index="line_of_business",
    columns=[
        "incremental_paid_claims_usd"
    ],
    grain="oydy",
    cumulative=True
)
dev = cl.TweedieGLM(
    design_matrix="C(line_of_business) + C(development) + C(origin)", link="log", power=1
).fit(triangle)

# Grab LDFs vs traditional approach
glm = dev.ldf_.iloc[..., 0, :].T.iloc[:, 0].rename("GLM")
traditional = cl.Development().fit(triangle).ldf_.T.iloc[:, 0].rename("Traditional")

# Plot data
results = pd.concat((glm, traditional), axis=1)

  origin_date = origin_date.dt.to_period(self.origin_grain).dt.to_timestamp(
  development_date = development_date.dt.to_period(
  o = pd.period_range(
  d = pd.period_range(
  return pd.DatetimeIndex(self.odims, name="origin").to_period(freq=freq)


ValueError: negative dimensions are not allowed

In [39]:
X

Unnamed: 0,Intercept,C(line_of_business)[T.Medical expense insurance],C(line_of_business)[T.Motor vehicle liability insurance],C(line_of_business)[T.Workers' compensation insurance],C(development_date)[T.1989-02-20 00:00:00],C(development_date)[T.1990-02-20 00:00:00],C(development_date)[T.1991-02-20 00:00:00],C(development_date)[T.1992-02-20 00:00:00],C(development_date)[T.1993-02-20 00:00:00],C(development_date)[T.1994-02-20 00:00:00],...,C(development_date)[T.1997-02-20 00:00:00],C(accident_date)[T.1989-02-20 00:00:00],C(accident_date)[T.1990-02-20 00:00:00],C(accident_date)[T.1991-02-20 00:00:00],C(accident_date)[T.1992-02-20 00:00:00],C(accident_date)[T.1993-02-20 00:00:00],C(accident_date)[T.1994-02-20 00:00:00],C(accident_date)[T.1995-02-20 00:00:00],C(accident_date)[T.1996-02-20 00:00:00],C(accident_date)[T.1997-02-20 00:00:00]
0,1.0,1,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1.0,1,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,1.0,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
3,1.0,1,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
4,1.0,1,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,1.0,0,1,0,0,0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0
216,1.0,0,1,0,0,0,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
217,1.0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
218,1.0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [43]:
X

Unnamed: 0,incremental_incurred_claims_usd
0,421814.0
1,-6641.0
2,-24473.0
3,-38948.0
4,-24250.0
...,...
215,-48499.0
216,-45926.0
217,-56449.0
218,-12845.0


In [91]:
triangle.to_frame(keepdims=True, implicit_axis=True).valuation.unique()

<DatetimeArray>
['1987-12-31 23:59:59.999999999', '1988-12-31 23:59:59.999999999',
 '1989-12-31 23:59:59.999999999', '1990-12-31 23:59:59.999999999',
 '1991-12-31 23:59:59.999999999', '1992-12-31 23:59:59.999999999',
 '1993-12-31 23:59:59.999999999', '1994-12-31 23:59:59.999999999',
 '1995-12-31 23:59:59.999999999', '1996-12-31 23:59:59.999999999',
 '1988-11-30 23:59:59.999999999', '1989-11-30 23:59:59.999999999',
 '1990-11-30 23:59:59.999999999', '1992-11-30 23:59:59.999999999',
 '1993-11-30 23:59:59.999999999', '1994-11-30 23:59:59.999999999',
 '1996-11-30 23:59:59.999999999']
Length: 17, dtype: datetime64[ns]

In [92]:
df.development_date.unique()

<DatetimeArray>
['1990-02-20 00:00:00', '1991-02-20 00:00:00', '1992-02-20 00:00:00',
 '1993-02-20 00:00:00', '1994-02-20 00:00:00', '1995-02-20 00:00:00',
 '1996-02-20 00:00:00', '1997-02-20 00:00:00', '1989-02-20 00:00:00',
 '1988-02-20 00:00:00']
Length: 10, dtype: datetime64[us]

In [74]:
tri_test

  return pd.DatetimeIndex(self.odims, name="origin").to_period(freq=freq)
  return pd.DatetimeIndex(self.odims, name="origin").to_period(freq=freq)


Unnamed: 0,11,12,23,24,35,36,48,59,60,71,72,83,84,96,107,108,120
1987,,11644995.0,,11674240.0,,11653597.0,11630882.0,,11593868.0,,11551625.0,,11463312.0,11420238.0,,11415560.0,11396981.0
1988,13123290.0,,13118789.0,,13113024.0,,13050144.0,12959037.0,,12866709.0,,12787372.0,,12757420.0,12743440.0,,
1989,14776079.0,,14670690.0,,,14479699.0,14324680.0,14183178.0,,14033498.0,,,13948139.0,13925679.0,,,
1990,15318373.0,,,15112547.0,,14877662.0,14615540.0,14380205.0,,,14205778.0,,14154882.0,,,,
1991,,16828857.0,,16457307.0,,15999385.0,15538214.0,,15249286.0,,15161066.0,,,,,,
1992,18169370.0,,17590902.0,,17080187.0,,16485467.0,16281774.0,,,,,,,,,
1993,19414898.0,,18609089.0,,,17854178.0,17521037.0,,,,,,,,,,
1994,19502850.0,,,18668388.0,,17901550.0,,,,,,,,,,,
1995,,19142090.0,,17910743.0,,,,,,,,,,,,,
1996,18113581.0,,,,,,,,,,,,,,,,


In [76]:
df.development_date.unique()

<DatetimeArray>
['1990-02-20 00:00:00', '1991-02-20 00:00:00', '1992-02-20 00:00:00',
 '1993-02-20 00:00:00', '1994-02-20 00:00:00', '1995-02-20 00:00:00',
 '1996-02-20 00:00:00', '1997-02-20 00:00:00', '1989-02-20 00:00:00',
 '1988-02-20 00:00:00']
Length: 10, dtype: datetime64[us]

In [71]:
X

Unnamed: 0,Intercept,C(development_date)[T.1989-02-20 00:00:00],C(development_date)[T.1990-02-20 00:00:00],C(development_date)[T.1991-02-20 00:00:00],C(development_date)[T.1992-02-20 00:00:00],C(development_date)[T.1993-02-20 00:00:00],C(development_date)[T.1994-02-20 00:00:00],C(development_date)[T.1995-02-20 00:00:00],C(development_date)[T.1996-02-20 00:00:00],C(development_date)[T.1997-02-20 00:00:00]
0,1.0,0,1,0,0,0,0,0,0,0
1,1.0,0,0,1,0,0,0,0,0,0
2,1.0,0,0,0,1,0,0,0,0,0
3,1.0,0,0,0,0,1,0,0,0,0
4,1.0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...
215,1.0,0,0,0,0,1,0,0,0,0
216,1.0,0,0,0,0,0,1,0,0,0
217,1.0,0,0,0,0,0,0,1,0,0
218,1.0,0,0,0,0,0,0,0,1,0


In [14]:
X

Unnamed: 0,Intercept,C(line_of_business)[T.Medical expense insurance],C(line_of_business)[T.Motor vehicle liability insurance],C(line_of_business)[T.Workers' compensation insurance],C(development_date)[T.1989-02-20 00:00:00],C(development_date)[T.1990-02-20 00:00:00],C(development_date)[T.1991-02-20 00:00:00],C(development_date)[T.1992-02-20 00:00:00],C(development_date)[T.1993-02-20 00:00:00],C(development_date)[T.1994-02-20 00:00:00],C(development_date)[T.1995-02-20 00:00:00],C(development_date)[T.1996-02-20 00:00:00],C(development_date)[T.1997-02-20 00:00:00]
0,1.0,1,0,0,0,1,0,0,0,0,0,0,0
1,1.0,1,0,0,0,0,1,0,0,0,0,0,0
2,1.0,1,0,0,0,0,0,1,0,0,0,0,0
3,1.0,1,0,0,0,0,0,0,1,0,0,0,0
4,1.0,1,0,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
215,1.0,0,1,0,0,0,0,0,1,0,0,0,0
216,1.0,0,1,0,0,0,0,0,0,1,0,0,0
217,1.0,0,1,0,0,0,0,0,0,0,1,0,0
218,1.0,0,1,0,0,0,0,0,0,0,0,1,0


In [None]:

glm = GeneralizedLinearRegressor(family='poisson', link='log')
glm.fit(X, y.values)

# After fitting, you can predict with the model and/or evaluate its performance
# predictions = glm.predict(X.design_matrix)

# Print model coefficients, intercept, or use other model attributes/methods as needed
print("Coefficients:", np.exp(glm.coef_))
print("Intercept:", glm.intercept_)

In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220 entries, 0 to 219
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype         
---  ------                           --------------  -----         
 0   line_of_business_id              220 non-null    int32         
 1   line_of_business                 220 non-null    object        
 2   accident_date                    220 non-null    datetime64[us]
 3   development_date                 220 non-null    datetime64[us]
 4   accident_year                    220 non-null    int32         
 5   development_year                 220 non-null    int32         
 6   development_lag                  220 non-null    int64         
 7   incurred_claims_usd              220 non-null    float64       
 8   paid_claims_usd                  220 non-null    float64       
 9   reserved_claims_usd              220 non-null    float64       
 10  earned_premium_usd               220 non-null    float64      

In [8]:
type(df)

pandas.core.frame.DataFrame

In [5]:
import pandas as pd

def calculate_chain_ladder_factors(df, development='development_lag', origin ='accident_date', amount='incurred_claims_usd'):
    # Group data by development period
    grouped = df.groupby(["line_of_business", development])

    # Calculate simple average factors
    simple_factors = grouped[amount].pct_change() + 1  

    # Calculate volume-weighted factors
    def weighted_factor(group):
        group['volume'] = group[amount].shift(1)
        return ((group[amount] * group['volume']).sum() / group['volume'].sum())

    volume_weighted_factors = grouped.apply(weighted_factor)

    return simple_factors, volume_weighted_factors

# Example usage (assuming you have your data loaded in 'df')
simple_factors, volume_weighted_factors = calculate_chain_ladder_factors(df) 
print("simple factors:", simple_factors.head())
print(volume_weighted_factors)


simple factors: 0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: incurred_claims_usd, dtype: float64
line_of_business                   development_lag
General liability insurance        0                  7.205721e+05
                                   1                  7.466600e+05
                                   2                  7.188816e+05
                                   3                  6.691707e+05
                                   4                  6.087223e+05
                                   5                  5.739989e+05
                                   6                  5.579097e+05
                                   7                  5.233254e+05
                                   8                  4.994040e+05
                                   9                           NaN
Medical expense insurance          0                  4.881687e+05
                                   1                  4.609513e+05
                                   2           

  return ((group[amount] * group['volume']).sum() / group['volume'].sum())
  return ((group[amount] * group['volume']).sum() / group['volume'].sum())
  return ((group[amount] * group['volume']).sum() / group['volume'].sum())
  return ((group[amount] * group['volume']).sum() / group['volume'].sum())
  volume_weighted_factors = grouped.apply(weighted_factor)


In [16]:
import polars as pl

def calculate_chain_ladder_factors(df, development='development_lag', origin='accident_date', amount='incurred_claims_usd'):  
    def simple_factors_calc(df: pl.DataFrame) -> pl.Series:
        return df[amount].pct_change().fill_null(0) + 1

    def volume_weighted_factors_calc(df: pl.DataFrame) -> pl.Series:
        df = df.with_column(pl.col(amount).shift().alias("volume"))
        return (df[amount] * df["volume"]).sum() / df["volume"].sum()

    result = (df.group_by(development).agg(
        pl.map_groups(pl.col(*), simple_factors_calc).alias("simple_factors"),
        pl.map_groups(volume_weighted_factors_calc).alias("volume_weighted_factors")
    ))

    return result 
df_cl = calculate_chain_ladder_factors(pl.DataFrame(df)) 
print(df_cl)



SyntaxError: invalid syntax (2464163514.py, line 12)

In [8]:
import pandas as pd

# Sample data
data = {'AY': ['2018', '2018', '2018', '2018', '2019', '2019', '2019', '2020', '2020'],
        'DevelopmentYear': [1, 2, 3, 4, 1, 2, 3, 1, 2],
        'CumulativeClaims': [100, 200, 250, 270, 120, 220, 260, 150, 230],
        'ClaimsVolume': [50, 45, 40, 35, 55, 50, 45, 60, 55]}  # ClaimsVolume for volume-weighted calculation

df = pd.DataFrame(data)

# Calculate Simple Chain-Ladder Factors
df['NextCumulativeClaims'] = df.groupby('AY')['CumulativeClaims'].shift(-1)
df['SimpleCLFactor'] = df['NextCumulativeClaims'] / df['CumulativeClaims']
simple_cl_factors = df.dropna(subset=['SimpleCLFactor'])[['AY', 'DevelopmentYear', 'SimpleCLFactor']]

print(simple_cl_factors)


     AY  DevelopmentYear  SimpleCLFactor
0  2018                1        2.000000
1  2018                2        1.250000
2  2018                3        1.080000
4  2019                1        1.833333
5  2019                2        1.181818
7  2020                1        1.533333


In [15]:
from glum import GeneralizedLinearRegressor
from formulaic import model_matrix
import numpy as np

y,X= model_matrix('log(incurred_claims_usd)~ C(line_of_business):C(development_date)', df)

glm = GeneralizedLinearRegressor(family='poisson', link='log')
glm.fit(X, y)

# After fitting, you can predict with the model and/or evaluate its performance
# predictions = glm.predict(X.design_matrix)

# Print model coefficients, intercept, or use other model attributes/methods as needed
print("Coefficients:", np.exp(glm.coef_))
print("Intercept:", glm.intercept_)

Coefficients: [1.         0.99806346 0.99883872 1.00000838 1.00066193 1.0010448
 1.00109112 1.00094199 1.00075485 1.0004479  0.99471466 1.00850333
 0.99986524 0.98969702 1.01684059 1.0005165  0.98569909 1.02452809
 1.00110745 0.98215462 1.03131532 1.00195428 0.97898696 1.03777863
 1.00271436 0.97545299 1.04387387 1.00354005 0.9724235  1.04964986
 1.00402275 0.96998121 1.05491137 1.00433685 0.96792666 1.05964424
 1.00431134 0.96555895 1.06391838 1.00423739]
Intercept: 2.645369747496562


  y = column_or_1d(y, warn=True)


## Working Chainladder
Both simple and volume, but ugly

In [37]:
cl_test.groupby(["line_of_business","development_lag"])[["incurred_claims_usd","NextAmount"]].sum().assign(VolumeCL = lambda x: x["NextAmount"] / x["incurred_claims_usd"])

Unnamed: 0_level_0,Unnamed: 1_level_0,incurred_claims_usd,NextAmount,VolumeCL
line_of_business,development_lag,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Medical expense insurance,0,4313855.0,4150080.0,0.962035
Medical expense insurance,1,3606561.0,3413314.0,0.946418
Medical expense insurance,2,2887737.0,2692739.0,0.932474
Medical expense insurance,3,2225398.0,2099337.0,0.943354
Medical expense insurance,4,1689871.0,1570471.0,0.929344
Medical expense insurance,5,1208966.0,1153294.0,0.953951
Medical expense insurance,6,822851.0,775924.0,0.94297
Medical expense insurance,7,495513.0,486868.0,0.982553
Medical expense insurance,8,241022.0,236376.0,0.980724


## Polars version

In [None]:
def calculate_cl_factors(development: pl.Expr, origin=pl.Expr, amount=pl.Expr) -> pl.Expr:
    # Calculate  Chain-Ladder Factors, simple and volume-weighted with Polars
    cl_factor = (pl.col("incurred_claims_usd").pct_change().sort_by("line_of_business","accident_date","development_lag").over(["line_of_business","accident_date"]).fill_null(0)) +1)
    df['NextAmount'] = df.groupby([index, origin])[amount].shift(-1)
    df['SimpleCLFactor'] = df['NextAmount'] / df[amount]
Following is 

In [72]:
sort_cols = index + origin + development
sort_cols


['line_of_business', 'accident_date', 'development_date']

In [17]:
index = ["line_of_business"]
origin = ["accident_date"]
development = ["development_lag"]
amounts = ["incurred_claims_usd", "paid_claims_usd"]
# list concatinated, not list in list
sort_cols = [index, origin, development]
over_cols = [index, origin]

df_pl = pl.DataFrame(df)
df_pl_cl = (
    df_pl.select(index + origin + development + amounts)
    .sort(index + origin + development)
    .with_columns(
        lead_amount=pl.col(amounts[0]).shift(1).over(index + origin),
        cl_factor=(pl.col(amounts[0]).pct_change().over(index + origin).fill_null(0))
        + 1,
    )
    #.filter(pl.col("line_of_business") == "Medical expense insurance")
    
)
#Volum weighted average of cl_factor for line_of_business and development_date
cl_vol = df_pl_cl.group_by(index + development).agg(pl.col("cl_factor").mean().alias("cl_factor_simple"),
                                                    pl.col("cl_factor"),
                                                    pl.col("lead_amount"),
                                                    pl.col(amounts[0]),
)#.with_columns(volume_cl=pl.col("lead_amount") / pl.col(amounts[0]))

In [18]:
cl_vol.filter(pl.col("line_of_business") == "Medical expense insurance")

line_of_business,development_lag,cl_factor_simple,cl_factor,lead_amount,incurred_claims_usd
str,i64,f64,list[f64],list[f64],list[f64]
"""Medical expens…",0,1.0,"[1.0, 1.0, … 1.0]","[null, null, … null]","[380154.0, 363266.0, … 536705.0]"
"""Medical expens…",1,0.961841,"[0.907879, 1.002918, … 0.97014]","[380154.0, 363266.0, … 560248.0]","[345134.0, 364326.0, … 543519.0]"
"""Medical expens…",2,0.946369,"[0.936515, 0.968424, … 0.947112]","[345134.0, 364326.0, … 554926.0]","[323223.0, 352822.0, … 525577.0]"
"""Medical expens…",3,0.933092,"[0.954706, 0.930373, … 0.920713]","[323223.0, 352822.0, … 507586.0]","[308583.0, 328256.0, … 467341.0]"
"""Medical expens…",7,0.942734,"[0.924816, 0.957566, 0.94582]","[260387.0, 265990.0, 296474.0]","[240810.0, 254703.0, 280411.0]"
"""Medical expens…",4,0.942636,"[0.963196, 0.907137, … 0.955923]","[308583.0, 328256.0, … 428346.0]","[297226.0, 297773.0, … 409466.0]"
"""Medical expens…",5,0.929832,"[0.919122, 0.937318, … 0.914686]","[297226.0, 297773.0, … 395223.0]","[273187.0, 279108.0, … 361505.0]"
"""Medical expens…",6,0.953941,"[0.953146, 0.953, … 0.952689]","[273187.0, 279108.0, … 346853.0]","[260387.0, 265990.0, … 330443.0]"
"""Medical expens…",8,0.983053,"[1.00088, 0.965226]","[240810.0, 254703.0]","[241022.0, 245846.0]"
"""Medical expens…",9,0.980724,[0.980724],[241022.0],[236376.0]


In [21]:
df.sample(10)

Unnamed: 0,line_of_business_id,line_of_business,accident_date,development_date,accident_year,development_year,development_lag,incurred_claims_usd,paid_claims_usd,reserved_claims_usd,earned_premium_usd,incremental_incurred_claims_usd,incremental_paid_claims_usd,incremental_reserved_claims_usd
66,8,General liability insurance,1988-02-20,1988-02-20,1988,1988,0,448957.0,39166.0,409791.0,972971.0,448957.0,39166.0,409791.0
49,4,Motor vehicle liability insurance,1991-02-20,1997-02-20,1991,1997,6,11561602.0,11425383.0,136219.0,15781531.0,-47329.0,114965.0,-162294.0
201,8,General liability insurance,1990-02-20,1993-02-20,1990,1993,3,598058.0,334211.0,263847.0,1044412.0,-4382.0,107050.0,-111432.0
198,8,General liability insurance,1990-02-20,1990-02-20,1990,1990,0,593294.0,49382.0,543912.0,1044412.0,593294.0,49382.0,543912.0
134,8,General liability insurance,1993-02-20,1995-02-20,1993,1995,2,775163.0,341873.0,433290.0,1206095.0,-34235.0,142651.0,-176886.0
3,1,Medical expense insurance,1990-02-20,1993-02-20,1990,1993,3,351752.0,163695.0,188057.0,460119.0,-38948.0,44731.0,-83679.0
43,4,Motor vehicle liability insurance,1991-02-20,1991-02-20,1991,1991,0,12561606.0,4245091.0,8316515.0,15781531.0,12561606.0,4245091.0,8316515.0
110,8,General liability insurance,1995-02-20,1996-02-20,1995,1996,1,844633.0,212724.0,631909.0,1482542.0,-62440.0,162680.0,-225120.0
6,1,Medical expense insurance,1990-02-20,1996-02-20,1990,1996,6,296474.0,225199.0,71275.0,460119.0,-13344.0,11227.0,-24571.0
58,1,Medical expense insurance,1991-02-20,1994-02-20,1991,1994,3,391085.0,199262.0,191823.0,436063.0,-21518.0,52915.0,-74433.0


In [174]:
index = ["line_of_business"]
origin = ["accident_date"]
development = ["development_lag"]
amounts = ["incurred_claims_usd", "paid_claims_usd"]
def calculate_simple_chain_ladder_factors(df, development='development_lag', origin='accident_date', amount='incurred_claims_usd', index='line_of_business'):
    df['NextAmount'] = df.groupby([index, origin])[amount].shift(-1)
    df['SimpleCLFactor'] = df['NextAmount'] / df[amount]
    return df.dropna(subset=['SimpleCLFactor'])[[index, origin, development, amount,"NextAmount", 'SimpleCLFactor']]
cl_test = calculate_simple_chain_ladder_factors(df).query("line_of_business == 'Medical expense insurance'")
cl_test.reset_index().groupby(["line_of_business","development_lag"])["SimpleCLFactor"].mean()

  df['NextAmount'] = df.groupby([index, origin])[amount].shift(-1)


TypeError: 'GroupBy' object is not subscriptable

In [172]:
df

line_of_business,development_lag,cl_factor_incurred,dev_year,cdf_incurred
str,i64,f64,i32,f64
"""Workers' compe…",0,1.0,1997,0.995955
"""Workers' compe…",1,1.023213,1997,0.996527
"""Workers' compe…",7,1.000317,1997,0.996843
"""Workers' compe…",9,0.995955,1997,0.993867
"""Workers' compe…",2,0.975516,1997,0.990414
"""Workers' compe…",3,0.981517,1997,0.987033
"""Workers' compe…",4,0.996586,1997,0.96879
"""Workers' compe…",5,0.996526,1997,0.94507
"""Workers' compe…",6,0.997014,1997,0.967007
"""Workers' compe…",8,1.000575,1997,0.967007


## Numpy

In [31]:
index = ["line_of_business"]
origin = ["accident_date"]
development = ["development_lag"]
amounts = ["incurred_claims_usd", "paid_claims_usd"]

import numpy as np
def calculate_simple_ldf_factors(matrix):
    # Calculate the ratio of each element to its previous element in each row (AY)
    simple_factors = matrix[:, 1:] / matrix[:, :-1]
    # Calculate the average factor for each development lag across all AYs
    # Ignore NaN values by using np.nanmean
    avg_simple_factors = np.nanmean(simple_factors, axis=0)
    avg_simple_factors[np.isnan(avg_simple_factors)]=1.0
    return avg_simple_factors

def calculate_simple_cdf_factors(ldf):
    return np.cumprod(ldf[::-1])[::-1]


tri_pl = df.pivot(index=index+origin, columns=development, values="incurred_claims_usd", aggregate_function="sum")

In [32]:
tri_pl

line_of_business,accident_date,0,1,2,3,4,5,6,7,8,9
str,date,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Medical expens…",1989-02-20,363266.0,364326.0,352822.0,328256.0,297773.0,279108.0,265990.0,254703.0,245846.0,
"""Workers' compe…",1990-02-20,1.477245e6,1.621324e6,1.605337e6,1.602578e6,1.584751e6,1.57011e6,1.568607e6,1.573819e6,,
"""Workers' compe…",1995-02-20,1.750767e6,1.692856e6,1.614463e6,,,,,,,
"""Medical expens…",1995-02-20,562089.0,554926.0,525577.0,,,,,,,
"""General liabil…",1993-02-20,805843.0,809398.0,775163.0,728328.0,717128.0,,,,,
…,…,…,…,…,…,…,…,…,…,…,…
"""Motor vehicle …",1995-02-20,1.6282921e7,1.5575973e7,1.4937057e7,,,,,,,
"""General liabil…",1990-02-20,593294.0,614502.0,602440.0,598058.0,577346.0,550370.0,546687.0,544668.0,,
"""General liabil…",1989-02-20,524433.0,528356.0,568232.0,538732.0,525421.0,510165.0,503168.0,499956.0,499404.0,
"""Workers' compe…",1993-02-20,1.719891e6,1.714687e6,1.643956e6,1.556461e6,1.548909e6,,,,,


In [7]:
tri_pl_group = tri_pl.group_by("line_of_business")

development_lag = {}
ldf_groups = {}
cdf_groups = {}
for g,values  in tri_pl_group:
    single_triangle = values.sort(origin).select(pl.exclude(index + origin))
    development_lag[g] = single_triangle.columns
    ldf_groups[g] = calculate_simple_ldf_factors(single_triangle.to_numpy())
    cdf_groups[g] = calculate_simple_cdf_factors(ldf_groups[g])

  for g,values  in tri_pl_group:


In [62]:

def calculated_cl_factors(df, amount=['incurred_claims_usd'], index= ["line_of_business"], development=['development_lag'], origin=['accident_date']):
    
    for i, amount in enumerate(amounts):
        triangles = df.pivot(index=index+origin, columns=development, values=amount, aggregate_function="sum")
        dataframes = []
        for g, values in triangles.group_by("line_of_business"):
            single_triangle = values.sort(origin).select(pl.exclude(index + origin))
            ldf = calculate_simple_ldf_factors(single_triangle.to_numpy())
            cdf = calculate_simple_cdf_factors(ldf)
            dataframes.append(
                pl.DataFrame(
                    {
                        "line_of_business": g,
                        "development_lag": [int(col) for col in single_triangle.columns[:-1]],
                        "ldf_"+amount: ldf,
                        "cdf_"+amount: cdf,
                    },
                    
                ).sort(["line_of_business","development_lag"])
            )
        if i==0:
            df_cl_factors = pl.concat(dataframes)
        else:
            df_cl_factors = df_cl_factors.join(pl.concat(dataframes), on = index + development)

    return df_cl_factors

In [63]:
test_dev = calculated_cl_factors(df)

  for g, values in triangles.group_by("line_of_business"):


In [64]:
test_dev

line_of_business,development_lag,ldf_incurred_claims_usd,cdf_incurred_claims_usd,ldf_paid_claims_usd,cdf_paid_claims_usd
str,i64,f64,f64,f64,f64
"""Medical expens…",0,0.961841,0.645463,6.075951,25.519739
"""Medical expens…",1,0.946369,0.67107,1.97625,4.200123
"""Medical expens…",2,0.933092,0.7091,1.383782,2.125299
"""Medical expens…",3,0.942636,0.759946,1.199759,1.535863
"""Medical expens…",4,0.929832,0.806193,1.101576,1.280143
…,…,…,…,…,…
"""Motor vehicle …",4,0.993621,0.986291,1.021848,1.042829
"""Motor vehicle …",5,0.994988,0.992624,1.010795,1.020533
"""Motor vehicle …",6,0.99865,0.997623,1.005396,1.009634
"""Motor vehicle …",7,0.999516,0.998972,1.002952,1.004216


In [49]:
"ldf_"+amounts[0]

'ldf_incurred_claims_usd'

In [47]:
test_dev.head(2)

line_of_business,development_lag,amount_type,ldf,cdf
str,i64,str,f64,f64
"""Motor vehicle …",0,"""incurred_claim…",0.969737,0.914558
"""Motor vehicle …",1,"""incurred_claim…",0.979088,0.9431


In [46]:
amounts=["incurred_claims_usd","paid_claims_usd"]
df.groupby("line_of_business", "development_lag").agg()

line_of_business,development_lag,ldf_amount_type_incurred_claims_usd,ldf_amount_type_paid_claims_usd,cdf_amount_type_incurred_claims_usd,cdf_amount_type_paid_claims_usd
str,i64,f64,f64,f64,f64
"""Motor vehicle …",0,0.969737,1.83119,0.914558,2.643895
"""Motor vehicle …",1,0.979088,1.20972,0.9431,1.443813
"""Motor vehicle …",2,0.985854,1.094253,0.963243,1.19351
"""Motor vehicle …",3,0.990645,1.045912,0.977064,1.090707
"""Motor vehicle …",4,0.993621,1.021848,0.986291,1.042829
…,…,…,…,…,…
"""Medical expens…",4,0.929832,1.101576,0.806193,1.280143
"""Medical expens…",5,0.953941,1.067977,0.867031,1.162102
"""Medical expens…",6,0.942734,1.039102,0.908893,1.088134
"""Medical expens…",7,0.983053,1.028556,0.964104,1.047187


In [253]:
increments = np.diff(tri_np, axis=1)
weights = tri_np[:, :-1]
np.nanmean(increments / weights, axis=0) +1

array([0.96184102, 0.94636855, 0.9330923 , 0.94263576, 0.92983239,
       0.95394109, 0.9427339 , 0.98305326, 0.98072375])

In [249]:
def calculate_volume_weighted_cl_factors_np(matrix):
    # Calculate the differences (increments) between development periods
    increments = np.diff(matrix, axis=1)
    
    # Calculate weights as the amounts themselves, shifted right to align with increments
    weights = matrix[:, :-1]  # Using amounts before each increment as weights
    
    # Volume-weighted calculation (simple average in this context, as weights align with amounts)
    weighted_averages = np.nanmean(increments / weights, axis=0)
    
    return weighted_averages

volume_weighted_cl_factors = calculate_volume_weighted_cl_factors_np(tri_pl.select(pl.exclude(index + origin)).to_numpy())
print("Volume-Weighted Chain-Ladder Factors:", volume_weighted_cl_factors+1)


Volume-Weighted Chain-Ladder Factors: [0.96184102 0.94636855 0.9330923  0.94263576 0.92983239 0.95394109
 0.9427339  0.98305326 0.98072375]


In [247]:

simple = cl_vol.select(pl.col(index), pl.col(development),pl.col("cl_factor").list.mean()).sort(index + development)
simple
#cl_vol.select(np.average(pl.col("cl_factor"), weights=pl.col("incurred_claims_usd")))

line_of_business,development_lag,cl_factor
str,i64,f64
"""General liabil…",0,1.0
"""General liabil…",1,0.985213
"""General liabil…",2,1.00355
"""General liabil…",3,0.967666
"""General liabil…",4,0.961066
…,…,…
"""Workers' compe…",5,0.996526
"""Workers' compe…",6,0.997014
"""Workers' compe…",7,1.000317
"""Workers' compe…",8,1.000575


In [18]:
tri_test= cl.Triangle(df, origin="accident_date", development="development_date", columns = "incurred_claims_usd", index="line_of_business", grain="OYDY", cumulative=True)

  origin_date = origin_date.dt.to_period(self.origin_grain).dt.to_timestamp(
  development_date = development_date.dt.to_period(
  o = pd.period_range(
  d = pd.period_range(


In [24]:
cl.Development(average="volume").fit(tri_test).ldf_.loc["Medical expense insurance"]

  return pd.DatetimeIndex(self.odims, name="origin").to_period(freq=freq)


Unnamed: 0,11-23,12-24,23-35,24-36,35-47,36-48,48-60,59-71,60-72,71-83,72-84,83-95,84-96,96-108,107-119,108-120
(All),,,,,,0.9313,0.9383,,,,,,0.936,0.9652,,0.9807


In [25]:
tri_test.loc["Medical expense insurance"]

  return pd.DatetimeIndex(self.odims, name="origin").to_period(freq=freq)
  return pd.DatetimeIndex(self.odims, name="origin").to_period(freq=freq)


Unnamed: 0,11,12,23,24,35,36,48,59,60,71,72,83,84,96,107,108,120
1987,,380154.0,,345134.0,,323223.0,308583.0,,297226.0,,273187.0,,260387.0,240810.0,,241022.0,236376.0
1988,363266.0,,364326.0,,352822.0,,328256.0,297773.0,,279108.0,,265990.0,,254703.0,245846.0,,
1989,421814.0,,415173.0,,,390700.0,351752.0,327502.0,,309818.0,,,296474.0,280411.0,,,
1990,459573.0,,,440929.0,,412603.0,391085.0,372147.0,,,346853.0,,330443.0,,,,
1991,,519397.0,,493296.0,,444814.0,417376.0,,395223.0,,361505.0,,,,,,
1992,510705.0,,474444.0,,455989.0,,428346.0,409466.0,,,,,,,,,
1993,536609.0,,518333.0,,,507586.0,467341.0,,,,,,,,,,
1994,562089.0,,,554926.0,,525577.0,,,,,,,,,,,
1995,,560248.0,,543519.0,,,,,,,,,,,,,
1996,536705.0,,,,,,,,,,,,,,,,


In [36]:
import pandas as pd
from formulaic import Formula as fm
import glum

# Sample dataset creation (you'd load your actual data)
insurance_data = pd.DataFrame({
    'claim_amount': [1000, 5000, 3500, 800, 1200, ...],
    'development_year': [1, 1, 2, 1, 3, ...], 
    'origin_year': [2022, 2021, 2022, 2021, 2020, ...] 
})

# Formula Specification
model_formula = fm('claim_amount ~ development_year + origin_year + development_year:origin_year')

# GLM model fitting
model = glum.GeneralizedLinearRegressor(family="twedie", link='log')
model = model.fit(data=insurance_data, formula=model_formula)

# Predictions and further analysis
predictions = model.predict(insurance_data) 
print(model.summary())  # View model summary




TypeError: GeneralizedLinearRegressor.fit() got an unexpected keyword argument 'data'