In [1]:
import io
import numpy as np
import pandas as pd
from scipy import integrate, stats, optimize

Reference (Stack Overflow question this notebook works through): https://stackoverflow.com/questions/78831434/new-column-in-pandas-dataframe-using-least-squares-from-scipy-optimize

In [2]:
# Inline sample data, parsed as fixed-width text so the notebook needs no external file.
# One row per student per race; within each Race_ID the feature1 values sum to 1.
data = pd.read_fwf(io.StringIO("""Race_ID   Date           Student_ID      feature1  
1         1/1/2023       3               0.02167131     
1         1/1/2023       4               0.17349148     
1         1/1/2023       6               0.08438952     
1         1/1/2023       8               0.04143787     
1         1/1/2023       9               0.02589056
1         1/1/2023       1               0.03866752     
1         1/1/2023       10              0.0461553     
1         1/1/2023       45              0.09212758
1         1/1/2023       23              0.10879326      
1         1/1/2023       102             0.186921      
1         1/1/2023       75              0.02990676      
1         1/1/2023       27              0.02731904      
1         1/1/2023       15              0.06020158      
1         1/1/2023       29              0.06302721                         
3         17/4/2022      5               0.2     
3         17/4/2022      2               0.1     
3         17/4/2022      3               0.55     
3         17/4/2022      4               0.15   """))

In [3]:
def integrand(x, ti, *theta):
    """Evaluate ``pdf(x - ti) * prod(sf(x - t) for t in theta)`` for unit normals.

    ``pdf`` and ``sf`` are the standard normal density and survival function
    (``sf(z) = 1 - cdf(z)``).

    Parameters
    ----------
    x : float or array_like
        Point(s) at which to evaluate the integrand.
    ti : float
        Location of the density factor.
    *theta : float
        Locations of the survival-function factors. May be empty, in which
        case the product is 1 and only the density term remains.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Integrand value(s), with the same shape as ``x``.
    """
    # Trailing axis enumerates theta: shape (..., len(theta)) holds x - t for every t,
    # so a single vectorized scipy call replaces one call per theta entry.
    offsets = np.asarray(x, dtype=float)[..., np.newaxis] - np.asarray(theta, dtype=float)
    # sf(z) avoids the cancellation in ``1 - cdf(z)``, which collapses to exactly
    # 0.0 once z is larger than about 8 and would erase the upper tail.
    survival = stats.norm.sf(offsets)
    # The product over an empty trailing axis is 1.0, matching the no-theta case.
    return np.prod(survival, axis=-1) * stats.norm.pdf(x - ti)

def integral(ti, *theta):
    """Integrate ``integrand(x, ti, *theta)`` over the whole real line in ``x``.

    With the integrand being a unit-normal density centred at ``ti`` times the
    upper-tail probabilities of unit normals centred at each ``theta`` entry,
    the value is the probability that a N(ti, 1) draw is smaller than
    independent N(t, 1) draws for every ``t`` in ``theta``.

    Returns only the integral estimate; the error estimate reported by
    ``integrate.quad`` is discarded.
    """
    extra_args = (ti,) + theta
    quad_result = integrate.quad(integrand, -np.inf, np.inf, args=extra_args)
    return quad_result[0]

In [4]:
def change_order(parameters, i):
    """Return a new list with ``parameters[i]`` moved to the front.

    The remaining elements keep their relative order and the input list is
    left unmodified.
    """
    selected = [parameters[i]]
    others = parameters[:i] + parameters[i + 1:]
    return selected + others

In [5]:
def system(parameters, t):
    """Residuals of the system ``integral(theta_i, *other_thetas) - t_i``.

    Parameters
    ----------
    parameters : array_like of float
        Current parameter vector, one entry per element of ``t``.
    t : sequence of float
        Target value for each parameter's integral.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(t)`` holding ``integral(...) - t[i]`` for each
        index ``i`` of ``parameters``.
    """
    theta = np.asarray(parameters, dtype=float).tolist()
    residuals = np.full(len(t), np.nan)
    for i in range(len(theta)):
        # Build each ordering from the untouched list so iterations stay
        # independent: entry i goes first, all other entries follow.
        ordered = change_order(theta, i)
        residuals[i] = integral(*ordered) - t[i]
    return residuals

In [6]:
def solver(x):
    """Fit one parameter per row of ``x`` so that ``system`` residuals vanish.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of a single group; only the ``"feature1"`` column is read.

    Returns
    -------
    numpy.ndarray
        The ``least_squares`` solution vector, one value per row of ``x``.
        The search starts at 1 for every parameter and is constrained to
        ``[-2, 2]``.
    """
    targets = x["feature1"].values
    start = np.ones_like(targets)
    lower, upper = -2 * start, 2 * start
    fit = optimize.least_squares(system, start, bounds=[lower, upper], args=(targets,))
    return fit.x

In [7]:
# Solve each race independently and write the results back by index.
# ``transform`` returns a float Series aligned with ``data``'s own index, so every
# student receives their own value even when rows are not sorted by Race_ID
# (stacking per-group arrays and assigning ``.values`` is positional and follows
# sorted group order instead).
# ``to_frame("feature1")`` names the column explicitly because the group Series'
# ``name`` is replaced by the group key inside ``transform``.
data["new_column"] = (
    data.groupby("Race_ID")["feature1"]
    .transform(lambda feature: solver(feature.to_frame("feature1")))
)

In [9]:
# Show the frame, which now includes the solved "new_column".
print(data)

    Race_ID       Date  Student_ID  feature1 new_column
0         1   1/1/2023           3  0.021671   1.383615
1         1   1/1/2023           4  0.173491    0.25823
2         1   1/1/2023           6  0.084390   0.695116
3         1   1/1/2023           8  0.041438   1.073675
4         1   1/1/2023           9  0.025891   1.301445
5         1   1/1/2023           1  0.038668   1.108209
6         1   1/1/2023          10  0.046155   1.019114
7         1   1/1/2023          45  0.092128   0.645103
8         1   1/1/2023          23  0.108793   0.548053
9         1   1/1/2023         102  0.186921   0.209302
10        1   1/1/2023          75  0.029907    1.23329
11        1   1/1/2023          27  0.027319   1.276228
12        1   1/1/2023          15  0.060202   0.880537
13        1   1/1/2023          29  0.063027   0.855987
14        3  17/4/2022           5  0.200000   0.920987
15        3  17/4/2022           2  0.100000   1.376155
16        3  17/4/2022           3  0.550000   0