## Code to fit logit(P_c) with the curve b*log(1 + exp(-(t - a))) + c, using least squares

In [1]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.optimize import least_squares
import scipy.special as scipy

In [2]:
# Load the formatted SEER survival table that every later cell works on.
df = pd.read_csv(
    "/homes/fed1/Survival_Franny/SEER_CA_formatted_test_for_paper.csv"
)

In [None]:
# Time index used by the fit: `interval` shifted up by one.
df['t'] = df['interval'] + 1

# Restrict to the year == 0, sex == 3, "All Ages" rows.
# NOTE(review): the meaning of the year/sex codes is not visible here --
# confirm against the data dictionary.
keep = (df['year'] == 0) & (df['sex'] == 3) & (df['age_name'] == "All Ages")
# Take an explicit copy so the column assignments below (and in later cells)
# write to an independent frame instead of a slice of the loaded table,
# which would raise pandas' SettingWithCopyWarning.
df = df.loc[keep].copy()

# Start P_c as 1 - net survival for every row; rows with t > 1 are
# overwritten in the next cell.
df['P_c'] = 1 - df['net_survival']

In [None]:
# For every cause, replace P_c on the t > 1 rows with
#     1 - net_survival[k] / net_survival[k - 1]
# built from consecutive rows of that cause.
# NOTE(review): the pairing is positional, so this relies on each cause's
# rows being ordered by t with exactly one leading t == 1 row -- verify.
for cancer in df.cause_name.unique():
    is_cause = (df.cause_name == cancer)
    survival = df.net_survival[is_cause].values
    survival_ratio = survival[1:] / survival[:-1]
    df.loc[is_cause & (df.t > 1), 'P_c'] = 1 - survival_ratio

In [None]:
# Drop rows with any missing value, then keep only P_c strictly inside
# (0, 1): logit is -inf at 0, +inf at 1 and NaN outside [0, 1], and the
# least-squares fit below cannot work with non-finite targets.
df = df.dropna()
in_open_unit_interval = (df['P_c'] > 0) & (df['P_c'] < 1)
# Copy so that adding the new column does not write into a slice.
df = df[in_open_unit_interval].copy()
# `scipy` is this notebook's alias for scipy.special.
df['logit_P_c'] = scipy.logit(df['P_c'])

In [None]:
# define the functional form to fit
def function(x, t):
    """Evaluate the three-parameter curve b*log(1 + exp(-(t - a))) + c.

    Parameters
    ----------
    x : sequence of three floats
        The parameters ``(a, b, c)``: ``x[0]`` shifts ``t``, ``x[1]`` scales
        the log term and ``x[2]`` is the additive offset.
    t : scalar, numpy array or pandas Series
        Points at which to evaluate the curve.

    Returns
    -------
    Curve values, with the same shape as ``t``.
    """
    # logaddexp(0, z) == log(exp(0) + exp(z)) == log(1 + exp(z)), but it does
    # not overflow to inf when z is large (e.g. while the optimizer explores
    # a large shift x[0]).
    return x[1] * np.logaddexp(0, -(t - x[0])) + x[2]

def model(x, t, y):
    """Return the residuals (curve value minus observation) for least_squares.

    ``x`` holds the curve parameters, ``t`` the evaluation points and ``y``
    the observed values the curve is compared against.
    """
    predicted = function(x, t)
    return predicted - y

In [None]:
# Initial guess handed to the optimizer: a = b = c = 1.0
x_0 = np.ones(3)

In [None]:
# Fit the three-parameter curve to logit(P_c) separately for every cause,
# plot each fit on the probability scale, and collect the fitted (a, b, c)
# into `parameters` (one row per cause).
# TODO: also loop over age groups once age-specific fits are wanted.
fitted_rows = []
for cancer in df.cause_name.unique():
    data = df[df['cause_name'] == cancer]

    t = data['t']
    y = data['logit_P_c']
    # Plain least squares. Other loss functions to try: 'soft_l1', 'cauchy';
    # weighting the residuals is another option.
    res_lsq = least_squares(model, x_0, args=(t, y), loss='linear')

    # PLOT
    # expit undoes the logit, so both the data and the fitted curve are
    # drawn as P_c rather than logit(P_c).
    output = function(res_lsq.x, t)
    plt.plot(t, scipy.expit(y), 'o', markersize=4, label='data')
    plt.plot(t, scipy.expit(output), label='fitted model')
    plt.xlabel("t")
    plt.ylabel("P_c")
    plt.legend(loc='lower right')
    plt.title(cancer)
    plt.show()

    # SAVE
    row = pd.DataFrame(data={'cause_name': [cancer],
                             'a': [res_lsq.x[0]],
                             'b': [res_lsq.x[1]],
                             'c': [res_lsq.x[2]]})
    print(row)
    fitted_rows.append(row)

# DataFrame.append was removed in pandas 2.0 (and re-copied the frame on
# every iteration), so concatenate all rows once. pd.concat rejects an empty
# list, hence the explicit empty-frame fallback.
if fitted_rows:
    parameters = pd.concat(fitted_rows, ignore_index=True)
else:
    parameters = pd.DataFrame(columns=['cause_name', 'a', 'b', 'c'])

In [None]:
# Preview the first five fitted parameter rows.
parameters.head(n=5)

In [None]:
# Write the fitted parameters to disk, leaving out the row index.
parameters.to_csv(
    '/homes/fed1/Survival_Franny/three_parameter_log_form_values_SEER_6_3_2021.csv',
    index=False,
)

## To do:

Try age-group-specific fits (fit each age group separately instead of only "All Ages").