In [18]:
library(tidyverse)
data <- read_csv("./data/ologit.csv")
head(data)

# Gender Age Income Stage
# 1 1      60  3      3    
# 2 0      53  3      2    
# 3 1      66  1      1    
# 4 0      77  3      3    
# 5 0      63  2      2    
# 6 0      72  3      1

[1mRows: [22m[34m192[39m [1mColumns: [22m[34m4[39m
[36m--[39m [1mColumn specification[22m [36m--------------------------------------------------------[39m
[1mDelimiter:[22m ","
[32mdbl[39m (4): Gender, Age, Income, Stage

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Gender,Age,Income,Stage
<dbl>,<dbl>,<dbl>,<dbl>
1,60,3,3
0,53,3,2
1,66,1,1
0,77,3,3
0,63,2,2
0,72,3,1


In [19]:
data %>% glimpse()

Rows: 192
Columns: 4
$ Gender [3m[90m<dbl>[39m[23m 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, ~
$ Age    [3m[90m<dbl>[39m[23m 60, 53, 66, 77, 63, 72, 56, 65, 68, 65, 53, 53, 54, 54, 54, 56,~
$ Income [3m[90m<dbl>[39m[23m 3, 3, 1, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 3, 1, 2, 1, 2, 2, 1, 1, ~
$ Stage  [3m[90m<dbl>[39m[23m 3, 2, 1, 3, 2, 1, 0, 2, 2, 2, 0, 1, 0, 0, 0, 2, 0, 1, 1, 1, 1, ~


In [20]:
library(skimr)
data %>% mutate(
  Gender = factor(Gender),
  Stage = factor(Stage),
  Income = factor(Income)
) %>%
  skim() %>%
  print()

-- Data Summary ------------------------
                           Values    
Name                       Piped data
Number of rows             192       
Number of columns          4         
_______________________              
Column type frequency:               
  factor                   3         
  numeric                  1         
________________________             
Group variables            None      

-- Variable type: factor -------------------------------------------------------
  skim_variable n_missing complete_rate ordered n_unique
[90m1[39m Gender                0             1 FALSE          2
[90m2[39m Income                0             1 FALSE          3
[90m3[39m Stage                 0             1 FALSE          4
  top_counts                
[90m1[39m 0: 106, 1: 86             
[90m2[39m 2: 76, 3: 62, 1: 54       
[90m3[39m 2: 89, 1: 46, 3: 34, 0: 23

-- Variable type: numeric ------------------------------------------------------
  skim_vari

In [None]:
data_pre <- data %>%
  mutate(
    Stage = factor(
      Stage,
      levels = c(0, 1, 2, 3),
      labels = c("I-II", "III", "IV", "V"),
      ordered = TRUE
    ),
    Gender = factor(
      Gender,
      levels = c(0, 1),
      labels = c("Female", "Male")
    ),
    Income = factor(
      Income,
      levels = c(1, 2, 3),
      labels = c("Low", "Medium", "High")
    )
  )
skim(data_pre) %>% print()

-- Data Summary ------------------------
                           Values  
Name                       data_pre
Number of rows             192     
Number of columns          4       
_______________________            
Column type frequency:             
  factor                   3       
  numeric                  1       
________________________           
Group variables            None    

-- Variable type: factor -------------------------------------------------------
  skim_variable n_missing complete_rate ordered n_unique
[90m1[39m Gender                0             1 FALSE          2
[90m2[39m Income                0             1 FALSE          3
[90m3[39m Stage                 0             1 TRUE           4
  top_counts                     
[90m1[39m Fem: 106, Mal: 86              
[90m2[39m Med: 76, Hig: 62, Low: 54      
[90m3[39m IV: 89, III: 46, V: 34, I-I: 23

-- Variable type: numeric ------------------------------------------------------
  skim_vari

In [None]:
# 使用brant检验
library(brant)
library(MASS)
brant(polr(Stage ~ Gender + Age + Income, data = data_pre))

# -------------------------------------------- 
# Test for	X2	df	probability 
# -------------------------------------------- 
# Omnibus		3.67	8	0.89
# GenderMale	1.42	2	0.49
# Age		1.78	2	0.41
# IncomeMedium	0.06	2	0.97
# IncomeHigh	0.26	2	0.88
# -------------------------------------------- 

# H0: Parallel Regression Assumption holds

-------------------------------------------- 
Test for	X2	df	probability 
-------------------------------------------- 
Omnibus		3.67	8	0.89
GenderMale	1.42	2	0.49
Age		1.78	2	0.41
IncomeMedium	0.06	2	0.97
IncomeHigh	0.26	2	0.88
-------------------------------------------- 

H0: Parallel Regression Assumption holds


In [None]:
# 使用LR检验
library(lmtest)
library(VGAM)

# 拟合符合平行性假设的模型
om1 <- vglm(
  Stage ~ Income + Age + Gender,
  data = data,
  family = cumulative(parallel = TRUE)
)

# 拟合不符合平行性假设的模型
om2 <- vglm(
  Stage ~ Income + Age + Gender,
  data = data,
  family = cumulative(parallel = FALSE)
)

# 计算两种模型拟合结果是否一致
lrtest(om2, om1)

# Likelihood ratio test

# Model 1: Stage ~ Income + Age + Gender
# Model 2: Stage ~ Income + Age + Gender
#   #Df  LogLik Df Chisq Pr(>Chisq)
# 1 564 -226.00                    
# 2 570 -227.45  6 2.891     0.8224

Likelihood ratio test

Model 1: Stage ~ Income + Age + Gender
Model 2: Stage ~ Income + Age + Gender
  #Df  LogLik Df Chisq Pr(>Chisq)
1 564 -226.00                    
2 570 -227.45  6 2.891     0.8224

In [35]:
fit <- polr(Stage ~ Gender + Age + Income, data = data_pre)
summary(fit)


Re-fitting to get Hessian




Call:
polr(formula = Stage ~ Gender + Age + Income, data = data_pre)

Coefficients:
               Value Std. Error t value
GenderMale   -0.2297    0.27540 -0.8342
Age           0.1112    0.02743  4.0533
IncomeMedium  1.1909    0.34341  3.4679
IncomeHigh    0.8738    0.35997  2.4274

Intercepts:
         Value   Std. Error t value
I-II|III  5.7885  1.7946     3.2256
III|IV    7.4302  1.8266     4.0678
IV|V      9.8215  1.8840     5.2130

Residual Deviance: 447.6524 
AIC: 461.6524 

In [None]:
# 以单因素分析为基础
library(broom)
df_select <- data.frame(
  col_name = character(),
  p_value = numeric(),
  keep = logical(),
  stringsAsFactors = FALSE
)
p_limit <- 0.05
for (i in 1:(ncol(data_pre) - 1)) {
  col_name <- names(data_pre)[i]
  print(col_name)

  # 使用polr函数进行有序逻辑回归
  model <- polr(Stage ~ data_pre[[col_name]], data = data_pre)

  # 打印模型摘要
  tidy_model <- tidy(model)
  print(tidy_model)

  # polr输出没有p.value，需用z值计算p值
  # 只选第一个自变量（第二行），计算p值
  z_val <- tidy_model$statistic[1]
  p_value <- 2 * (1 - pnorm(abs(z_val)))

  keep <- p_value < p_limit

  df_select <- rbind(df_select, data.frame(
    col_name = col_name,
    p_value = p_value,
    keep = keep,
    stringsAsFactors = FALSE
  ))
}

print(df_select)

  # col_name      p_value  keep
# 1   Gender 1.210195e-01 FALSE
# 2      Age 3.354322e-06  TRUE
# 3   Income 8.305579e-05  TRUE

[1] "Gender"



Re-fitting to get Hessian




[90m# A tibble: 4 x 5[39m
  term                     estimate std.error statistic coef.type  
  [3m[90m<chr>[39m[23m                       [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m      
[90m1[39m data_pre[[col_name]]Male   -[31m0[39m[31m.[39m[31m420[39m     0.271     -[31m1[39m[31m.[39m[31m55[39m coefficient
[90m2[39m I-II|III                   -[31m2[39m[31m.[39m[31m20[39m      0.260     -[31m8[39m[31m.[39m[31m44[39m scale      
[90m3[39m III|IV                     -[31m0[39m[31m.[39m[31m763[39m     0.194     -[31m3[39m[31m.[39m[31m93[39m scale      
[90m4[39m IV|V                        1.37      0.216      6.33 scale      
[1] "Age"



Re-fitting to get Hessian




[90m# A tibble: 4 x 5[39m
  term                 estimate std.error statistic coef.type  
  [3m[90m<chr>[39m[23m                   [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m      
[90m1[39m data_pre[[col_name]]    0.125    0.026[4m9[24m      4.65 coefficient
[90m2[39m I-II|III                6.18     1.75        3.52 scale      
[90m3[39m III|IV                  7.73     1.79        4.33 scale      
[90m4[39m IV|V                   10.0      1.85        5.43 scale      
[1] "Income"



Re-fitting to get Hessian




[90m# A tibble: 5 x 5[39m
  term                       estimate std.error statistic coef.type  
  [3m[90m<chr>[39m[23m                         [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m      
[90m1[39m data_pre[[col_name]]Medium    1.34      0.341      3.94 coefficient
[90m2[39m data_pre[[col_name]]High      1.13      0.354      3.20 coefficient
[90m3[39m I-II|III                     -[31m1[39m[31m.[39m[31m23[39m      0.284     -[31m4[39m[31m.[39m[31m32[39m scale      
[90m4[39m III|IV                        0.284     0.258      1.10 scale      
[90m5[39m IV|V                          2.53      0.317      7.99 scale      
  col_name      p_value  keep
1   Gender 1.210195e-01 FALSE
2      Age 3.354322e-06  TRUE
3   Income 8.305579e-05  TRUE


In [None]:
fit <- polr(Stage ~ Age + Income, data = data_pre)
library(car)
vif(fit)

# GVIF     Df GVIF^(1/(2*Df))
# Age    1.023471 1  1.011667       
# Income 1.023471 2  1.005817


Re-fitting to get Hessian




Unnamed: 0,GVIF,Df,GVIF^(1/(2*Df))
Age,1.023471,1,1.011667
Income,1.023471,2,1.005817


In [41]:
# 计算 Cook 距离
cook <- cooks.distance(fit)

# 显示 Cook 距离 > 0.5 的个案编号和 Cook 值
outliers <- which(cook > 0.5)
cook_outliers <- cook[outliers]
print(data.frame(case = outliers, cook_distance = cook_outliers))

# 显示最大 Cook 距离
cat("Max Cook Distance:", max(cook), "\n")

ERROR: Error in UseMethod("cooks.distance"): no applicable method for 'cooks.distance' applied to an object of class "polr"


In [None]:
# Box-Tidwell 检验
fit_age <- polr(Stage ~ Age + I(Age * log(Age)) + Income, data = data_pre)
# 手动计算p值
result_age <- tidy(fit_age)
result_age <- result_age %>%
  mutate(p_value = 2 * (1 - pnorm(abs(statistic))))
print(result_age)


Re-fitting to get Hessian




[90m# A tibble: 7 x 6[39m
  term              estimate std.error statistic coef.type    p_value
  [3m[90m<chr>[39m[23m                [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m          [3m[90m<dbl>[39m[23m
[90m1[39m Age                  4.48    0.114       39.4  coefficient 0       
[90m2[39m I(Age * log(Age))   -[31m0[39m[31m.[39m[31m840[39m   0.027[4m2[24m     -[31m30[39m[31m.[39m[31m9[39m  coefficient 0       
[90m3[39m IncomeMedium         1.17    0.344        3.41 coefficient 0.000[4m6[24m[4m5[24m[4m5[24m
[90m4[39m IncomeHigh           0.901   0.362        2.49 coefficient 0.012[4m8[24m  
[90m5[39m I-II|III            61.3     0.009[4m5[24m[4m9[24m   [4m6[24m392.   scale       0       
[90m6[39m III|IV              63.0     0.233      270.   scale       0       
[90m7[39m IV|V                65.4     0.313      209.   scale       0       


In [None]:
fit <- polr(Stage ~ Age + Income, data = data_pre)
summary(fit)

# Call:
# polr(formula = Stage ~ Age + Income, data = data_pre)

# Coefficients:
#               Value Std. Error t value
# Age          0.1134     0.0273   4.155
# IncomeMedium 1.2004     0.3432   3.498
# IncomeHigh   0.8894     0.3597   2.473

# Intercepts:
#          Value   Std. Error t value
# I-II|III  6.0594  1.7650     3.4330
# III|IV    7.6918  1.7995     4.2745
# IV|V     10.0810  1.8591     5.4225

# Residual Deviance: 448.3491 
# AIC: 460.3491


Re-fitting to get Hessian




Call:
polr(formula = Stage ~ Age + Income, data = data_pre)

Coefficients:
              Value Std. Error t value
Age          0.1134     0.0273   4.155
IncomeMedium 1.2004     0.3432   3.498
IncomeHigh   0.8894     0.3597   2.473

Intercepts:
         Value   Std. Error t value
I-II|III  6.0594  1.7650     3.4330
III|IV    7.6918  1.7995     4.2745
IV|V     10.0810  1.8591     5.4225

Residual Deviance: 448.3491 
AIC: 460.3491 

In [None]:
confint(fit)

# 2.5 %     97.5 %   
# Age          0.0605485 0.1678252
# IncomeMedium 0.5339424 1.8814674
# IncomeHigh   0.1889152 1.6013124

Waiting for profiling to be done...


Re-fitting to get Hessian




Unnamed: 0,2.5 %,97.5 %
Age,0.0605485,0.1678252
IncomeMedium,0.5339424,1.8814674
IncomeHigh,0.1889152,1.6013124


In [None]:
drop1(fit, test = "Chisq")

# Df AIC      LRT      Pr(>Chi)    
# <none> NA 460.3491       NA           NA
# Age     1 476.2535 17.90441 2.322824e-05
# Income  2 469.2724 12.92330 1.562213e-03

Unnamed: 0_level_0,Df,AIC,LRT,Pr(>Chi)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
<none>,,460.3491,,
Age,1.0,476.2535,17.90441,2.322824e-05
Income,2.0,469.2724,12.9233,0.001562213


In [None]:
# 计算p值
library(broom)
result <- tidy(fit)
result <- result %>%
  mutate(p_value = 2 * (1 - pnorm(abs(statistic))))
print(result)


Re-fitting to get Hessian




[90m# A tibble: 6 x 6[39m
  term         estimate std.error statistic coef.type        p_value
  [3m[90m<chr>[39m[23m           [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m              [3m[90m<dbl>[39m[23m
[90m1[39m Age             0.113    0.027[4m3[24m      4.16 coefficient 0.000[4m0[24m[4m3[24m[4m2[24m5   
[90m2[39m IncomeMedium    1.20     0.343       3.50 coefficient 0.000[4m4[24m[4m6[24m[4m9[24m    
[90m3[39m IncomeHigh      0.889    0.360       2.47 coefficient 0.013[4m4[24m      
[90m4[39m I-II|III        6.06     1.77        3.43 scale       0.000[4m5[24m[4m9[24m[4m7[24m    
[90m5[39m III|IV          7.69     1.80        4.27 scale       0.000[4m0[24m[4m1[24m[4m9[24m2   
[90m6[39m IV|V           10.1      1.86        5.42 scale       0.000[4m0[24m[4m0[24m[4m0[24m058[4m8[24m


In [None]:
print(exp(coef(fit)))
print(exp(confint(fit)))

# Age IncomeMedium   IncomeHigh 
#     1.120123     3.321463     2.433658 
# Waiting for profiling to be done...


# Re-fitting to get Hessian


#                 2.5 %   97.5 %
# Age          1.062419 1.182730
# IncomeMedium 1.705643 6.563129
# IncomeHigh   1.207939 4.959537

         Age IncomeMedium   IncomeHigh 
    1.120123     3.321463     2.433658 


Waiting for profiling to be done...


Re-fitting to get Hessian




                2.5 %   97.5 %
Age          1.062419 1.182730
IncomeMedium 1.705643 6.563129
IncomeHigh   1.207939 4.959537


In [None]:
# 手动构造似然比检验
# 计算对数似然值
loglik_full <- logLik(fit)

# 计算空模型的对数似然值
fit_null <- polr(Stage ~ 1, data = data_pre)
loglik_null <- logLik(fit_null)

# 计算似然比检验的统计量
lr_statistic <- -2 * (loglik_null - loglik_full)

# 计算p值
p_value <- pchisq(lr_statistic, df = ncol(data) - 1, lower.tail = FALSE)

cat("Likelihood Ratio Statistic:", lr_statistic, "\n")
cat("p-value:", p_value, "\n")

if (p_value < 0.05) {
  cat("The model is statistically significant.\n")
} else {
  cat("The model is not statistically significant.\n")
}

# Likelihood Ratio Statistic: 35.29161 
# p-value: 1.057113e-07 
# The model is statistically significant.

Likelihood Ratio Statistic: 35.29161 
p-value: 1.057113e-07 
The model is statistically significant.


In [None]:
fit <- polr(Stage ~ Age + Income, data = data_pre, method = "probit")
brant(fit)

# -------------------------------------------- 
# Test for	X2	df	probability 
# -------------------------------------------- 
# Omnibus		2.24	6	0.9
# Age		1.94	2	0.38
# IncomeMedium	0.06	2	0.97
# IncomeHigh	0.19	2	0.91
# -------------------------------------------- 

# H0: Parallel Regression Assumption holds

-------------------------------------------- 
Test for	X2	df	probability 
-------------------------------------------- 
Omnibus		2.24	6	0.9
Age		1.94	2	0.38
IncomeMedium	0.06	2	0.97
IncomeHigh	0.19	2	0.91
-------------------------------------------- 

H0: Parallel Regression Assumption holds


In [None]:
# 测试LR检验
library(lmtest)
library(VGAM)
# 拟合符合平行性假设的模型
om1 <- vglm(
  Stage ~ Income + Age + Gender,
  data = data,
  family = cumulative(parallel = TRUE),
  link = "probitlink"
)
# 拟合不符合平行性假设的模型
om2 <- vglm(
  Stage ~ Income + Age + Gender,
  data = data,
  family = cumulative(parallel = FALSE),
  link = "probitlink"
)
# 计算两种模型拟合结果是否一致
lrtest(om2, om1)

# Likelihood ratio test

# Model 1: Stage ~ Income + Age + Gender
# Model 2: Stage ~ Income + Age + Gender
#   #Df  LogLik Df Chisq Pr(>Chisq)
# 1 564 -226.00                    
# 2 570 -227.45  6 2.891     0.8224

Likelihood ratio test

Model 1: Stage ~ Income + Age + Gender
Model 2: Stage ~ Income + Age + Gender
  #Df  LogLik Df Chisq Pr(>Chisq)
1 564 -226.00                    
2 570 -227.45  6 2.891     0.8224

In [None]:
# 以单因素分析为基础
library(broom)
df_select <- data.frame(
  col_name = character(),
  p_value = numeric(),
  keep = logical(),
  stringsAsFactors = FALSE
)
p_limit <- 0.05
for (i in 1:(ncol(data_pre) - 1)) {
  col_name <- names(data_pre)[i]
  print(col_name)

  # 使用polr函数进行有序逻辑回归
  model <- polr(
    Stage ~ data_pre[[col_name]],
    data = data_pre,
    method = "probit"
  )

  # 打印模型摘要
  tidy_model <- tidy(model)
  print(tidy_model)

  # polr输出没有p.value，需用z值计算p值
  # 只选第一个自变量（第二行），计算p值
  z_val <- tidy_model$statistic[1]
  p_value <- 2 * (1 - pnorm(abs(z_val)))

  keep <- p_value < p_limit

  df_select <- rbind(df_select, data.frame(
    col_name = col_name,
    p_value = p_value,
    keep = keep,
    stringsAsFactors = FALSE
  ))
}

print(df_select)

#   col_name      p_value  keep
# 1   Gender 1.304453e-01 FALSE
# 2      Age 8.065292e-06  TRUE
# 3   Income 8.362519e-05  TRUE

[1] "Gender"



Re-fitting to get Hessian




[90m# A tibble: 4 x 5[39m
  term                     estimate std.error statistic coef.type  
  [3m[90m<chr>[39m[23m                       [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m      
[90m1[39m data_pre[[col_name]]Male   -[31m0[39m[31m.[39m[31m236[39m     0.156     -[31m1[39m[31m.[39m[31m51[39m coefficient
[90m2[39m I-II|III                   -[31m1[39m[31m.[39m[31m29[39m      0.141     -[31m9[39m[31m.[39m[31m17[39m scale      
[90m3[39m III|IV                     -[31m0[39m[31m.[39m[31m466[39m     0.117     -[31m4[39m[31m.[39m[31m00[39m scale      
[90m4[39m IV|V                        0.827     0.125      6.63 scale      
[1] "Age"



Re-fitting to get Hessian




[90m# A tibble: 4 x 5[39m
  term                 estimate std.error statistic coef.type  
  [3m[90m<chr>[39m[23m                   [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m      
[90m1[39m data_pre[[col_name]]   0.067[4m7[24m    0.015[4m2[24m      4.46 coefficient
[90m2[39m I-II|III               3.26      1.00        3.26 scale      
[90m3[39m III|IV                 4.14      1.01        4.09 scale      
[90m4[39m IV|V                   5.50      1.03        5.33 scale      
[1] "Income"



Re-fitting to get Hessian




[90m# A tibble: 5 x 5[39m
  term                       estimate std.error statistic coef.type  
  [3m[90m<chr>[39m[23m                         [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m      
[90m1[39m data_pre[[col_name]]Medium    0.770     0.196     3.93  coefficient
[90m2[39m data_pre[[col_name]]High      0.647     0.203     3.19  coefficient
[90m3[39m I-II|III                     -[31m0[39m[31m.[39m[31m728[39m     0.163    -[31m4[39m[31m.[39m[31m47[39m  scale      
[90m4[39m III|IV                        0.138     0.155     0.892 scale      
[90m5[39m IV|V                          1.49      0.177     8.40  scale      
  col_name      p_value  keep
1   Gender 1.304453e-01 FALSE
2      Age 8.065292e-06  TRUE
3   Income 8.362519e-05  TRUE


In [None]:
fit <- polr(Stage ~ Age + Income, data = data_pre, method = "probit")
library(car)
vif(fit)

# GVIF     Df GVIF^(1/(2*Df))
# Age    1.010366 1  1.005169       
# Income 1.010366 2  1.002581


Re-fitting to get Hessian




Unnamed: 0,GVIF,Df,GVIF^(1/(2*Df))
Age,1.010366,1,1.005169
Income,1.010366,2,1.002581


In [None]:
# Box-Tidwell 检验
fit_age <- polr(
  Stage ~ Age + I(Age * log(Age)) + Income,
  data = data_pre,
  method = "probit"
)
# 手动计算p值
result_age <- tidy(fit_age)
result_age <- result_age %>%
  mutate(p_value = 2 * (1 - pnorm(abs(statistic))))
print(result_age)


Re-fitting to get Hessian




[90m# A tibble: 7 x 6[39m
  term              estimate std.error statistic coef.type    p_value
  [3m[90m<chr>[39m[23m                [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m          [3m[90m<dbl>[39m[23m
[90m1[39m Age                  2.72    0.064[4m1[24m      42.4  coefficient 0       
[90m2[39m I(Age * log(Age))   -[31m0[39m[31m.[39m[31m512[39m   0.015[4m3[24m     -[31m33[39m[31m.[39m[31m5[39m  coefficient 0       
[90m3[39m IncomeMedium         0.677   0.198        3.41 coefficient 0.000[4m6[24m[4m4[24m[4m0[24m
[90m4[39m IncomeHigh           0.556   0.206        2.69 coefficient 0.007[4m0[24m[4m5[24m 
[90m5[39m I-II|III            37.0     0.005[4m2[24m[4m7[24m   [4m7[24m026.   scale       0       
[90m6[39m III|IV              38.0     0.125      304.   scale       0       
[90m7[39m IV|V                39.4     0.164      240.   scale       0       


In [None]:
fit <- polr(
  Stage ~ Age + Income,
  data = data_pre,
  method = "probit"
)
summary(fit)

# Call:
# polr(formula = Stage ~ Age + Income, data = data_pre, method = "probit")

# Coefficients:
#               Value Std. Error t value
# Age          0.0623    0.01538   4.049
# IncomeMedium 0.7009    0.19784   3.543
# IncomeHigh   0.5549    0.20574   2.697

# Intercepts:
#          Value  Std. Error t value
# I-II|III 3.3012 1.0093     3.2706 
# III|IV   4.2263 1.0227     4.1324 
# IV|V     5.6403 1.0436     5.4049 

# Residual Deviance: 449.9679 
# AIC: 461.9679


Re-fitting to get Hessian




Call:
polr(formula = Stage ~ Age + Income, data = data_pre, method = "probit")

Coefficients:
              Value Std. Error t value
Age          0.0623    0.01538   4.049
IncomeMedium 0.7009    0.19784   3.543
IncomeHigh   0.5549    0.20574   2.697

Intercepts:
         Value  Std. Error t value
I-II|III 3.3012 1.0093     3.2706 
III|IV   4.2263 1.0227     4.1324 
IV|V     5.6403 1.0436     5.4049 

Residual Deviance: 449.9679 
AIC: 461.9679 

In [None]:
confint(fit)

# 2.5 %      97.5 %    
# Age          0.03228059 0.09260261
# IncomeMedium 0.31406588 1.08969115
# IncomeHigh   0.15238435 0.95896150

Waiting for profiling to be done...


Re-fitting to get Hessian




Unnamed: 0,2.5 %,97.5 %
Age,0.03228059,0.09260261
IncomeMedium,0.31406588,1.08969115
IncomeHigh,0.15238435,0.9589615


In [None]:
drop1(fit, test = "Chisq")

# Df AIC      LRT      Pr(>Chi)    
# <none> NA 461.9679       NA           NA
# Age     1 476.6574 16.68958 4.402205e-05
# Income  2 471.2978 13.32992 1.274809e-03

Unnamed: 0_level_0,Df,AIC,LRT,Pr(>Chi)
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>
<none>,,461.9679,,
Age,1.0,476.6574,16.68958,4.402205e-05
Income,2.0,471.2978,13.32992,0.001274809


In [None]:
# 计算p值
library(broom)
result <- tidy(fit)
result <- result %>%
  mutate(p_value = 2 * (1 - pnorm(abs(statistic))))
print(result)


Re-fitting to get Hessian




[90m# A tibble: 6 x 6[39m
  term         estimate std.error statistic coef.type        p_value
  [3m[90m<chr>[39m[23m           [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m     [3m[90m<dbl>[39m[23m [3m[90m<chr>[39m[23m              [3m[90m<dbl>[39m[23m
[90m1[39m Age            0.062[4m3[24m    0.015[4m4[24m      4.05 coefficient 0.000[4m0[24m[4m5[24m[4m1[24m4   
[90m2[39m IncomeMedium   0.701     0.198       3.54 coefficient 0.000[4m3[24m[4m9[24m[4m6[24m    
[90m3[39m IncomeHigh     0.555     0.206       2.70 coefficient 0.006[4m9[24m[4m9[24m     
[90m4[39m I-II|III       3.30      1.01        3.27 scale       0.001[4m0[24m[4m7[24m     
[90m5[39m III|IV         4.23      1.02        4.13 scale       0.000[4m0[24m[4m3[24m[4m5[24m9   
[90m6[39m IV|V           5.64      1.04        5.40 scale       0.000[4m0[24m[4m0[24m[4m0[24m064[4m9[24m


In [None]:
# 手动构造似然比检验
# 计算对数似然值
loglik_full <- logLik(fit)

# 计算空模型的对数似然值
fit_null <- polr(Stage ~ 1, data = data_pre, method = "probit")
loglik_null <- logLik(fit_null)

# 计算似然比检验的统计量
lr_statistic <- -2 * (loglik_null - loglik_full)

# 计算p值
p_value <- pchisq(lr_statistic, df = ncol(data) - 1, lower.tail = FALSE)

cat("Likelihood Ratio Statistic:", lr_statistic, "\n")
cat("p-value:", p_value, "\n")

if (p_value < 0.05) {
  cat("The model is statistically significant.\n")
} else {
  cat("The model is not statistically significant.\n")
}

# Likelihood Ratio Statistic: 33.67286 
# p-value: 2.322623e-07 
# The model is statistically significant.

Likelihood Ratio Statistic: 33.67286 
p-value: 2.322623e-07 
The model is statistically significant.
