In [26]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,mean_absolute_error

def preprocess_and_build_logistic_regression(file_path, target_column, independent_columns):
    """Load a CSV, clean the selected columns, fit a Logit model and print its metrics.

    Steps performed:
      1. Read ``file_path`` with ``pd.read_csv``.
      2. Coerce the target column to numeric, drop rows where it is missing and
         min-max scale it to the [0, 1] interval.
      3. Coerce every predictor column to numeric and drop rows with a missing
         predictor value.
      4. Add an intercept column, split 80/20 with ``random_state=42``, fit
         ``sm.Logit`` on the training part and print its summary.
      5. Print the mean squared error and mean absolute error between the
         held-out (scaled) target and the predicted probabilities.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.
    target_column : str
        Name of the outcome column.
    independent_columns : iterable of str
        Names of the predictor columns.

    Returns
    -------
    None
        All results are printed. Failures are reported with a printed message
        instead of being raised to the caller.
    """
    try:
        data = pd.read_csv(file_path)

        # Materialize once so tuples/generators behave like the list callers usually pass.
        feature_columns = list(independent_columns)

        # Report missing columns by name instead of surfacing a bare KeyError repr.
        missing_columns = [
            col for col in [target_column, *feature_columns] if col not in data.columns
        ]
        if missing_columns:
            raise ValueError(f"数据中缺少列:{missing_columns}")

        data[target_column] = pd.to_numeric(data[target_column], errors='coerce')
        # .copy() so the assignments below act on an independent frame.
        data = data.dropna(subset=[target_column]).copy()
        if data.empty:
            raise ValueError(f"目标列{target_column}没有有效的数值")

        # The scaling range is taken over every row with a valid target,
        # i.e. before rows with invalid predictors are removed.
        min_value = data[target_column].min()
        max_value = data[target_column].max()
        value_range = max_value - min_value
        if value_range == 0:
            # A constant target would otherwise turn into NaN through 0/0.
            raise ValueError(f"目标列{target_column}的取值全部相同,无法归一化")
        data[target_column] = (data[target_column] - min_value) / value_range

        for col in feature_columns:
            data[col] = pd.to_numeric(data[col], errors='coerce')
        # One pass removes every row with an invalid predictor.
        data = data.dropna(subset=feature_columns)
        if data.empty:
            raise ValueError("清洗后没有可用于建模的数据")

        X = sm.add_constant(data[feature_columns])
        y = data[target_column]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # NOTE(review): the target is min-max scaled rather than binarized, so it
        # may be fractional here; confirm that fitting Logit on it is intended.
        logit_model = sm.Logit(y_train, X_train)
        result = logit_model.fit()
        print(result.summary())

        y_pred_proba = result.predict(X_test)

        mse = mean_squared_error(y_test, y_pred_proba)
        mae = mean_absolute_error(y_test, y_pred_proba)
        print(f"均方误差:{mse}")
        print(f"平均绝对误差:{mae}")

    except FileNotFoundError:
        print(f"文件未找到:{file_path}")
    except Exception as e:
        # Best-effort reporting: keep the notebook running and show the reason.
        print(f"发生错误:{e}")

# Inputs for the model run: CSV path, outcome column and predictor columns.
file_path = 'CGSS2021.csv'
target_column = 'A35'
independent_columns = [
    'A7a', 'A7b', 'D9', 'A5', 'A62', 'A10', 'A13',
    *(f'A28_{idx}' for idx in range(1, 7)),  # A28_1 ... A28_6
    'A31_1', 'A31_2', 'A66', 'A68a_1', 'A68a_2',
]

# Fit the model and print its summary and error metrics.
preprocess_and_build_logistic_regression(
    file_path=file_path,
    target_column=target_column,
    independent_columns=independent_columns,
)

Optimization terminated successfully.
         Current function value: 0.056224
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                    A35   No. Observations:                 1586
Model:                          Logit   Df Residuals:                     1567
Method:                           MLE   Df Model:                           18
Date:                Tue, 24 Dec 2024   Pseudo R-squ.:                 -0.4223
Time:                        23:24:39   Log-Likelihood:                -89.172
converged:                       True   LL-Null:                       -62.695
Covariance Type:            nonrobust   LLR p-value:                     1.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -4.1117      0.930     -4.423      0.000      -5.934      -2.290
A7a           -0.0051      0.