In [12]:
import os

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from xgboost import XGBRegressor
def best_model(X_train, y_train, model_path, random_state=None):
    """Grid-search four regressors and save the best estimator of each to disk.

    For every candidate (Lasso, Random Forest, XGBoost, SVR) a 5-fold
    ``GridSearchCV`` scored by negative mean squared error is fitted. The
    best parameters are printed and the refitted best estimator is pickled
    to ``<model_path>/<Model_Name>.pkl`` (spaces in the display name are
    replaced by underscores).

    Parameters
    ----------
    X_train, y_train
        Training features and target, passed unchanged to
        ``GridSearchCV.fit``.
    model_path : str or path-like
        Directory that receives the ``.pkl`` files. It is created if it does
        not exist; an empty value means the current working directory.
    random_state : int or None, optional
        When given, it is applied to every candidate estimator that exposes a
        ``random_state`` parameter so repeated runs are reproducible. The
        default ``None`` leaves the estimators' own defaults untouched.

    Returns
    -------
    dict
        Maps each model's display name to its fitted ``best_estimator_``.
    """
    # Candidate estimators paired with the hyper-parameter grid to search.
    param_grids = {
        "Lasso Regression": {
            "model": Lasso(),
            "params": {"alpha": [0.01, 0.1, 1, 10, 100]}
        },
        "Random Forest Regressor": {
            "model": RandomForestRegressor(),
            "params": {
                "n_estimators": [50, 100, 200],
                "max_depth": [None, 10, 20],
                "min_samples_split": [2, 5, 10]
            }
        },
        "XGBoost": {
            "model": XGBRegressor(),
            "params": {
                "n_estimators": [50, 100],
                "learning_rate": [0.01, 0.1],
                "max_depth": [3, 5]
            }
        },
        "Support Vector Regression": {
            "model": SVR(),
            "params": {
                "C": [0.1, 1, 10],
                "epsilon": [0.01, 0.1],
                "kernel": ["linear", "rbf"]
            }
        }
    }

    # Create the output directory before any fitting starts, so a missing
    # folder cannot make joblib.dump fail after a long grid search has run.
    output_dir = str(model_path) or "."
    os.makedirs(output_dir, exist_ok=True)

    # Run one grid search per candidate and keep the refitted winner.
    best_models = {}
    for model_name, config in param_grids.items():
        estimator = config["model"]
        # Seed only the estimators that actually accept a random_state.
        if random_state is not None and "random_state" in estimator.get_params():
            estimator.set_params(random_state=random_state)

        grid_search = GridSearchCV(
            estimator=estimator,
            param_grid=config["params"],
            scoring="neg_mean_squared_error",
            cv=5,
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        best_models[model_name] = grid_search.best_estimator_
        print(f"Best params for {model_name}: {grid_search.best_params_}")

        # Save the best model, fitted with its best parameters.
        file_name = f"{model_name.replace(' ', '_')}.pkl"
        joblib.dump(grid_search.best_estimator_, os.path.join(output_dir, file_name))
    return best_models

In [15]:
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
# Load the cleaned dataset and give the two percentage columns names without
# '%' (presumably so they can be referenced in the formula below).
cleaned_data_path = '~/Library/CloudStorage/OneDrive-国立大学法人東海国立大学機構/Weekly_challenges/Data science and Analytics/Japan_Life_Expectency/data/processed/Cleaned_Japan_Life_Expectancy.xlsx'
percent_column_names = {'Junior_col_%': 'Junior_col_percent', 'Park_Land_%': 'Park_Land_percent'}
data = pd.read_excel(cleaned_data_path).rename(columns=percent_column_names)

# Define and fit the OLS model: life expectancy regressed on three predictors.
regression_formula = 'Life_expectancy ~ Junior_col_percent + Physician_100kP + Park_Land_percent'
model = ols(regression_formula, data=data).fit()

# Print the summary of the regression results
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:        Life_expectancy   R-squared:                       0.530
Model:                            OLS   Adj. R-squared:                  0.497
Method:                 Least Squares   F-statistic:                     16.14
Date:                Mon, 27 Jan 2025   Prob (F-statistic):           3.59e-07
Time:                        15:17:53   Log-Likelihood:                -16.179
No. Observations:                  47   AIC:                             40.36
Df Residuals:                      43   BIC:                             47.76
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             83.4115      0

# Observations from model_experiments
- `Socioeconomic_index` and `Ambulances_100kP` do not appear to be statistically significant predictors of `life expectancy` (their p-values are greater than 0.05). Removing these features might improve the model’s performance and make it more reliable.
- All three independent variables (`education`, `healthcare access`, and `park access`) are statistically significant predictors of `life expectancy`, with positive relationships.
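- `Education` has the largest impact on `life expectancy` among the three variables, followed by `healthcare access` and `access to parks`. Holding the other predictors constant, a 1% increase in junior_col increases life_expectancy by 1.2 years, 1 additional physician increases life_expectancy by 0.75 years, and a 1% increase in Park_Land increases life_expectancy by 0.56 years. Note that the predictors are measured in different units, so this ranking compares raw coefficient sizes.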
- The model explains a moderate portion (53%) of the variation in `life expectancy`, and the residuals are approximately normally distributed, suggesting that the model is well-specified and the assumptions are reasonably met.