In [118]:
# Imports & setup
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso, LassoCV, LinearRegression, Ridge, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
# from tkinter import Tk
# from tkinter.filedialog import askopenfilename


In [119]:

# This allows to choose the file to open
# Tk().withdraw()  # to hide the main window
# filepath = askopenfilename()  # show an "Open" dialog box and return the path to the selected file

# Feel free to point DATA_PATH at the CSV's location on your local machine.
# Raw string: the backslashes are Windows path separators, not escape sequences
# ("\E" and "\L" are invalid escapes and make a normal string literal emit a warning).
DATA_PATH = r"E:\ECS-171-Project\Life Expectancy Data.csv"
data = pd.read_csv(DATA_PATH)

In [120]:
# Select the predictor columns. Several raw CSV headers carry stray whitespace or
# inconsistent capitalisation, so each raw header is mapped to a tidy name.
column_names = {
    "Alcohol": "Alcohol",
    "Adult Mortality": "Adult Mortality",
    "Hepatitis B": "Hepatitis B",
    "Measles ": "Measles",
    " BMI ": "BMI",
    "Polio": "Polio",
    "Total expenditure": "Total Expenditure",
    "Diphtheria ": "Diphtheria",
    " HIV/AIDS": "HIV/AIDS",
    "GDP": "GDP",
    " thinness  1-19 years": "Thinness 1-19 Years",
    "Schooling": "Schooling",
    "infant deaths": "Infant Deaths",
}
selected_cols = list(column_names)          # raw headers, in selection order
selected_data = data[selected_cols]
selected_data.columns = list(column_names.values())

# Regression target, relabelled without the trailing space of the raw header
target = data["Life expectancy "]
target.name = "Life Expectancy"

In [121]:
# Transform selected columns
def _complement_log(x):
    """Return log(100 - x)."""
    return np.log(100 - x)


def _offset_log(x):
    """Return log(x + 0.1)."""
    return np.log(x + 0.1)


def _unchanged(x):
    """Return x as-is, leaving the column on its original scale."""
    return x


# Column name -> transform applied to that column.
# The key order is deliberately left exactly as it was, since the mapping is
# handed straight to DataFrame.apply (NOTE(review): it may determine the column
# order of the result - confirm before reordering).
column_transforms = {
    "Adult Mortality": np.sqrt,
    "Alcohol": np.sqrt,
    "Hepatitis B": _complement_log,
    "Measles": _offset_log,
    "BMI": _complement_log,
    "Polio": _complement_log,
    "Total Expenditure": _unchanged,
    "Diphtheria": _complement_log,
    "HIV/AIDS": np.log,
    "GDP": np.log,
    "Thinness 1-19 Years": np.log,
    "Schooling": _unchanged,
    "Infant Deaths": _unchanged,
}
# NOTE(review): this overwrites selected_data with its transformed version, so
# re-running the cell without re-running the selection cell transforms twice.
selected_data = selected_data.apply(column_transforms)

In [122]:
# Remove rows with outliers or where target is missing
def find_outliers(data, whisker=1.5):
  """Return the positions of values lying outside the IQR fences.

  The fences are ``Q1 - whisker*IQR`` and ``Q3 + whisker*IQR``, where the
  quartiles are computed while ignoring NaN entries.

  Parameters
  ----------
  data : array-like of numbers (list, ndarray, pandas Series, ...)
      Values to screen. Converted to a float ndarray internally.
  whisker : float, default 1.5
      Multiple of the inter-quartile range that defines the fences.

  Returns
  -------
  list of int
      Positional (0-based) indices of the flagged values. NaN entries are
      never flagged. Empty or all-NaN input yields an empty list.
  """
  values = np.asarray(data, dtype=float)
  # No usable values: skip the percentile call (which would only produce NaN
  # fences plus a warning) and report no outliers.
  if values.size == 0 or np.isnan(values).all():
    return []

  q1, q3 = np.nanpercentile(values, [25, 75])
  iqr = q3 - q1
  min_threshold = q1 - whisker*iqr
  max_threshold = q3 + whisker*iqr
  # Comparisons against NaN are False on both sides, so missing values stay unflagged.
  outside = (values < min_threshold) | (values > max_threshold)
  return np.where(outside)[0].tolist()

# Rows to discard: first those whose target is missing ...
missing_target_rows = np.where(target.isna())[0]
drop_rows = list(missing_target_rows)
# ... then every row flagged as an outlier in at least one feature column.
drop_rows.extend(
    row
    for _, column in selected_data.items()
    for row in find_outliers(column)
)

# The collected values are positions, while drop(index=...) takes labels.
# NOTE(review): this relies on both objects having a default 0..n-1 index - confirm.
# Features and target lose the same rows and are renumbered from zero.
data_drop_rows = selected_data.drop(index=drop_rows).reset_index(drop=True)
clean_target = target.drop(index=drop_rows).reset_index(drop=True)

In [123]:
# Fill remaining gaps in each feature column with that column's mean.
# NOTE(review): the imputer is fit on every remaining row, before the train/test
# split below, so test-row values contribute to the imputed means - consider
# fitting on the training split only.
imputer = SimpleImputer(strategy="mean")
imputer.set_output(transform="pandas")  # return a DataFrame instead of an ndarray
clean_data = imputer.fit_transform(data_drop_rows)

In [124]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    clean_data,
    clean_target,
    train_size=0.8,
    random_state=42,
)

In [125]:
# Train a plain linear regression on the training split, predict the held-out
# rows, and display the test-set mean squared error as the cell output.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)
mean_squared_error(y_true=y_test, y_pred=y_pred)

14.465076776536385

In [126]:
# Score of the fitted linear model on the held-out split
r_squared = model.score(X=X_test, y=y_test)
print("R-squared value:", r_squared)


R-squared value: 0.8090454357903173


We use cross-validation to check that the model's score does not depend on one particular split. Specifically, we use shuffle-split (random permutation) cross-validation, which repeatedly shuffles the data and draws a new random train/test split each time.

In [127]:
# cross_val_score is imported together with ShuffleSplit: the imports cell does
# not provide it, so this cell raised NameError on a fresh kernel.
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Ten independent random 80/20 splits (random permutations rather than k-fold
# partitions, despite the variable name); the seed makes them repeatable.
kfold = ShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
# One score per split, computed on the full cleaned data set with the linear model.
result_cross_val = cross_val_score(model, clean_data, clean_target, cv=kfold)
print(result_cross_val)

[0.81862037 0.80799447 0.80205991 0.80583582 0.81971378 0.81767745
 0.83528402 0.80789415 0.79484749 0.79713436]


In [128]:
# Average score over all cross-validation splits
np.mean(result_cross_val)



0.8107061820959955

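The mean cross-validation score (≈ 0.811) is very close to the single-split test score (≈ 0.809), so the hold-out result is not an artifact of one particular split.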

## Ridge vs Lasso regularization

In [129]:
# Candidate regularization strengths for RidgeCV to choose from
alphas = [0.01, 0.1, 0.4, 1, 10, 100]

# Fit model. RidgeCV is imported with the other sklearn.linear_model estimators
# in the imports cell; the alpha is selected by 5-fold cross validation on the
# training split only.
# NOTE(review): the features are not standardized before fitting, and the ridge
# penalty depends on feature scale - consider scaling first.
ridge_cv = RidgeCV(alphas=alphas, cv=5)
ridge_cv.fit(X_train, y_train)

# Get the best alpha value
best_alpha = ridge_cv.alpha_
print("Best alpha value: ", best_alpha)

# Score of the selected model on the held-out split
ridge_cv_score = ridge_cv.score(X_test, y_test)
print("RidgeCV R-squared value: ", ridge_cv_score)

# Mean squared error on the held-out split
y_pred_2 = ridge_cv.predict(X_test)
mse_ridge = mean_squared_error(y_test, y_pred_2)
print("RidgeCV Mean squared error: ", mse_ridge)



Best alpha value:  10.0
RidgeCV R-squared value:  0.8089215297075956
RidgeCV Mean squared error:  14.474462836550549


In [130]:
# Candidate regularization strengths for LassoCV to choose from
alphas = [0.001, 0.05, 0.1, 1.0, 0.009]

# Fit model. LassoCV is imported with the other sklearn.linear_model estimators
# in the imports cell; the alpha is selected by 5-fold cross validation on the
# training split only.
lasso_cv = LassoCV(alphas=alphas, cv=5)
lasso_cv.fit(X_train, y_train)

# Get the best alpha value
best_alpha = lasso_cv.alpha_
print("Best alpha value: ", best_alpha)

# Stored under its own name: this used to be written to ridge_cv_score, which
# silently replaced the Ridge result from the previous cell.
lasso_cv_score = lasso_cv.score(X_test, y_test)
print("LassoCV R-squared value: ", lasso_cv_score)

# Mean squared error on the held-out split
y_pred_3 = lasso_cv.predict(X_test)
mse_lasso = mean_squared_error(y_test, y_pred_3)
print("LassoCV Mean squared error: ", mse_lasso)

Best alpha value:  0.001
LassoCV R-squared value:  0.8090342015398175
LassoCV Mean squared error:  14.46592778681037


## K-Nearest Neighbors

In [131]:
# Create the nearest-neighbours regressor with its default settings and train it
# on the training split.
# NOTE(review): no feature scaling is applied before this distance-based model -
# confirm that is intended.
knr_model = KNeighborsRegressor().fit(X_train, y_train)

# Predictions for the held-out rows
y_pred_4 = knr_model.predict(X_test)

# Score on the held-out split
knr_model_score = knr_model.score(X_test, y_test)
print("KNN R-squared value: ", knr_model_score)

# Mean squared error on the held-out split
mse_knr = mean_squared_error(y_test, y_pred_4)
print("KNN Mean squared error: ", mse_knr)

KNN R-squared value:  0.8797436587460512
KNN Mean squared error:  9.109586965376781


## Random Forest 

In [133]:
# Random forests are stochastic; without a seed the scores below change on every
# run. The seed matches the one used for the train/test split.
RandomForestRegressor_model = RandomForestRegressor(random_state=42)
RandomForestRegressor_model.fit(X_train, y_train)

# Score on the held-out split
RandomForestRegressor_model_score = RandomForestRegressor_model.score(X_test, y_test)
print("RandomForestRegressor R-squared value: ", RandomForestRegressor_model_score)

# Mean squared error on the held-out split
y_pred_5 = RandomForestRegressor_model.predict(X_test)
mse_RandomForestRegressor = mean_squared_error(y_test, y_pred_5)
print("RandomForestRegressor Mean squared error: ", mse_RandomForestRegressor)

RandomForestRegressor R-squared value:  0.9514165802748376
RandomForestRegressor Mean squared error:  3.680262366598787
