In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/hsuyeemon/SML_Project/main/data/life%20expectancy.csv')

Data Preprocessing

In [17]:
# we will use only the rows where the target Life Expectancy value present.
df = df[~df['Life Expectancy World Bank'].isna()]
print("Number of rows after dropping : ", df.shape[0])

Number of rows after dropping :  3118


In [24]:
# TODO : // get a subset of data for project  
# We will use the data from "East Asia & Pacific,South Asia and Europe & Central Asia" region since 2010.
# data = data[data['Year']>=2010]
# data = data[(data['Region'] == 'East Asia & Pacific') | (data['Region'] == 'South Asia')| (data['Region'] == 'Europe & Central Asia')]

In [46]:
# handle missing data
# remove the countries has over 50% of missing data in Health or Education column
values_to_drop  = [ 
    'Algeria','Antigua and Barbuda','Bermuda','Bolivia',
    'Botswana','Canada','Comoros','Eritrea','France',
    'Greece','Greenland','Grenada','Haiti','Honduras',
    'Iraq','Liberia','Luxembourg','Morocco','Nicaragua',
    'North Macedonia','Papua New Guinea','Puerto Rico','South Sudan',
    'Sudan','Suriname','Tonga','Turkmenistan','United Arab Emirates','United States',
    'Uzbekistan','Vanuatu','Zimbabwe','Dominica','Montenegro',
    'Palau','Bosnia and Herzegovina','Equatorial Guinea','Guam','Nigeria','Somalia',
    'Bosnia and Herzegovina','Equatorial Guinea','Guam','Libya','Nigeria','Somalia',
]

df_filtered = df[~df['Country Name'].isin(values_to_drop)]
print("data points after filtered" , df_filtered.shape[0])
percentage_missing = (df_filtered[['Health Expenditure %','Education Expenditure %']].isnull().sum() / len(df_filtered)) * 100
print("The missing value percent\n",percentage_missing)

data points after filtered 2375
The missing value percent
 Health Expenditure %        0.084211
Education Expenditure %    17.347368
dtype: float64


In [64]:
# Define the subset of countries we want to impute
countries_to_impute = [
    'Afghanistan',
'Albania',
'Angola',
'Australia',
'Austria',
'Bahrain',
'Bangladesh',
'Barbados',
'Belarus',
'Belgium',
'Belize',
'Bhutan',
'Brazil',
'Bulgaria',
'Burkina Faso',
'Burundi',
'Cambodia',
'Cameroon',
'Central African Republic',
'Chad',
'Chile',
'China',
'Costa Rica',
'Croatia',
'Cuba',
'Cyprus',
'Denmark',
'Djibouti',
'Dominican Republic',
'Ecuador',
'El Salvador',
'Estonia',
'Eswatini',
'Ethiopia',
'Fiji',
'Finland',
'Gabon',
'Germany',
'Ghana',
'Guatemala',
'Guinea',
'Guinea-Bissau',
'Guyana',
'Hungary',
'Iceland',
'India',
'Indonesia',
'Ireland',
'Israel',
'Italy',
'Jamaica',
'Japan',
'Jordan',
'Kazakhstan',
'Kenya',
'Kiribati',
'Kuwait',
'Latvia',
'Lebanon',
'Lesotho',
'Lithuania',
'Malawi',
'Malaysia',
'Maldives',
'Mali',
'Malta',
'Mauritania',
'Mexico',
'Mongolia',
'Mozambique',
'Myanmar',
'Namibia',
'Netherlands',
'New Zealand',
'Niger',
'Norway',
'Oman',
'Pakistan',
'Panama',
'Paraguay',
'Philippines',
'Poland',
'Portugal',
'Qatar',
'Romania',
'Rwanda',
'Samoa',
'Sao Tome and Principe',
'Saudi Arabia',
'Senegal',
'Serbia',
'Seychelles',
'Sierra Leone',
'Slovenia',
'Solomon Islands',
'Spain',
'Sri Lanka',
'Sweden',
'Switzerland',
'Tajikistan',
'Tanzania',
'Trinidad and Tobago',
'Tunisia',
'Uganda',
'United Kingdom',
'Uruguay',
'Vietnam',
'Zambia',
'Iraq',
'Zimbabwe',
]


# Filter the DataFrame to include only the specified countries
df_subset = df_filtered[df_filtered['Country Name'].isin(countries_to_impute)]

# Calculate mean for each country group and impute missing values
df_subset['Health Expenditure %'] = df_subset.groupby('Country Name')['Health Expenditure %'].transform(lambda x: x.fillna(x.mean()))

# Calculate mean for each country group and impute missing values
#print(df_subset.groupby('Country Name')['Education Expenditure %'].transform(lambda x: x.fillna(x.mean())))
df_subset['Education Expenditure %'] = df_subset.groupby('Country Name')['Education Expenditure %'].transform(lambda x: x.fillna(x.mean()))

# Merge the subset back into the original DataFrame
df_imputed = pd.concat([df_filtered[~df_filtered['Country Name'].isin(countries_to_impute)], df_subset])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['Health Expenditure %'] = df_subset.groupby('Country Name')['Health Expenditure %'].transform(lambda x: x.fillna(x.mean()))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['Education Expenditure %'] = df_subset.groupby('Country Name')['Education Expenditure %'].transform(lambda x: x.fillna(x.mean()))


In [65]:
# get the columns of interest
df_cleaned = df_imputed[['Health Expenditure %','Education Expenditure %','Life Expectancy World Bank']]
print("The size of data is",df_cleaned.shape[0])
percentage_missing = (df_cleaned.isnull().sum() / len(df_cleaned)) * 100
print("The missing value percent\n",percentage_missing) 

The size of data is 2375
The missing value percent
 Health Expenditure %          0.0
Education Expenditure %       0.0
Life Expectancy World Bank    0.0
dtype: float64


In [66]:
# remove outliers
df_cleaned = df_cleaned[(df_cleaned['Education Expenditure %'] <=8) & (df_cleaned['Health Expenditure %'] <= 12)]

In [67]:
X = df_cleaned[['Health Expenditure %','Education Expenditure %']] #input
y = df_cleaned['Life Expectancy World Bank'] #output

Splitting

In [70]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.4, random_state = 42)

Training

In [75]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

steps = [ ("scale", StandardScaler()),  #standardizing the data to range ( mean = 0 ,variance = 1)
          ("polytransform", PolynomialFeatures(degree = 2)), # transforming the polinomial to linear
          ("regressor", Ridge(1)) ] # Ridge Model

pipeline = Pipeline(steps)

model = pipeline.fit(X_train, y_train)  # goal :  to find the parameters , minimize the error between the error and actual data , doing optimization


Evaluation

In [76]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluating on the training data
y_pred = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train,y_pred)
print('training')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


# Evaluating on the testing data
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
print('testing')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


training
mae: 7 mse: 71 r2 score: 0.2
testing
mae: 6 mse: 69 r2 score: 0.22
