In [1]:
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/hsuyeemon/SML_Project/main/data/life%20expectancy.csv')

Data Preprocessing

In [2]:
# we will use only the rows where the target Life Expectancy value present.
df = df[~df['Life Expectancy World Bank'].isna()]
print("Number of rows after dropping : ", df.shape[0])

Number of rows after dropping :  3118


In [3]:
percentage_missing = (df[['Health Expenditure %','Education Expenditure %']].isnull().sum() / len(df)) * 100
print("The missing value percent\n",percentage_missing)

The missing value percent
 Health Expenditure %        4.554201
Education Expenditure %    31.205901
dtype: float64


In [4]:
# TODO : // get a subset of data for project  

# Reference
# We will use the data from "East Asia & Pacific,South Asia and Europe & Central Asia" region since 2010.
#df = df[df['Year']>=2010]

# regions_filter = ['','']
# df = df[df['Region'].isin(regions_filter)]
# data = data[(data['Region'] == 'East Asia & Pacific') | (data['Region'] == 'South Asia')| (data['Region'] == 'Europe & Central Asia')]

In [5]:
df_filtered = df[~df['Country Name'].isin(['Dominica', 'Palau'])]

In [6]:
#df_filtered=df_filtered[(df_filtered['Region'] == 'East Asia & Pacific') | (df_filtered['Region'] == 'South Asia')| (df_filtered['Region'] == 'Europe & Central Asia')]
df_filtered=df_filtered[(df_filtered['Region'] == 'East Asia & Pacific')]

In [7]:
percentage_missing = (df_filtered[['Health Expenditure %','Health Expenditure %']].isnull().sum() / len(df_filtered)) * 100
print("The missing value percent\n",percentage_missing)

The missing value percent
 Health Expenditure %    4.761905
Health Expenditure %    4.761905
dtype: float64


In [8]:
# Group by 'Country' and count the number of unique years
country_edu_data_counts = df_filtered[['Country Name','Education Expenditure %']].groupby('Country Name')['Education Expenditure %'].nunique() 

# Filter out countries with less than 5 years of data
countries_edu_with_less_than_5_years = country_edu_data_counts[country_edu_data_counts <8].index.tolist()

# Filter out countries to impute
countries_edu_with_null_value = df_filtered[df_filtered['Education Expenditure %'].isna() == True]['Country Name'].unique().tolist()

print(countries_edu_with_less_than_5_years)
print(countries_edu_with_null_value)

['Guam', 'Papua New Guinea', 'Tonga']
['Australia', 'China', 'Guam', 'Myanmar', 'Mongolia', 'Papua New Guinea', 'Solomon Islands', 'Vietnam', 'Kiribati', 'Fiji', 'Cambodia', 'Samoa', 'Vanuatu', 'Malaysia', 'Tonga', 'Indonesia', 'Japan', 'Philippines', 'New Zealand']


In [9]:
# Group by 'Country' and count the number of unique years
country_hea_data_counts = df_filtered[['Country Name','Health Expenditure %']].groupby('Country Name')['Health Expenditure %'].nunique() 

# Filter out countries with less than 5 years of data
countries_hea_with_less_than_5_years = country_hea_data_counts[country_hea_data_counts < 8].index.tolist()

# Filter out countries to impute
countries_hea_with_null_value = df_filtered[df_filtered['Health Expenditure %'].isna() == True]['Country Name'].unique().tolist()


print(countries_hea_with_less_than_5_years)
print(countries_hea_with_null_value)

['Guam']
['Guam']


In [10]:
# handle missing data

df_filtered = df_filtered[~df_filtered['Country Name'].isin(countries_edu_with_less_than_5_years)]
print("data points after filtered" , df_filtered.shape[0])

df_filtered = df_filtered[~df_filtered['Country Name'].isin(countries_hea_with_less_than_5_years)]
print("data points after filtered" , df_filtered.shape[0])

data points after filtered 342
data points after filtered 342


In [11]:
# imputation

# Filter the DataFrame to include only the specified countries
df_subset = df_filtered[df_filtered['Country Name'].isin(countries_edu_with_null_value)]

# Calculate mean for each country group and impute missing values
#print(df_subset.groupby('Country Name')['Education Expenditure %'].transform(lambda x: x.fillna(x.mean())))
df_subset['Education Expenditure %'] = df_subset.groupby('Country Name')['Education Expenditure %'].transform(lambda x: x.fillna(x.mean()))

# Merge the subset back into the original DataFrame
df_imputed = pd.concat([df_filtered[~df_filtered['Country Name'].isin(countries_edu_with_null_value)], df_subset])

percentage_missing = (df_imputed[['Education Expenditure %']].isnull().sum() / len(df_imputed)) * 100
print("The missing value percent\n",percentage_missing)



# Filter the DataFrame to include only the specified countries
df_subset = df_imputed[df_imputed['Country Name'].isin(countries_hea_with_null_value)]

# Calculate mean for each country group and impute missing values
#print(df_subset.groupby('Country Name')['Education Expenditure %'].transform(lambda x: x.fillna(x.mean())))
df_subset['Health Expenditure %'] = df_subset.groupby('Country Name')['Health Expenditure %'].transform(lambda x: x.fillna(x.mean()))

# Merge the subset back into the original DataFrame
df_imputed = pd.concat([df_imputed[~df_imputed['Country Name'].isin(countries_hea_with_null_value)], df_subset])

percentage_missing = (df_imputed[['Health Expenditure %']].isnull().sum() / len(df_imputed)) * 100
print("The missing value percent\n",percentage_missing)

The missing value percent
 Education Expenditure %    0.0
dtype: float64
The missing value percent
 Health Expenditure %    0.0
dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['Education Expenditure %'] = df_subset.groupby('Country Name')['Education Expenditure %'].transform(lambda x: x.fillna(x.mean()))


In [12]:
# get the columns of interest
df_cleaned = df_imputed[['Health Expenditure %','Education Expenditure %','Life Expectancy World Bank']]
print("The size of data is",df_cleaned.shape[0])
percentage_missing = (df_cleaned.isnull().sum() / len(df_cleaned)) * 100
print("The missing value percent\n",percentage_missing) 

The size of data is 342
The missing value percent
 Health Expenditure %          0.0
Education Expenditure %       0.0
Life Expectancy World Bank    0.0
dtype: float64


In [13]:
# remove outliers
df_cleaned = df_cleaned[(df_cleaned['Education Expenditure %'] <=8) & (df_cleaned['Health Expenditure %'] <= 12)]

In [14]:
X = df_cleaned[['Health Expenditure %','Education Expenditure %']] #input
y = df_cleaned['Life Expectancy World Bank'] #output

Splitting

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.28, random_state = 42)

Training

Evaluation

In [54]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import Ridge , LinearRegression , Lasso
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [55]:
print("Linear Regression")
steps = [ ("scale", StandardScaler()),  #standardizing the data to range ( mean = 0 ,variance = 1)
          ("regressor", LinearRegression()) ] # Ridge Model

pipeline = Pipeline(steps)

model = pipeline.fit(X_train, y_train)  # goal :  to find the parameters , minimize the error between the error and actual data , doing optimization

# Evaluating on the training data
y_pred = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train,y_pred)
print('training')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


# Evaluating on the testing data
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
print('testing')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


Linear Regression
training
mae: 4 mse: 23 r2 score: 0.39
testing
mae: 4 mse: 25 r2 score: 0.29


In [56]:
degree =5
alpha = 0.7

In [57]:
print("Poly Regression")

steps = [ ("scale", StandardScaler()),  #standardizing the data to range ( mean = 0 ,variance = 1)
          ("polytransform", PolynomialFeatures(degree = degree)), # transforming the polinomial to linear
          ("regressor", LinearRegression()) ] # Ridge Model

pipeline = Pipeline(steps)

model = pipeline.fit(X_train, y_train)  # goal :  to find the parameters , minimize the error between the error and actual data , doing optimization

# Evaluating on the training data
y_pred = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train,y_pred)
print('training')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


# Evaluating on the testing data
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
print('testing')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


Poly Regression
training
mae: 3 mse: 11 r2 score: 0.71
testing
mae: 2 mse: 11 r2 score: 0.7


In [58]:
print("Poly-Ridge Regression")

steps = [ ("scale", StandardScaler()),  #standardizing the data to range ( mean = 0 ,variance = 1)
          ("polytransform", PolynomialFeatures(degree = degree)), # transforming the polinomial to linear
          ("regressor", Ridge(alpha)) ] # Ridge Model

pipeline = Pipeline(steps)

model = pipeline.fit(X_train, y_train)  # goal :  to find the parameters , minimize the error between the error and actual data , doing optimization

# Evaluating on the training data
y_pred = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train,y_pred)
print('training')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


# Evaluating on the testing data
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
print('testing')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


Poly-Ridge Regression
training
mae: 3 mse: 11 r2 score: 0.71
testing
mae: 2 mse: 11 r2 score: 0.69


In [59]:
print("Poly-Lasso Regression")

steps = [ ("scale", StandardScaler()),  #standardizing the data to range ( mean = 0 ,variance = 1)
          ("polytransform", PolynomialFeatures(degree = degree)), # transforming the polinomial to linear
          ("regressor", Lasso(alpha)) ] # Ridge Model

pipeline = Pipeline(steps)

model = pipeline.fit(X_train, y_train)  # goal :  to find the parameters , minimize the error between the error and actual data , doing optimization

# Evaluating on the training data
y_pred = model.predict(X_train)
mae = mean_absolute_error(y_train, y_pred)
mse = mean_squared_error(y_train, y_pred)
r2 = r2_score(y_train,y_pred)
print('training')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


# Evaluating on the testing data
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test,y_pred)
print('testing')
print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


Poly-Lasso Regression
training
mae: 3 mse: 17 r2 score: 0.56
testing
mae: 4 mse: 18 r2 score: 0.47
