In [None]:
import pandas as pd
# Source CSV for the whole notebook.
# NOTE(review): the path looks like a Colab-mounted Google Drive location - confirm
# it exists in the runtime before running this cell.
DATA_PATH = '/content/drive/MyDrive/Learning/SML-Parami/data/life expectancy.csv'
df = pd.read_csv(DATA_PATH)

Data Preprocessing

In [None]:
# Build the modelling subset: rows from 2010 onwards in three regions, restricted
# to the two expenditure features and the life-expectancy target.
regions_of_interest = ['East Asia & Pacific', 'South Asia', 'Europe & Central Asia']
columns_of_interest = ['Health Expenditure %', 'Education Expenditure %', 'Life Expectancy World Bank']

is_recent = df['Year'] >= 2010
in_selected_region = df['Region'].isin(regions_of_interest)
data = df.loc[is_recent & in_selected_region, columns_of_interest]

# Missing data: keep only the rows where the target (life expectancy) is present.
has_target = data['Life Expectancy World Bank'].notnull()

# Outliers: keep education expenditure <= 8 and health expenditure <= 12.
# NaN compares False against both bounds, so rows with a missing feature value
# are removed by this mask as well.
within_bounds = (data['Education Expenditure %'] <= 8) & (data['Health Expenditure %'] <= 12)

data = data[has_target & within_bounds]

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the filtered
# features and target; the bare last expression renders the table.
summary_stats = data.describe()
summary_stats

Unnamed: 0,Health Expenditure %,Education Expenditure %,Life Expectancy World Bank
count,539.0,539.0,539.0
mean,6.709977,4.50912,75.531363
std,2.642807,1.484173,5.448729
min,1.926669,0.85032,61.028
25%,4.297886,3.467195,71.113
50%,6.568625,4.53671,75.321
75%,9.102098,5.44961,81.084146
max,11.818562,7.95963,84.210976


In [None]:
# Report how many rows survived preprocessing and, per column, what share of
# those rows is still missing a value.
print("The size of data is",data.shape[0])
# The mean of the boolean null-mask is the fraction of missing rows in `data`
# itself. Dividing the null counts by len(df) - the unfiltered frame - would
# understate the percentage, because `data` has far fewer rows than `df`.
percentage_missing = data.isnull().mean() * 100
print("The missing value percent\n",percentage_missing)

The size of data is 539
The missing value percent
 Health Expenditure %          0.0
Education Expenditure %       0.0
Life Expectancy World Bank    0.0
dtype: float64


In [None]:
# Separate the model inputs from the value to predict.
feature_columns = ['Health Expenditure %', 'Education Expenditure %']
target_column = 'Life Expectancy World Bank'

X = data[feature_columns]  # inputs: the two expenditure percentages
y = data[target_column]    # output: life expectancy

Splitting

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Training

In [None]:
from sklearn.preprocessing import StandardScaler,PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline

# Preprocessing and regression chained into a single estimator. Steps run in order:
#   imp_mean      - fill missing feature values with the column mean (SimpleImputer default)
#   scale         - standardise each feature to zero mean and unit variance
#   polytransform - expand the features into polynomial terms up to degree 5, so the
#                   linear regressor can capture a non-linear relationship
#   regressor     - ridge regression (L2-regularised least squares) with alpha = 1
steps = [
    ("imp_mean", SimpleImputer()),
    ("scale", StandardScaler()),
    ("polytransform", PolynomialFeatures(degree=5)),
    ("regressor", Ridge(alpha=1)),
]

pipeline = Pipeline(steps=steps)

# Fitting learns the imputation/scaling statistics and the ridge coefficients from
# the training split only. fit() returns the fitted pipeline, so `model` and
# `pipeline` refer to the same object.
model = pipeline.fit(X_train, y_train)


Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the fitted model on the training split first, then on the held-out test
# split, printing the same three metrics for each. MAE and MSE are rounded to
# whole numbers and R^2 to two decimals.
# After the loop, y_pred / mae / mse / r2 hold the values for the test split.
evaluation_splits = (
    ('training', X_train, y_train),
    ('testing', X_test, y_test),
)

for split_name, split_X, split_y in evaluation_splits:
    y_pred = model.predict(split_X)
    mae = mean_absolute_error(split_y, y_pred)
    mse = mean_squared_error(split_y, y_pred)
    r2 = r2_score(split_y, y_pred)
    print(split_name)
    print('mae:', round(mae), 'mse:', round(mse), 'r2 score:', round(r2, 2))


training
mae: 3 mse: 13 r2 score: 0.56
testing
mae: 3 mse: 14 r2 score: 0.56
