In [1]:
# This dataset contains information compiled by the World Health Organization 
# and the United Nations to track factors that affect life expectancy. 
# The data contains 2938 rows and 22 columns. The columns include: country, 
# year, developing status, adult mortality, life expectancy, infant deaths, 
# alcohol consumption per capita, country’s expenditure on health, immunization 
# coverage, BMI, under-five deaths, deaths due to HIV/AIDS, GDP, population,
# thinness prevalence, income composition of resources, and schooling.

# https://www.kaggle.com/datasets/kumarajarshi/life-expectancy-who/data

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import seaborn as sns

# A single theme call is sufficient: each sns.set_theme() call re-applies the
# whole style, so an extra style="white" call before this one has no lasting effect.
sns.set_theme(style="whitegrid", color_codes=True)

# Apply the font-size override AFTER the theme. sns.set_theme() updates
# matplotlib's rcParams (font.size among them), so setting the size first
# would be silently discarded.
plt.rc("font", size=14)

In [2]:
# Read the CSV (first line is the header) and drop every row that contains
# at least one missing value.
data = pd.read_csv(r'Life_Expectancy_Data.csv', header=0).dropna()

# Shape after dropping incomplete rows: (1649 rows, 22 columns).
print(data.shape)

# Preview the first five rows.
data.head(5)

(1649, 22)


Unnamed: 0,Country,Year,Status,Life expectancy,Adult Mortality,infant deaths,Alcohol,percentage expenditure,Hepatitis B,Measles,...,Polio,Total expenditure,Diphtheria,HIV/AIDS,GDP,Population,thinness 1-19 years,thinness 5-9 years,Income composition of resources,Schooling
0,Afghanistan,2015,Developing,65.0,263.0,62,0.01,71.279624,65.0,1154,...,6.0,8.16,65.0,0.1,584.25921,33736494.0,17.2,17.3,0.479,10.1
1,Afghanistan,2014,Developing,59.9,271.0,64,0.01,73.523582,62.0,492,...,58.0,8.18,62.0,0.1,612.696514,327582.0,17.5,17.5,0.476,10.0
2,Afghanistan,2013,Developing,59.9,268.0,66,0.01,73.219243,64.0,430,...,62.0,8.13,64.0,0.1,631.744976,31731688.0,17.7,17.7,0.47,9.9
3,Afghanistan,2012,Developing,59.5,272.0,69,0.01,78.184215,67.0,2787,...,67.0,8.52,67.0,0.1,669.959,3696958.0,17.9,18.0,0.463,9.8
4,Afghanistan,2011,Developing,59.2,275.0,71,0.01,7.097109,68.0,3013,...,68.0,7.87,68.0,0.1,63.537231,2978599.0,18.2,18.2,0.454,9.5


In [3]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [11]:
# Feature matrix: the six predictor columns used by the regression.
X = data[[
    'Adult Mortality',
    'infant deaths',
    'Alcohol',
    'Schooling',
    'GDP',
    'Population',
]]

# Regression target. Note the trailing space in the column label used here.
y = data['Life expectancy ']

In [12]:
# Two-step pipeline: standardize the features, then fit a linear regression.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('regression', LinearRegression()),
])

In [13]:
# Training and test sets: hold out 20% of the rows for testing, with a fixed
# random_state so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [14]:
# Train the model: fit every pipeline step on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [15]:
# Predict target values for the held-out test split.
y_pred = pipeline.predict(X=X_test)

In [16]:
# Evaluate model accuracy on the test split.
# MSE is computed once and RMSE is derived from it, rather than calling
# mean_squared_error twice on the same inputs.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print('Root Mean Squared Error:', rmse)
print('Mean Squared Error:', mse)
print('Mean Absolute Error:', mae)

Root Mean Squared Error: 5.08323820596476
Mean Squared Error: 25.83931065857983
Mean Absolute Error: 3.5064089009472608
