### Importing libraries

In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Reading data
train_data = pd.read_csv('../data/train.csv')
test_data = pd.read_csv('../data/test.csv')

# Feature Engineering

In [5]:
# Exploring the data
train_data.isnull().sum()

Country                   0
Year                      0
Status                    0
Population              644
Hepatitis B             542
Measles                   0
Polio                    19
Diphtheria               19
HIV/AIDS                  0
infant deaths             0
under-five deaths         0
Total expenditure       221
GDP                     442
BMI                      32
thinness  1-19 years     32
Alcohol                 188
Schooling               160
Life expectancy           0
dtype: int64

In [6]:
train_data.dtypes

Country                  object
Year                      int64
Status                   object
Population              float64
Hepatitis B             float64
Measles                   int64
Polio                   float64
Diphtheria              float64
HIV/AIDS                float64
infant deaths             int64
under-five deaths         int64
Total expenditure       float64
GDP                     float64
BMI                     float64
thinness  1-19 years    float64
Alcohol                 float64
Schooling               float64
Life expectancy         float64
dtype: object

In [8]:
# Converting non numeric data to numeric data
train_data['Status'] = (train_data['Status'] == 'Developed').astype(int)
test_data['Status'] = (test_data['Status'] == 'Developed').astype(int)

In [10]:
# Filling missing values
columns = train_data.drop(['Life expectancy', 'Country', 'Year', 'Status'], axis=1).columns.tolist()

grouped = train_data.groupby('Country')

for col in columns:
    temp_mean = train_data[col].fillna(grouped[col].transform('mean'))
    test_data[col] = test_data[col].fillna(temp_mean.mean())
    train_data[col] = train_data[col].fillna(temp_mean.mean())
    
    temp_median = train_data[col].fillna(grouped[col].transform('median'))
    test_data[col] = test_data[col].fillna(temp_median.mean())
    train_data[col] = train_data[col].fillna(temp_median.mean())

In [11]:
# Removing unneccesary columns and defining target value
X = train_data.drop(['Life expectancy', 'Country', 'Year', 'Population', 'Hepatitis B', 'Alcohol'], axis=1)
y = train_data['Life expectancy']
test_data.drop(['Country', 'Year','Population', 'Hepatitis B', 'Alcohol'], axis=1, inplace=True)

In [13]:
# Splitting and standardizing the data
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid= train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=69)

std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_valid = std_scaler.transform(X_valid)
test_data = std_scaler.transform(test_data)

## Modeling

In [14]:
from sklearn.preprocessing import PolynomialFeatures

poly_transformer = PolynomialFeatures()
x_train_trans = poly_transformer.fit_transform(X_train)
x_test_trans = poly_transformer.transform(X_valid)
test_data_trans = poly_transformer.transform(test_data)

In [15]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(x_train_trans, y_train)
y_pred = model.predict(x_test_trans)

In [16]:
# Evaluate the model
from sklearn.metrics import r2_score

r2_score(y_valid, y_pred)

0.8399516509882674