In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Load the raw table and report its size before any cleaning.
raw_df = pd.read_csv("LifeExpectancyDataset - Sheet1.csv")
print(raw_df.shape)

# Keep only rows without any missing value; the second shape shows how many survive.
df = raw_df.dropna()
print(df.shape)

In [None]:
# Encode the development status as a numeric flag: Developing -> 0, Developed -> 1.
# Any other value in the column is left untouched.
for status_name, status_code in (("Developing", 0), ("Developed", 1)):
    df.loc[df["Status"] == status_name, "Status"] = status_code


# Fraction of each country's rows that goes into the training split.
train_proportion = 0.8

def stratified_split(df, group_col, train_proportion, random_state=None):
    """Split ``df`` into train/test parts, sampling the same fraction from every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to split. Its index must be unique, because the test part is
        built by dropping the sampled index labels from ``df``.
    group_col : str
        Column whose values define the strata (e.g. ``'Country'``).
    train_proportion : float
        Fraction of each group's rows placed in the training part. The per-group
        row count is rounded, so very small groups may end up entirely in train.
    random_state : int, numpy.random.RandomState, numpy.random.Generator or None
        Seed / generator forwarded to pandas for a reproducible split. ``None``
        (the default) keeps the previous behaviour of using NumPy's global RNG.

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Disjoint row subsets of ``df``; both still contain ``group_col``.
    """
    # GroupBy.sample keeps the grouping column in the result on every pandas
    # version, unlike groupby(...).apply(...), which is deprecated for touching
    # the grouping column and excludes it in newer pandas releases.
    train_df = df.groupby(group_col, group_keys=False).sample(
        frac=train_proportion, random_state=random_state
    )
    # Everything that was not sampled becomes the test part.
    test_df = df.drop(train_df.index)
    return train_df, test_df

# Seed NumPy's global RNG before splitting: pandas sampling falls back to it
# when no random_state is passed, so without a seed every run of the notebook
# produces a different train/test partition (and a different reported RMSE).
np.random.seed(42)
train_df, test_df = stratified_split(df, 'Country', train_proportion)

# Every cleaned row must land in exactly one of the two splits.
assert len(train_df) + len(test_df) == len(df)

# One-hot encode the country in each split, then align so both frames expose
# the same columns; a country missing from one split becomes an all-zero column.
train_df = pd.get_dummies(train_df, columns=["Country"])
test_df = pd.get_dummies(test_df, columns=["Country"])
train_df, test_df = train_df.align(test_df, join='outer', axis=1, fill_value=0)
assert train_df.columns.equals(test_df.columns)

# Cast everything (0/1 status flag and dummy columns included) to float.
train_df = train_df.astype(float)
test_df = test_df.astype(float)

# Separate the regression target from the feature columns.
train_labels = train_df["Life expectancy"]
train_data = train_df.drop(columns=["Life expectancy"])

test_labels = test_df["Life expectancy"]
test_data = test_df.drop(columns=["Life expectancy"])

print(train_df.head(50))

In [None]:
# Sanity-check the prepared splits: shapes, the training features, and column sets.
for summary in (
    train_labels.shape,
    test_data.shape,
    train_data,
    test_labels.shape,
    train_data.columns,
    test_data.columns,
):
    print(summary)

In [None]:
## now ready to build the model
class Linear_model:
    """Linear regression fitted with batch gradient descent on standardized features.

    Features are z-scored with the column means / standard deviations of the
    training data; the same statistics are reused when predicting.

    Attributes
    ----------
    raw_data : numpy.ndarray
        Training features exactly as passed in (n_samples x n_features).
    data : numpy.ndarray
        Features used by the optimizer; raw until ``train`` is called, then
        the standardized version of ``raw_data``.
    labels : numpy.ndarray
        Training targets (n_samples,).
    w, b : numpy.ndarray, float
        Learned weights (one per feature) and bias.
    mean_val, std_val : numpy.ndarray
        Per-column statistics; only available after ``train`` (or a call to
        ``normalize(..., training=True)``).
    """

    def __init__(self, data, labels):
        """Store the training set and start from all-zero parameters.

        ``data`` may be a pandas DataFrame or a 2-D array-like; ``labels`` may
        be a pandas Series or a 1-D array-like. Both are converted to float.
        """
        self.raw_data = np.asarray(data, dtype=float)
        # Until train() runs, the optimizer view is just the raw features.
        self.data = self.raw_data
        self.labels = np.asarray(labels, dtype=float)
        self.w = np.zeros(self.raw_data.shape[1], dtype=float)
        self.b = 0.0

    def normalize(self, data, training=False):
        """Z-score ``data`` column-wise.

        With ``training=True`` the column means and standard deviations are
        computed from ``data`` and stored on the model; otherwise the stored
        statistics are applied (so the model must have been trained first).
        """
        data = np.asarray(data, dtype=float)
        if training:
            self.mean_val = np.mean(data, axis=0)
            std = np.std(data, axis=0)
            # A constant column (e.g. an all-zero one-hot column) has std 0;
            # dividing by it would turn the feature, and then every weight,
            # into NaN. Dividing by 1 instead maps such a column to zeros.
            self.std_val = np.where(std == 0, 1.0, std)
        return (data - self.mean_val) / self.std_val

    def train(self, epochs, a, log_every=1):
        """Run ``epochs`` gradient-descent steps with learning rate ``a``.

        ``log_every`` controls how often the training loss is printed: every
        ``log_every``-th epoch, or never when it is 0 / None. The default of 1
        prints after every epoch.
        """
        # Always standardize from the raw features so that calling train()
        # again continues training without re-normalizing normalized data
        # (which would overwrite the statistics predict() relies on).
        self.data = self.normalize(self.raw_data, True)
        for epoch in range(epochs):
            self.update_w_and_b(a)
            if log_every and (epoch + 1) % log_every == 0:
                print("loss: ", self.calc_loss())

    def calc_loss(self):
        """Return the mean squared error of the current parameters on ``self.data``."""
        residuals = self.labels - (np.dot(self.data, self.w) + self.b)
        return np.sum(residuals ** 2) / len(self.data)

    def update_w_and_b(self, a):
        """Apply one gradient-descent step of size ``a`` to ``w`` and ``b``."""
        data = self.data
        labels = self.labels
        n = len(data)

        predictions = np.dot(data, self.w) + self.b
        diff = labels - predictions

        # Gradients of the mean squared error with respect to w and b.
        grad_w = -2 / n * np.dot(data.T, diff)
        grad_b = -2 / n * np.sum(diff)

        self.w -= a * grad_w
        self.b -= a * grad_b

    def predict(self, X):
        """Predict targets for raw (un-normalized) features ``X``.

        ``X`` may be a DataFrame or a 2-D array-like whose columns are in the
        same order as the training features.
        """
        X = self.normalize(X)
        return np.dot(X, self.w) + self.b

    def evaluate(self, X, y):
        """Return the root-mean-squared error of the predictions for ``X`` against ``y``."""
        n = len(X)
        predictions = self.predict(X)
        # Convert y to a plain array so a pandas index cannot affect the subtraction.
        errors = np.asarray(y, dtype=float) - predictions
        return np.sqrt(np.sum(errors ** 2) / n)

In [None]:
# Fit the model on the training split with plain gradient descent.

a = 0.01  # learning rate
n_epochs = 10000
model2 = Linear_model(data=train_data, labels=train_labels)
model2.train(epochs=n_epochs, a=a)

In [None]:
# Inspect the learned weights: one coefficient per feature column, expressed
# on the standardized (z-scored) feature scale used during training.
print(model2.w)

In [None]:
# Root-mean-squared error of the trained model on the held-out test split.
print(model2.evaluate(test_data, test_labels))

In [None]:
# A single hand-written observation to run through the trained model.
new_data = pd.DataFrame({
    'Country': ["Australia"],
    'Year': [2013],
    'Status': [1],
    'Adult Mortality': [61],
    'infant deaths': [1],
    'Alcohol': [9.87],
    'percentage expenditure': [11734.85381],
    'Hepatitis B': [91],
    'Measles': [158],
    'BMI': [65.5],
    'under-five deaths': [1],
    'Polio': [91],
    'Total expenditure': [9.36],
    'Diphtheria': [91],
    'HIV/AIDS': [0.1],
    'GDP': [67792.3386],
    'Population': [23117353],
    'thinness  1-19 years': [0.6],
    'thinness 5-9 years': [0.6],
    'Income composition of resources': [0.933],
    'Schooling': [20.3]
})

# One-hot encode the country by hand, matching the dummy column naming used for training.
new_data["Country_Australia"] = 1
new_data = new_data.drop(columns=["Country"])

# Fail loudly on a column the model never saw (typo in a name, unseen country)
# instead of silently dropping it in the reindex below.
unknown_columns = new_data.columns.difference(train_data.columns)
assert unknown_columns.empty, f"columns not seen in training: {list(unknown_columns)}"

# Put the features in exactly the training column order, filling the other
# country dummies with 0. Unlike an outer align, this neither rebinds
# train_data nor re-sorts columns, so features stay lined up with the weights.
new_data = new_data.reindex(columns=train_data.columns, fill_value=0)

print(model2.predict(new_data))