# Initialization

In [1]:
# Importing required modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
%matplotlib inline

In [2]:
# Load the dataset and discard the 'children' column in a single expression
df = pd.read_csv('insurance.txt').drop(columns='children')

df.head()

Unnamed: 0,age,bmi,charges
0,19,27.9,16884.924
1,18,33.77,1725.5523
2,28,33.0,4449.462
3,33,22.705,21984.47061
4,32,28.88,3866.8552


## Function to normalize dataset
### `normalize_dataset` rescales every column to the range $[0, 1]$ (min-max normalization) using the formula
## $X_{normalized} = \frac {X - min(X_i)} {max(X_i) - min(X_i)}$

In [3]:
def normalize_dataset(dataframe=None):
    """Min-max scale every column of a dataframe into the range [0, 1].

    Each column is transformed as ``(X - min(X)) / (max(X) - min(X))``.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame to normalize. When omitted, the notebook-level ``df`` is used,
        so the original zero-argument call keeps working.

    Returns
    -------
    normal_df : pandas.DataFrame
        New frame holding the scaled values; the input frame is not modified.
    normalize_coeff : dict
        ``{column: {'max': ..., 'min': ...}}`` holding the pre-scaling
        extremes of every column, which are needed to undo the scaling.
    """
    if dataframe is None:
        dataframe = df

    # normalize_coeff stores max and min of every column for denormalization
    normalize_coeff = {
        col: {'max': dataframe[col].max(), 'min': dataframe[col].min()}
        for col in dataframe.columns
    }

    col_min = dataframe.min()
    # A constant column has max == min; dividing by that zero range would fill
    # the column with NaN, so fall back to a range of 1 (the column maps to 0).
    col_range = (dataframe.max() - col_min).replace(0, 1)

    # normal_df is the normalized dataframe
    normal_df = (dataframe - col_min) / col_range

    return normal_df, normalize_coeff

In [4]:
# Scale the dataset and keep each column's min/max so values can be denormalized later
normal_df, normalize_coeff = normalize_dataset()

# Preview the first rows of the normalized data
print(normal_df.head())

        age       bmi   charges
0  0.021739  0.321227  0.251611
1  0.000000  0.479150  0.009636
2  0.217391  0.458434  0.053115
3  0.326087  0.181464  0.333010
4  0.304348  0.347592  0.043816


## Function to shuffle the dataset and return $X_{train}, Y_{train}, X_{val}, Y_{val}, X_{test}, Y_{test}$ matrices 

In [5]:
from math import ceil
def shuffle(dataframe, train_ratio, val_ratio, target, random_state=None):
    """Shuffle the rows of ``dataframe`` and split them into train/validation/test arrays.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to split; it is not modified.
    train_ratio : float
        Fraction of rows assigned to the training set.
    val_ratio : float
        Fraction of rows assigned to the validation set. The rows left over
        after the training and validation slices form the test set.
    target : str
        Name of the column used as Y; every other column goes into X.
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value (e.g. an int)
        to get the same split on every run; the default ``None`` keeps the
        original behaviour of a different shuffle on each call.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, Y_train, X_val, Y_val, X_test, Y_test)``. X arrays have
        one column per feature, Y arrays have shape ``(rows, 1)``.

    Raises
    ------
    ValueError
        If ``target`` is not one of the dataframe's columns.
    """
    # Every column except the target is a feature of the X matrix
    columns = list(dataframe.columns)
    columns.remove(target)

    # Row positions at which the training and validation slices end
    train_idx = ceil(len(dataframe) * train_ratio)
    val_idx = ceil(len(dataframe) * (train_ratio + val_ratio))

    # Shuffle all rows; a fixed random_state makes the split repeatable
    sf = dataframe.sample(frac=1, random_state=random_state)

    # Convert once, then slice: X is already 2-D, Y becomes a column vector
    X = sf[columns].to_numpy()
    Y = sf[target].to_numpy().reshape((-1, 1))

    X_train, Y_train = X[:train_idx], Y[:train_idx]
    X_val, Y_val = X[train_idx:val_idx], Y[train_idx:val_idx]
    X_test, Y_test = X[val_idx:], Y[val_idx:]

    return X_train, Y_train, X_val, Y_val, X_test, Y_test

## Function to initialize weights

In [6]:
def initialize_weights(X):
    """Return an all-zero weight column vector with one row per column of X."""
    # X.shape[1] is the number of columns (features) of X
    return np.zeros((X.shape[1], 1))

## Function to predict the results

In [7]:
def predict(W, X):
    """Return the predictions for X, i.e. the matrix product of X and W."""
    return X @ W

## Function to denormalize a column

In [8]:
def denormalize(arr, name, coeffs=None):
    """Map min-max normalized values of column ``name`` back to their original scale.

    Applies ``x * (max - min) + min``, the inverse of min-max normalization.

    Parameters
    ----------
    arr : array-like
        Normalized values; any shape is accepted and flattened to 1-D.
    name : str
        Column whose stored 'max' and 'min' are used.
    coeffs : dict, optional
        ``{column: {'max': ..., 'min': ...}}`` mapping. When omitted, the
        notebook-level ``normalize_coeff`` is used, so the original
        two-argument call keeps working.

    Returns
    -------
    numpy.ndarray
        1-D array of denormalized values.

    Raises
    ------
    KeyError
        If ``name`` is missing from the coefficient mapping.
    """
    if coeffs is None:
        coeffs = normalize_coeff

    col_min = coeffs[name]['min']
    col_max = coeffs[name]['max']

    # Plain broadcasting handles the whole array at once; no per-axis helper needed
    values = np.array(arr).flatten()
    return values * (col_max - col_min) + col_min

## Functions to calculate Error and Accuracy
### Metric used for error -> MSE (halved: the denominator below is $2m$, not $m$)
## $ MSE = \frac {\sum_{i=1}^{m}{(y_i - \hat y_i)^2}} {2m} $

In [9]:
def calculate_error(y, y_hat):
    """Return the sum of squared residuals divided by twice the number of targets.

    Both arrays are flattened before they are compared, so column vectors and
    1-D arrays are treated alike.
    """
    actual = y.flatten()
    residuals = actual - y_hat.flatten()
    # Denominator is 2m, where m is the number of target values
    return np.sum(np.square(residuals)) / (2 * actual.size)

### Metric used for accuracy -> $ R^2 $
## $ R^2 = 1- \frac {2 \times MSE} {Var(Y)} $

In [10]:
def calculate_accuracy(y, y_hat):
    """Return the R^2 score of predictions ``y_hat`` against targets ``y``.

    Computed as ``1 - 2 * calculate_error(y, y_hat) / var(y)`` on the
    flattened arrays.
    """
    actual = y.flatten()
    predicted = y_hat.flatten()
    # calculate_error divides by 2m, so doubling it gives the mean squared residual
    error_ratio = 2 * calculate_error(actual, predicted) / np.var(actual)
    return 1 - error_ratio