In [127]:
# Importing Essential Libraries
import pandas as pd
import numpy as np

In [128]:
# Load the insurance dataset from the CSV file into a DataFrame,
# then preview its first ten rows as the cell output.
df = pd.read_csv("insurance.csv")
df.head(n=10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [129]:
# Encode the categorical columns as integers, one column at a time.
#
# Each mapping is scoped to its own column, so a matching string that happens
# to appear in any other column is never rewritten by accident.
# The explicit astype guarantees integer dtypes for the encoded columns rather
# than relying on replace() to downcast object columns implicitly (behaviour
# that newer pandas releases deprecate), so df.values stays a numeric array.
# The cell is safe to re-run: once encoded, replace() finds nothing to change
# and astype is a no-op.
#
# NOTE(review): region has no natural order; the 0-3 codes are kept as in the
# original analysis, but one-hot encoding may suit a linear model better.
df = df.replace({
    "sex": {"male": 0, "female": 1},
    "smoker": {"yes": 0, "no": 1},
    "region": {
        "southwest": 0,
        "southeast": 1,
        "northeast": 2,
        "northwest": 3,
    },
}).astype({"sex": "int64", "smoker": "int64", "region": "int64"})

In [130]:
# Pull the whole DataFrame out as a 2-D NumPy array
# (one row per sample, one column per DataFrame column).
data = df.values

# Split the array into the target and the features.
#
# Target: the last column (the insurance charges we want to predict).
# It is kept as a column vector -- one row per sample, a single column,
# i.e. (1338, 1) for this dataset -- so it lines up with the matrix
# products used later on.
Y = data[:, -1].reshape(-1, 1)

# Features: every column except the last one. This dataset has 6 of them
# (age, sex, bmi, children, smoker, region), giving a (1338, 6) matrix.
X = data[:, :-1]

In [131]:
# Sanity-check the dimensions: target shape first, then feature shape,
# one per line.
print(Y.shape, X.shape, sep="\n")

(1338, 1)
(1338, 6)


In [132]:
# Transpose the features to (n_features, n_samples) and put a row of ones on top.
# The row of ones lets the constant term (θ0 below) be learned together with
# the other parameters (θn, θ(n-1), ..., θ2, θ1):
#
# y_pred = θn xn + θ(n−1) x(n−1) + θ(n−2) x(n−2) +...+ θ2 x2 + θ1x1 + θ0
#
# NOTE: this cell rebinds X. Re-running it without first re-running the cell
# that creates X stacks onto the already transposed matrix and gives a wrong shape.
intercept_row = np.ones((1, X.shape[0]))
X = np.vstack((intercept_row, X.T))
print(X.shape)
print(X)

(7, 1338)
[[ 1.  1.  1. ...  1.  1.  1.]
 [19. 18. 28. ... 18. 21. 61.]
 [ 1.  0.  0. ...  1.  1.  1.]
 ...
 [ 0.  1.  3. ...  0.  0.  0.]
 [ 0.  1.  1. ...  1.  1.  0.]
 [ 0.  1.  1. ...  1.  0.  3.]]


In [133]:
# Below is a function that takes the input features and the target values and
# returns the fitted parameters (slopes) of the respective features in matrix form.
# It uses the gradient descent algorithm.

def model(X,Y,LearningRate,Iteration):
    """Fit linear-regression parameters with batch gradient descent.

    Starting from all-zero parameters, repeatedly predicts ``X.T @ theta``,
    measures the residual against ``Y`` and moves ``theta`` one step in the
    direction that reduces the least-squares error.

    Parameters
    ----------
    X : array-like, shape (n_params, m)
        Design matrix with one column per sample and one row per parameter.
        Include a row of ones if a constant (intercept) term is wanted.
    Y : array-like, shape (m, 1) or (m,)
        Target values, one per sample. A 1-D input is treated as a column
        vector.
    LearningRate : float
        Step-size multiplier applied to the gradient on every update.
    Iteration : int
        Number of gradient-descent updates to run.

    Returns
    -------
    numpy.ndarray, shape (n_params, 1)
        The fitted parameters, in the same order as the rows of ``X``.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D or its number of columns differs from the number
        of targets in ``Y``.
    """
    # Coerce to float arrays so object-dtype input (e.g. straight from a
    # DataFrame with mixed columns) still uses fast numeric matrix products.
    X = np.asarray(X, dtype=float)
    # Force a column vector: with a 1-D Y, ``Y - y_pred`` would silently
    # broadcast to an (m, m) matrix and corrupt the shape of theta.
    Y = np.asarray(Y, dtype=float).reshape(-1, 1)

    # m is the total number of samples.
    m = Y.shape[0]
    if X.ndim != 2 or X.shape[1] != m:
        raise ValueError(
            f"X must have shape (n_params, {m}) to match Y, got {X.shape}"
        )

    # Initial guess: every parameter starts at 0 (any other start would do).
    theta = np.zeros((X.shape[0], 1))

    # The cost itself is intentionally not evaluated inside the loop; only
    # its gradient is needed for the update, which keeps each pass cheap.
    for i in range(Iteration):
        # Predictions for the current parameters (X is transposed so the
        # shapes line up: (m, n_params) @ (n_params, 1) -> (m, 1)).
        y_pred = np.dot(X.T, theta)

        # Negative gradient of the least-squares cost with respect to theta.
        d_theta = (1 / m) * np.dot(X, Y - y_pred)

        # LearningRate * d_theta is the step taken on this update.
        theta = theta + LearningRate * d_theta

        # Progress report every 10000 updates, quoting the real total.
        if i % 10000 == 0:
            print(f"Out of {Iteration} Iterations ,{i} over")

    print("completed")
    return theta

In [134]:
# The learning rate and the iteration count are hand-tuned guesses:
# after trying many different values, 0.00076 and 200000 gave the best
# results (taking run time into account as well).
# The loop runs 2 lakh (200,000) times, so this cell takes a while to finish.
theta = model(X, Y, LearningRate=0.00076, Iteration=200_000)
theta

Out of 2L Iterations ,0 over
Out of 2L Iterations ,10000 over
Out of 2L Iterations ,20000 over
Out of 2L Iterations ,30000 over
Out of 2L Iterations ,40000 over
Out of 2L Iterations ,50000 over
Out of 2L Iterations ,60000 over
Out of 2L Iterations ,70000 over
Out of 2L Iterations ,80000 over
Out of 2L Iterations ,90000 over
Out of 2L Iterations ,100000 over
Out of 2L Iterations ,110000 over
Out of 2L Iterations ,120000 over
Out of 2L Iterations ,130000 over
Out of 2L Iterations ,140000 over
Out of 2L Iterations ,150000 over
Out of 2L Iterations ,160000 over
Out of 2L Iterations ,170000 over
Out of 2L Iterations ,180000 over
Out of 2L Iterations ,190000 over
completed


array([[ 10630.28774573],
       [   258.75240926],
       [   148.64644677],
       [   337.87183805],
       [   479.3004837 ],
       [-23786.58920709],
       [   298.53854477]])

In [152]:
# Compare the recorded insurance charges ("Old") with the model's
# predictions ("New") for the first 20 samples, both rounded to 2 decimals.

print("Old         New")
charges = df["charges"]
samples = X.T  # one row per sample, bias term first
for row in range(20):
    actual = charges[row].round(2)
    predicted = np.dot(samples[row], theta)[0].round(2)
    print(f"{actual}    {predicted}")

Old         New
16884.92    25121.85
1725.55    3689.01
4449.46    6974.98
21984.47    3949.52
3866.86    5777.13
3756.62    4009.03
8240.59    10971.23
7281.51    8272.27
6406.41    8051.93
28923.14    12143.71
2721.32    2768.59
27808.73    36002.77
1826.84    4417.8
11090.72    15235.08
39611.76    32149.68
1837.24    550.94
10797.34    11923.54
2395.17    1448.64
10602.38    14950.07
36837.47    30319.74
