In [41]:
# Importing Essential Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [242]:
# Read the startups dataset from its CSV file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="50_Startups.csv")

In [43]:
# "State" holds categorical (text) values, which have to be turned into
# numbers before they can take part in the matrix arithmetic further down.
# Listing the distinct values shows which categories need an encoding.
df.loc[:, "State"].unique()

array(['New York', 'California', 'Florida'], dtype=object)

In [240]:
# Encode the categorical "State" column as integer codes.
#
# The mapping is applied to the "State" column only, instead of a frame-wide
# replace that would also rewrite matching values in any other column, and the
# result is cast to int64 explicitly rather than relying on pandas to downcast
# an object column implicitly (that implicit downcasting is deprecated in
# recent pandas releases).
STATE_CODES = {
    "New York": 0,
    "California": 1,
    "Florida": 2,
}

# STATE_CODES.get(state, state) leaves values that are already encoded
# untouched, so re-running this cell is harmless. The astype call raises if a
# value cannot be converted to an integer (e.g. an unexpected state name),
# instead of silently leaving text in a column that is used numerically later.
df = df.assign(
    State=df["State"].map(lambda state: STATE_CODES.get(state, state)).astype("int64")
)
df.head(10)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,0,192261.83
1,162597.7,151377.59,443898.53,1,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,0,182901.99
4,142107.34,91391.77,366168.42,2,166187.94
5,131876.9,99814.71,362861.36,0,156991.12
6,134615.46,147198.87,127716.82,1,156122.51
7,130298.13,145530.06,323876.68,2,155752.6
8,120542.52,148718.95,311613.29,0,152211.77
9,123334.88,108679.17,304981.62,1,149759.96


In [48]:
# Pull the DataFrame contents out as a plain NumPy array.
data = df.to_numpy()

In [50]:
# Split the array into model inputs and the value to be predicted.
# The last column (Profit) is the target; every other column is a feature.

# X is transposed so that each column is one sample (features x samples).
# Y is kept as a single-column matrix with one row per sample, which is the
# layout the matrix products below expect.
X = data[:, :-1].transpose()
Y = data[:, -1:]

In [51]:
# Show the dimensions of X and then Y, as a sanity check before any
# matrix multiplication is attempted.
for matrix in (X, Y):
    print(matrix.shape)

(4, 50)
(50, 1)


In [52]:
# Put a row of ones on top of the feature rows so that the intercept (θ0) is
# learned together with the feature slopes (θ1 ... θn).
#
# X is rebuilt from `data` instead of from X itself: stacking onto the existing
# X would add one more row of ones every time this cell is re-run.
X = np.vstack((np.ones(data.shape[0]), data[:, :-1].T))
print(X.shape)

(5, 50)


In [243]:
# Linear regression fitted with batch gradient descent.
# Takes the feature matrix and the target values and returns the learned
# parameters (one per row of X) as a column vector.


def model(X,Y,LearningRate,Iteration,log_every=100):
    """Fit linear-regression parameters with batch gradient descent.

    Parameters
    ----------
    X : array-like, shape (n_parameters, m)
        One column per sample and one row per parameter. To learn an
        intercept the caller must include a row of ones.
    Y : array-like, shape (m, 1) or (m,)
        Target value for each sample. It is reshaped to a column vector.
    LearningRate : float
        Multiplier applied to the gradient term on every update.
    Iteration : int
        Number of gradient-descent updates to perform.
    log_every : int, optional
        A progress line is printed whenever the iteration index is a multiple
        of this value (default 100). Pass 0 to print no progress lines.

    Returns
    -------
    numpy.ndarray, shape (n_parameters, 1)
        The parameters after ``Iteration`` updates, starting from all zeros.

    Raises
    ------
    ValueError
        If there are no samples, or if the number of columns of X differs
        from the number of target values.

    Notes
    -----
    The update term is scaled by 1/(2m). The exact gradient of the
    least-squares cost (1/(2m)) * sum((y_pred - Y)**2) carries a factor of
    1/m, so the effective step size is LearningRate / 2. The factor is kept
    as it is so that learning rates tuned against this function behave the
    same as before.
    """
    X = np.asarray(X, dtype=float)
    # Force a column vector: a 1-D Y would otherwise broadcast against the
    # (m, 1) predictions and silently produce an (m, m) residual matrix.
    Y = np.asarray(Y, dtype=float).reshape(-1, 1)

    # m is the number of samples.
    m = Y.shape[0]
    if m == 0:
        raise ValueError("Y must contain at least one sample")
    if X.ndim != 2 or X.shape[1] != m:
        raise ValueError(
            f"X must have shape (n_parameters, {m}) to match Y, got {X.shape}"
        )

    # Initial guess: every parameter starts at 0.
    theta = np.zeros((X.shape[0], 1))

    for i in range(Iteration):
        # Predictions for the current parameters; X is transposed so that the
        # product has one row per sample.
        y_pred = np.dot(X.T, theta)

        # Scaled gradient of the squared-error cost with respect to theta.
        d_theta = 1/(2*m)*np.dot(X, y_pred - Y)

        # LearningRate*d_theta is the step taken against the gradient.
        theta = theta - LearningRate*d_theta

        # Progress report; shows the real iteration count, not a fixed 1000.
        if log_every and i % log_every == 0:
            print(f"Out of {Iteration} Iterations ,{i} over")

    print("Model learning completed now we can predict values..")
    return theta

In [244]:
# Run gradient descent for 1000 iterations with a learning rate of 4e-11,
# then display the learned parameter vector.
theta = model(X, Y, LearningRate=4e-11, Iteration=1000)
theta

Out of 1000 Iterations ,0 over
Out of 1000 Iterations ,100 over
Out of 1000 Iterations ,200 over
Out of 1000 Iterations ,300 over
Out of 1000 Iterations ,400 over
Out of 1000 Iterations ,500 over
Out of 1000 Iterations ,600 over
Out of 1000 Iterations ,700 over
Out of 1000 Iterations ,800 over
Out of 1000 Iterations ,900 over
Model learning completed now we can predict values..


array([[4.07121265e-05],
       [7.17952592e-01],
       [3.27694732e-01],
       [8.21795432e-02],
       [2.67166030e-05]])

In [249]:
# Compare the known target values ("Old") with the model's predictions ("New")
# for the first 20 samples, both rounded to two decimals.

print("Old          New")
for row in range(20):
    actual = Y[row, 0]
    # Column `row` of X holds that sample's inputs (including the leading 1).
    predicted = np.dot(X[:, row], theta)[0]
    print(f"{actual.round(2)}    {predicted.round(2)}")

Old          New
192261.83    202344.58
191792.06    202822.46
191050.39    176832.47
182901.99    174031.86
166187.94    162066.49
156991.12    157209.9
156122.51    155379.52
155752.6    167853.35
152211.77    160886.47
149759.96    149225.44
146121.95    128242.21
144259.4    122880.89
141585.52    129643.63
134307.35    131211.08
132602.65    158493.44
129917.04    143916.07
126992.93    117580.43
125370.37    138722.34
124266.9    127522.71
122776.86    112351.01
