In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('./Datasets/50_startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.shape

(50, 5)

In [4]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [5]:
df.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [6]:
df['State'].value_counts()

California    17
New York      17
Florida       16
Name: State, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
lb = LabelEncoder()
df['State'] = lb.fit_transform(df['State'])

In [9]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [10]:
df['State'].value_counts()

0    17
2    17
1    16
Name: State, dtype: int64

In [11]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
print(x.shape)
print(y.shape)

(50, 4)
(50,)


In [12]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(40, 4)
(10, 4)
(40,)
(10,)


In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
mlr = LinearRegression()
mlr.fit(x_train,y_train)

LinearRegression()

In [15]:
print('Training score',mlr.score(x_train,y_train))
print('Testing score',mlr.score(x_test,y_test))

Training score 0.9508224450912323
Testing score 0.9248098479622888


In [16]:
y_pred = mlr.predict(x_test)
y_pred

array([156692.1468033 , 130207.92090899,  86029.94204682,  71682.68954724,
       117712.64672847, 114007.91087082,  56109.10567153,  74338.89870277,
       176049.66660558,  62421.20053035])

In [17]:
res = pd.DataFrame({'Actual_Profit':y_test,'Pred_Profit':y_pred})
res

Unnamed: 0,Actual_Profit,Pred_Profit
9,149759.96,156692.146803
18,124266.9,130207.920909
39,81005.76,86029.942047
42,71498.49,71682.689547
24,108552.04,117712.646728
27,105008.31,114007.910871
48,35673.41,56109.105672
36,90708.19,74338.898703
4,166187.94,176049.666606
44,65200.33,62421.20053


In [18]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [19]:
MSE = mean_squared_error(res['Actual_Profit'],res['Pred_Profit'])
MAE = mean_absolute_error(res['Actual_Profit'],res['Pred_Profit'])
RMSE = MSE**0.5
print('MSE',MSE)
print('MAE',MAE)
print('RMSE',RMSE)

MSE 106408537.69026642
MAE 8568.763994962861
RMSE 10315.451405065432


In [20]:
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [21]:
print(mlr.coef_)
print(mlr.intercept_)

[ 8.40414944e-01 -1.04169857e-01  1.60924785e-02  7.95390640e+02]
59452.8540062179


In [22]:
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [23]:
# Eqn of MLR
df['Pred_Price_Eqn'] = df['R&D Spend']*mlr.coef_[0] + df['Administration']*mlr.coef_[1] + \
                        df['Marketing Spend']*mlr.coef_[2] + df['State']*mlr.coef_[3] + \
                        mlr.intercept_
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,Pred_Price_Eqn
0,165349.2,136897.8,471784.1,2,192261.83,193337.125134
1,162597.7,151377.59,443898.53,0,191792.06,187476.836547
2,153441.51,101145.55,407934.54,1,191050.39,185231.142969
3,144372.41,118671.85,383199.62,2,182901.99,176180.968111
4,142107.34,91391.77,366168.42,1,166187.94,176049.666606


In [24]:
print(mlr.predict([[12345,324324,432425,1]]))
print(mlr.predict([[365732,123422,345233,2]]))

[43797.17247105]
[361109.07598235]
