In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw startup data from the CSV file in the working directory.
datastartups = pd.read_csv("50_Startups.csv")

# Print the frame's schema summary (columns, non-null counts, dtypes) as a
# quick sanity check on the load.
datastartups.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [3]:
# A notebook cell only renders its final expression, so the preview rows are
# shown explicitly with display(); a bare head() call here would be discarded.
display(datastartups.head())

# Summary statistics of the numeric columns, rendered as the cell's output.
datastartups.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


# Data preprocessing

In [4]:

# Replace the column names that contain special characters or spaces with
# plain identifiers.
column_renames = {"R&D Spend": "RnDspent", "Marketing Spend": "MarketingSpent"}
datastartups = datastartups.rename(columns=column_renames)

# Label-encode State: each distinct state name is mapped to an integer code.
stateencoder = LabelEncoder()
state_codes = stateencoder.fit_transform(datastartups["State"])
datastartups["State"] = state_codes

# Confirm the new column names and that State is now an integer column.
datastartups.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   RnDspent        50 non-null     float64
 1   Administration  50 non-null     float64
 2   MarketingSpent  50 non-null     float64
 3   State           50 non-null     int32  
 4   Profit          50 non-null     float64
dtypes: float64(4), int32(1)
memory usage: 1.9 KB


# Data Transformation - Scaling

In [5]:
# Min-max scale every column (features and target) into the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down, so the held-out rows influence the scaling.
scaler = MinMaxScaler()
scaled_columns = ["RnDspent", "Administration", "MarketingSpent", "State", "Profit"]
scaled_values = scaler.fit_transform(datastartups)
datastartups[scaled_columns] = scaled_values

# Check the result: each column should now have min 0 and max 1.
datastartups.describe()

Unnamed: 0,RnDspent,Administration,MarketingSpent,State,Profit
count,50.0,50.0,50.0,50.0,50.0
mean,0.445854,0.533345,0.447292,0.5,0.548097
std,0.277608,0.213286,0.259208,0.416497,0.226974
min,0.0,0.0,0.0,0.0,0.0
25%,0.241527,0.39926,0.274066,0.0,0.42492
50%,0.441799,0.543661,0.450876,0.5,0.525378
75%,0.614474,0.712221,0.634759,1.0,0.704383
max,1.0,1.0,1.0,1.0,1.0


In [6]:

# Separate the feature columns (every column except the last one) from the
# target (column index 4, Profit).
x = datastartups.iloc[:, :-1]
y = datastartups.iloc[:, 4]

# Hold out 30% of the rows for testing; the fixed seed keeps the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Create the linear regression model and fit it on the training rows
# (fit() returns the estimator itself).
startModel = LinearRegression().fit(xtrain, ytrain)

# Predict the target for both partitions.
ytrainPredict = startModel.predict(xtrain)
ytestPredict = startModel.predict(xtest)

# Evaluate: compute R^2 and mean squared error on each partition first,
# then report them in a fixed order.
r2scoretrain = r2_score(ytrain, ytrainPredict)
meansquarederrortrain = mean_squared_error(ytrain, ytrainPredict)

r2scoretest = r2_score(ytest, ytestPredict)
meansquarederrortest = mean_squared_error(ytest, ytestPredict)

print("Coefficient of determination r2Score-train =", r2scoretrain)
print("Coefficient of determination r2Score-test =", r2scoretest)
print("Mean Squared Error for train =", meansquarederrortrain)
print("Mean Squared Error for test =", meansquarederrortest)

Coefficient of determination r2Score-train = 0.9515496105627431
Coefficient of determination r2Score-test = 0.9355139722149945
Mean Squared Error for train = 0.0028425081883885405
Mean Squared Error for test = 0.0019738493470693383


# Model performance Before Scaling the dataset

Coefficient of determination r2Score-train = 0.9515496105627431
Coefficient of determination r2Score-test = 0.9355139722149947
Mean Squared Error for train = 89637953.1399842
Mean Squared Error for test = 62244962.389464445

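Before scaling, the coefficient of determination was good in both training and testing: a value of about 0.95 on the training set (about 0.94 on the test set) means the model explains roughly 95% of the variance in the data.
But there is a problem with the loss function: for a well-performing model the loss should be close to zero, yet here the MSE is very high (about 8.96e7 on train and 6.22e7 on test). Note that MSE is measured in squared units of the target (Profit), so its magnitude depends on the scale of the data; taking the square root gives an RMSE of roughly 9,468 (train) and 7,890 (test) in the original Profit units.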

# Model performance after scaling the dataset

Coefficient of determination r2Score-train = 0.9515496105627431
Coefficient of determination r2Score-test = 0.9355139722149945
Mean Squared Error for train = 0.0028425081883885405
Mean Squared Error for test = 0.0019738493470693383

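After scaling, the coefficient of determination remains good: it is still about 0.95 on the training set (about 0.94 on the test set), so the model still explains roughly 95% of the variance in the data.
As for the loss function, which should be close to zero for a well-performing model: after transforming the data the MSE has dropped to about 0.0028 for train and 0.0020 for test, which is quite good. Because the target (Profit) was also rescaled to [0, 1], this MSE is expressed in squared scaled units; the unchanged R² shows that the quality of the fit itself is the same as before scaling.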

So we can say that, after scaling, our model performs very well on both the training and test sets and also explains the variance with a very low residual error, which is our goal when building a good model.
