A venture capital firm has hired me for a model-building assignment. Our goal is to build a model that predicts a company's profit from its spending pattern and its location.

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the startup dataset; the path is resolved relative to the working directory
DATASET_PATH = '50_Startups.csv'
data = pd.read_csv(DATASET_PATH)

In [3]:
# Summarise the frame: column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
# Preview the first five rows (head() defaults to n=5)
data.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [8]:
# Deal with categorical data
#
# One-hot encode 'State' and place the indicator columns ahead of the numeric
# columns. dtype=int pins the indicators to 0/1 integers: newer pandas releases
# default get_dummies to boolean columns, which would display True/False and
# turn the mixed frame's `.values` into an object array instead of a float one.
# The numeric columns are picked by name rather than by position so the
# selection does not silently shift if the CSV column order changes.
# NOTE: all three state indicators are kept (they sum to 1 on every row); the
# later feature/label split and the prediction cells rely on this layout of
# 3 indicators + 3 spend columns + Profit.

stateDummies = pd.get_dummies(data['State'], dtype=int)
numericColumns = data[['R&D Spend', 'Administration', 'Marketing Spend', 'Profit']]
finalDataset = pd.concat([stateDummies, numericColumns], axis=1)
finalDataset.head()

Unnamed: 0,California,Florida,New York,R&D Spend,Administration,Marketing Spend,Profit
0,0,0,1,165349.2,136897.8,471784.1,192261.83
1,1,0,0,162597.7,151377.59,443898.53,191792.06
2,0,1,0,153441.51,101145.55,407934.54,191050.39
3,0,0,1,144372.41,118671.85,383199.62,182901.99
4,0,1,0,142107.34,91391.77,366168.42,166187.94


**Inferential statistics approach starts...**

In [10]:
# Separate the data into features and label.
# Positions 0-5 of finalDataset are the predictors; position 6 is the target.
featureColumns = list(range(6))
labelColumns = [6]

features = finalDataset.iloc[:, featureColumns].values
# A list indexer keeps the label two-dimensional (one column).
label = finalDataset.iloc[:, labelColumns].values

In [11]:
# Hold out part of the data for evaluation:
#   - the train split is used to fit the model
#   - the test split is used to evaluate it
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.2   # share of rows reserved for the test split
SPLIT_SEED = 10       # fixed seed so the split is reproducible

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [20]:
# Number of array dimensions of the training feature matrix
X_train.ndim

2

In [12]:
# Fit a linear regression on the training split
from sklearn.linear_model import LinearRegression

modelProfitPredictor = LinearRegression()
modelProfitPredictor.fit(X=X_train, y=y_train)

In [14]:
#
# Model evaluation
#
# score() on a LinearRegression reports R^2 (coefficient of determination),
# computed here once on the training split and once on the held-out split.
#
# Acceptance rule used in this notebook:
#   testScore > trainScore  -> taken as a sign the model generalizes, i.e. it
#                              does as well on unseen data as on the data it
#                              was fitted on
#   testScore >= CL         -> taken as meeting the SL criterion
# with SL = 0.05 and CL = 1 - SL = 0.95.
#

SL = 0.05
CL = 1 - SL

trainScore = modelProfitPredictor.score(X_train, y_train)
testScore = modelProfitPredictor.score(X_test, y_test)

# Both conditions must hold for the model to be approved.
generalizes = testScore > trainScore
meetsThreshold = testScore >= CL
verdict = "Model Approved" if generalizes and meetsThreshold else "Model Rejected"
print(verdict)

scoreReport = "\n\n TestScore is {} and TrainScore is {} "
print(scoreReport.format(testScore, trainScore))

Model Approved


 TestScore is 0.9901105113397705 and TrainScore is 0.9385918220043519 


In [16]:
#
# App Example
#
# Collect the three spend figures and the state from the user, build a single
# feature row in the same column layout the model was trained on
# (state indicators first, then R&D, Administration, Marketing), and predict.
#

rdSpend = float(input("Enter RD Spend: "))
admSpend = float(input("Enter Admin Spend: "))
markSpend = float(input("Enter Marketing Spend: "))
# Trim stray whitespace so an entry such as "California " is still recognised.
state = input("Enter State: ").strip()

# Order must match the state indicator columns of the training data.
approvedState = ['California','Florida', 'New York']

# Input validation: only states present in the training data can be encoded.
if state in approvedState:
  # One-hot encode by position instead of a hand-written row per state.
  stateIndicators = [1 if known == state else 0 for known in approvedState]
  finalFeatureSet = np.array([stateIndicators + [rdSpend, admSpend, markSpend]])

  profit = modelProfitPredictor.predict(finalFeatureSet)
  # predict() returns an array; show the single predicted value, not the array repr.
  print("Model Predicted profit of $ ", float(np.ravel(profit)[0]))
else:
  print("Model doesnt identify {} state, thus can't do prediction !".format(state))


Enter RD Spend: 343443
Enter Admin Spend: 533535
Enter Marketing Spend: 3222334
Enter State: California
Model Predicted profit of $  [[390783.3351815]]


In [21]:
# Batch prediction for two hand-written rows: one California, one Florida.
# Column order: California, Florida, New York, R&D Spend, Administration, Marketing Spend
sampleRows = np.array([
    [1, 0, 0, 878787, 9889, 8989],
    [0, 1, 0, 2323, 2332, 2332],
])
modelProfitPredictor.predict(sampleRows)

array([[758292.0250614 ],
       [ 52580.49311604]])