In [None]:
# Use-case: A company has hired you as a DS. Your goal is to create a model
# that can predict the Profit of a startup based on its R&D Spend, Administration
# spend, Marketing Spend and State (dataset: 50_Startups.csv).

In [90]:
import numpy as np
import pandas as pd

In [91]:
# Read the startup dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer="50_Startups.csv")

In [92]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [93]:
# Keep only complete rows: any row containing a missing value is discarded.
data = data.dropna(axis=0, how="any")

In [94]:
# Label: the Profit column. Selecting with a list of names keeps the result
# two-dimensional, i.e. one row per sample and a single column.
label = data[["Profit"]].to_numpy()
label

array([[192261.83],
       [191792.06],
       [191050.39],
       [182901.99],
       [166187.94],
       [156991.12],
       [156122.51],
       [152211.77],
       [146121.95],
       [141585.52],
       [134307.35],
       [132602.65],
       [129917.04],
       [126992.93],
       [125370.37],
       [124266.9 ],
       [122776.86],
       [118474.03],
       [111313.02],
       [110352.25],
       [108733.99],
       [108552.04],
       [107404.34],
       [105733.54],
       [105008.31],
       [103282.38],
       [101004.64],
       [ 99937.59],
       [ 97483.56],
       [ 97427.84],
       [ 96778.92],
       [ 96712.8 ],
       [ 96479.51],
       [ 90708.19],
       [ 89949.14],
       [ 81229.06],
       [ 81005.76],
       [ 78239.91],
       [ 77798.83],
       [ 71498.49],
       [ 69758.98],
       [ 65200.33],
       [ 64926.08],
       [ 49490.75],
       [ 42559.73],
       [ 35673.41],
       [ 14681.4 ]])

In [97]:
# Deal with categorical data, refer to DAY3 ML
# One-hot encode the State column (column position 3) into a dense array.

from sklearn.preprocessing import OneHotEncoder

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# the old spelling was removed in 1.4; `sparse_output=False` requests a dense
# NumPy array instead of a SciPy sparse matrix.
ohe = OneHotEncoder(sparse_output=False)
dummyState = ohe.fit_transform(data.iloc[:, [3]])

print(dummyState)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]]




In [98]:
# Assemble the feature table: the three numeric spend columns followed by one
# indicator column per distinct State value.
spendColumns = data[["R&D Spend", "Administration", "Marketing Spend"]]
stateIndicators = pd.get_dummies(data["State"])
features = pd.concat([spendColumns, stateIndicators], axis=1)
features.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,California,Florida,New York
0,165349.2,136897.8,471784.1,0,0,1
1,162597.7,151377.59,443898.53,1,0,0
2,153441.51,101145.55,407934.54,0,1,0
3,144372.41,118671.85,383199.62,0,0,1
4,142107.34,91391.77,366168.42,0,1,0


In [None]:
# Rules for Regression for ANN
#=========================================================================
# 1. Data must be COMPLETE
# 2. Data must be STRICTLY NUMERIC
# 3. Features must be represented in the form of 2d numpy array
# 4. Label must be represented in the form of 2d numpy array
# 5. V-IMP POINT ------ NORMALIZING/STANDARDIZING FEATURES AND LABEL IS MANDATORY !!!!

# Guideline by Prashant Nair for Normalization rule
# ========================================================================
# 1. Features must be Normalized using StandardScaler or RobustScaler.
# 2. Label must be Normalized using MinMaxScaler with range 0,1

In [12]:
#Create Features and Label
#features = data.iloc[:,[0]].values
#label = data.iloc[:,[1]].values

In [99]:
# Feature standardization.
# The scaler learns its statistics from the whole feature table and the scaled
# array then replaces `features`, so re-running this cell would refit the
# scaler on data that is already scaled.
from sklearn.preprocessing import StandardScaler

scFeatures = StandardScaler()
scFeatures.fit(features)
features = scFeatures.transform(features)

In [100]:
# Label normalization: rescale the label into the range [0, 1].
# The scaled array replaces `label`; keep `minMaxLabel` around so predictions
# can be mapped back with inverse_transform.
from sklearn.preprocessing import MinMaxScaler

minMaxLabel = MinMaxScaler(feature_range=(0, 1))
minMaxLabel.fit(label)
label = minMaxLabel.transform(label)

In [87]:
#pickle.dump(minMaxLabel, open("newMinMax", "wb"))

In [89]:
#0.01904087
#39343.0

#minMaxLabel.inverse_transform(np.array([[1.3952032]]))

array([[1.3952032]])

In [101]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    features, label, test_size=0.2, random_state=10
)

In [None]:
#Model Building and training process for ANN

In [102]:
import tensorflow as tf
# Display the installed TensorFlow version so the run can be reproduced.
tf.__version__

'2.15.0'

# **Adding Custom Callback**

In [103]:
# SL = 0.2
# CL = 1 - SL = 0.8
#
# Setup the following condition
#
# if testScore > trainScore and testScore >= CL:
#       Stop Training and Finalize the trained model


Early Stopping Using Custom Callback function

Step1: Setup custom callback

In [104]:
class MyCLRuleCallback(tf.keras.callbacks.Callback):
  """Stop training once the validation R2 satisfies the confidence-level rule.

  After every epoch the validation R2 (``val_r2_score``) is compared with the
  training R2 (``r2_score``). Training is stopped when the validation score is
  strictly greater than the training score and is at least ``cl``.

  Args:
    cl: Minimum validation R2 that must be reached before training may stop.
  """

  def __init__(self, cl):
    super().__init__()
    self.cl = cl

  def on_epoch_end(self, epoch, logs=None):
    """Set ``self.model.stop_training`` when the stopping rule is met.

    Args:
      epoch: Index of the epoch that just finished (not used by the rule).
      logs: Mapping of metric names to values for the epoch; may be None.
    """
    # `logs` defaults to None and a metric key can be absent (for example when
    # no validation data is supplied), so look the scores up defensively
    # instead of indexing and crashing in the middle of training.
    logs = logs or {}
    testScore = logs.get("val_r2_score")
    trainScore = logs.get("r2_score")

    if testScore is None or trainScore is None:
      import warnings
      warnings.warn(
          "MyCLRuleCallback needs the 'r2_score' and 'val_r2_score' metrics; "
          "available keys: " + ", ".join(sorted(logs)),
          RuntimeWarning,
      )
      return

    if testScore > trainScore and testScore >= self.cl:
      self.model.stop_training = True

Step2: Architect Model

In [105]:
import tensorflow as tf

# Build the network in one go by handing the layer stack to Sequential.
# units       ---- number of neurons in the hidden layer or the output layer
# input_shape ---- (number of columns in the feature array,)
model = tf.keras.Sequential([
    # Input layer + H1: 6 feature columns feed 100 sigmoid units
    tf.keras.layers.Dense(units=100, input_shape=(6,), activation='sigmoid'),
    # H2
    tf.keras.layers.Dense(units=100, activation='sigmoid'),
    # H3
    tf.keras.layers.Dense(units=100, activation='sigmoid'),
    # Output layer: a single linear unit for the regression target
    tf.keras.layers.Dense(units=1, activation='linear'),
])


Step3: Compile Model

In [106]:
# Compile with SGD on mean squared error and track R2 during training.
# `metrics` is documented as a list of metrics, so wrap the single R2Score
# instance in a list rather than passing the bare object.
model.compile(
    optimizer="sgd",
    loss="mean_squared_error",
    metrics=[tf.keras.metrics.R2Score()],
)

Step4: Train Model

In [107]:
# Train with the custom stopping rule attached. The epoch count is only an
# upper bound: the callback ends the run once its condition is satisfied.
myR2ScoreMonitor = MyCLRuleCallback(cl=0.8)
model.fit(
    x=X_train,
    y=y_train,
    epochs=1000000,
    validation_data=(X_test, y_test),
    callbacks=[myR2ScoreMonitor],
)

Epoch 1/1000000
Epoch 2/1000000
Epoch 3/1000000
Epoch 4/1000000
Epoch 5/1000000
Epoch 6/1000000
Epoch 7/1000000
Epoch 8/1000000
Epoch 9/1000000
Epoch 10/1000000
Epoch 11/1000000
Epoch 12/1000000
Epoch 13/1000000
Epoch 14/1000000
Epoch 15/1000000
Epoch 16/1000000
Epoch 17/1000000
Epoch 18/1000000
Epoch 19/1000000
Epoch 20/1000000
Epoch 21/1000000
Epoch 22/1000000
Epoch 23/1000000
Epoch 24/1000000
Epoch 25/1000000
Epoch 26/1000000
Epoch 27/1000000
Epoch 28/1000000
Epoch 29/1000000
Epoch 30/1000000
Epoch 31/1000000
Epoch 32/1000000
Epoch 33/1000000
Epoch 34/1000000
Epoch 35/1000000
Epoch 36/1000000
Epoch 37/1000000
Epoch 38/1000000
Epoch 39/1000000
Epoch 40/1000000
Epoch 41/1000000
Epoch 42/1000000
Epoch 43/1000000
Epoch 44/1000000
Epoch 45/1000000
Epoch 46/1000000
Epoch 47/1000000
Epoch 48/1000000
Epoch 49/1000000
Epoch 50/1000000
Epoch 51/1000000
Epoch 52/1000000
Epoch 53/1000000
Epoch 54/1000000
Epoch 55/1000000
Epoch 56/1000000
Epoch 57/1000000
Epoch 58/1000000
Epoch 59/1000000
Epoch 

<keras.src.callbacks.History at 0x7da4cee69510>

In [108]:
# Check the quality: report the score on the training and on the test split.
# evaluate() returns the loss first; element 1 is the metric set in compile().

trainEval = model.evaluate(X_train, y_train)
print("Training Score is ", trainEval[1])
testEval = model.evaluate(X_test, y_test)
print("Testing Score is ", testEval[1])

Training Score is  0.7672538757324219
Testing Score is  0.8007076978683472


In [109]:
# Final Deploy (Save all objects)

# tf.Keras model
model.save("ProfitPredictor.h5")

import pickle

# Persist the fitted preprocessing objects next to the model. Each file is
# opened in a `with` block so the handle is flushed and closed even if
# pickling fails, instead of being left open.
with open("scFeatures", "wb") as scalerFile:
    pickle.dump(scFeatures, scalerFile)

with open("minMaxLabel", "wb") as labelScalerFile:
    pickle.dump(minMaxLabel, labelScalerFile)

with open("StateEncoderDecoder.pkl", "wb") as encoderFile:
    pickle.dump(ohe, encoderFile)

  saving_api.save_model(
