In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset = pd.read_csv('50_Startups.csv')
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
# This is a dataset of 50 startups company

# Data Preprocessing

In [4]:
# 1) Handling missing value
# 2) Handling outlier - Outlier only required when we have large tree to avoid overfitting
# 3) Encoding - in tree based model, always use label encoder method
# 4) Feature Scaling - there is no need to do feature scaling in Decision Tree

In [5]:
# checking missing data
dataset.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [7]:
# converting State datatype from object to int data type
dataset['State'] = dataset['State'].astype('category')
dataset['State'] = dataset['State'].cat.codes

In [8]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     int8   
 4   Profit           50 non-null     float64
dtypes: float64(4), int8(1)
memory usage: 1.7 KB


In [9]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [10]:
# Split the data into independent and dependent variables
x = dataset.iloc[:,0:-1]  # Independent
y = dataset.iloc[:,-1]  # dependent 

In [11]:
x.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State
0,165349.2,136897.8,471784.1,2
1,162597.7,151377.59,443898.53,0
2,153441.51,101145.55,407934.54,1
3,144372.41,118671.85,383199.62,2
4,142107.34,91391.77,366168.42,1


In [12]:
y.head()

0    192261.83
1    191792.06
2    191050.39
3    182901.99
4    166187.94
Name: Profit, dtype: float64

In [13]:
# Splitting the data into training and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.25, random_state=1)

# Building DecisionTree Regressor Model

In [14]:
from sklearn.tree import DecisionTreeRegressor
dtregressor =  DecisionTreeRegressor()
dtregressor.fit(x_train, y_train)

In [15]:
# Predict test data with dtregressor model
y_pred_train = dtregressor.predict(x_train)
y_pred_test = dtregressor.predict(x_test)

In [16]:
# Evaluate your model
from sklearn.metrics import r2_score

In [17]:
print("Training Accuracy :", r2_score(y_train, y_pred_train))
print()
print("Test Accuracy :", r2_score(y_test, y_pred_test))

Training Accuracy : 1.0

Test Accuracy : 0.9020012676500316


In [18]:
#Using cross validation method (K-Fold Mothod)
from sklearn.model_selection import cross_val_score
Trainging_accuracy = cross_val_score(dtregressor, x_train, y_train, cv=10)
print(Trainging_accuracy)

[  0.42443097   0.83064354  -0.34675014   0.85141857   0.98007281
   0.26438253   0.92982412   0.88695806 -21.22435468   0.9252069 ]


In [19]:
Trainging_accuracy.mean()

-1.547816732728323

# Linear Regression Model

In [20]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(x_train, y_train)

In [21]:
# Predict test data with linear model
y_pred_train_lr = linear.predict(x_train)
y_pred_test_lr = linear.predict(x_test)

In [22]:
print("Training Accuracy :", r2_score(y_train, y_pred_train_lr))
print()
print("Test Accuracy :", r2_score(y_test, y_pred_test_lr))

Training Accuracy : 0.9419507593691141

Test Accuracy : 0.9616053937220065


# RandomForestRegressor Model

In [26]:
from sklearn.ensemble import RandomForestRegressor
rf_regressor = RandomForestRegressor(n_estimators=500)
rf_regressor.fit(x_train, y_train)

In [27]:
# Predict test data with linear model
y_pred_train_rf = rf_regressor.predict(x_train)
y_pred_test_rf = rf_regressor.predict(x_test)

In [28]:
print("Training Accuracy :", r2_score(y_train, y_pred_train_rf))
print()
print("Test Accuracy :", r2_score(y_test, y_pred_test_rf))

Training Accuracy : 0.9855614361255239

Test Accuracy : 0.9393654580470462
