In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression

In [2]:
# Read the four numeric columns, divide every value by 10,000 and round to whole numbers.
kept_columns = ['R&D Spend', 'Administration', 'Marketing Spend', 'Profit']
df = (pd.read_csv('50_Startups.csv')[kept_columns] / 10000).round()
# Seed NumPy's global RNG; the df.sample call in a later cell draws from it.
np.random.seed(9)


In [5]:
# Keep five rows for the worked example.
# random_state=9 (the same value NumPy's global RNG is seeded with earlier) pins the
# draw to this call, so it no longer depends on what else consumed the global RNG.
# NOTE: df is overwritten here, so re-running this cell samples from the five rows
# already kept rather than from the full dataset.
df = df.sample(5, random_state=9)
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
37,4.0,5.0,20.0,9.0
14,12.0,16.0,26.0,13.0
44,2.0,15.0,3.0,7.0
21,8.0,15.0,30.0,11.0
2,15.0,10.0,41.0,19.0


In [6]:
# Drop the last column (Profit) so only the three spend columns remain.
# .copy() gives an independent frame; assigning into the bare iloc slice is what
# triggers pandas' SettingWithCopyWarning in the cell that injects NaNs.
df = df.iloc[:, :-1].copy()
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,12.0,16.0,26.0
44,2.0,15.0,3.0
21,8.0,15.0,30.0
2,15.0,10.0,41.0


In [7]:
# Blank out one cell per column, each in a different row, to create the missing
# values that the rest of the notebook imputes.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df.iloc[1, 0] = np.nan    # R&D Spend, row position 1
df.iloc[3, 1] = np.nan    # Administration, row position 3
df.iloc[-1, -1] = np.nan  # last column, last row

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[1,0] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[3,1] = np.NaN
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.iloc[-1,-1] = np.NaN


In [8]:
# Show the five-row sample with the three missing values injected.
df

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,,16.0,26.0
44,2.0,15.0,3.0
21,8.0,,30.0
2,15.0,10.0,


In [9]:
# Step 1 (iteration 0): fill each column's missing entry with that column's mean.
# The mean is computed by pandas over the non-missing values of the column.
feature_columns = ['R&D Spend', 'Administration', 'Marketing Spend']

df0 = pd.DataFrame(
    {name: df[name].fillna(df[name].mean()) for name in feature_columns}
)

In [11]:
# Frame after iteration 0: every gap holds its column mean; no regression has been applied yet.
df0

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,7.25,16.0,26.0
44,2.0,15.0,3.0
21,8.0,11.5,30.0
2,15.0,10.0,19.75


In [12]:
# Iteration 1 starts from a copy of the mean-imputed frame, leaving df0 intact for comparison.
df1 = df0.copy()

# Re-open the R&D Spend gap (row position 1) so it can be predicted from the other two columns.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df1.iloc[1, 0] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,,16.0,26.0
44,2.0,15.0,3.0
21,8.0,11.5,30.0
2,15.0,10.0,19.75


In [13]:
# Training features for the R&D Spend model: the four rows where R&D Spend is
# present (every position except 1), columns Administration and Marketing Spend.
# Row position 1 is held out; it is the row whose R&D Spend gets predicted.
rows_with_target = [0, 2, 3, 4]
X = df1.iloc[rows_with_target, 1:3]
X

Unnamed: 0,Administration,Marketing Spend
37,5.0,20.0
44,15.0,3.0
21,11.5,30.0
2,10.0,19.75


In [14]:
# Training target: R&D Spend (column position 0) for row positions 0, 2, 3 and 4.
y = df1['R&D Spend'].iloc[[0, 2, 3, 4]]
y

37     4.0
44     2.0
21     8.0
2     15.0
Name: R&D Spend, dtype: float64

In [15]:
# Fit R&D Spend on Administration and Marketing Spend, then predict the held-out row.
lr = LinearRegression()
lr.fit(X, y)
# Select the row with a list indexer so it stays a one-row DataFrame carrying the
# same column names the model was fitted with, instead of a bare array reshaped
# to a hard-coded (1, 2).
lr.predict(df1.iloc[[1], 1:])



array([10.65431182])

In [18]:
# Store the regression prediction (10.65431182, hand-rounded to two decimals)
# in the R&D Spend gap at row position 1.
df1.iat[1, 0] = 10.65

In [19]:
# df1 after the R&D Spend gap has been filled with the regression prediction.
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.65,16.0,26.0
44,2.0,15.0,3.0
21,8.0,11.5,30.0
2,15.0,10.0,19.75


In [20]:
# Re-open the Administration gap (row position 3) so it can be predicted next.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.

df1.iloc[3, 1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.65,16.0,26.0
44,2.0,15.0,3.0
21,8.0,,30.0
2,15.0,10.0,19.75


In [21]:
# Training features for the Administration model: row positions 0, 1, 2 and 4
# (every row except position 3, which is being imputed), columns R&D Spend and
# Marketing Spend.
feature_positions = [0, 2]
X = df1.iloc[[0, 1, 2, 4], feature_positions]
X

Unnamed: 0,R&D Spend,Marketing Spend
37,4.0,20.0
14,10.65,26.0
44,2.0,3.0
2,15.0,19.75


In [22]:
# Training target: Administration (column position 1) for row positions 0, 1, 2 and 4.
y = df1['Administration'].iloc[[0, 1, 2, 4]]
y

37     5.0
14    16.0
44    15.0
2     10.0
Name: Administration, dtype: float64

In [23]:
# Fit Administration on R&D Spend and Marketing Spend, then predict the held-out row.
lr = LinearRegression()
lr.fit(X, y)
# A list row indexer keeps the result a one-row DataFrame with the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df1.iloc[[3], [0, 2]])



array([8.59936783])

In [24]:
# Store the regression prediction (8.59936783, hand-rounded) in the Administration
# gap at row position 3.
df1.iat[3, 1] = 8.6

In [25]:
# df1 after the Administration gap has been filled with the regression prediction.
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.65,16.0,26.0
44,2.0,15.0,3.0
21,8.0,8.6,30.0
2,15.0,10.0,19.75


In [26]:
# Re-open the gap in the last column (Marketing Spend, row position 4) so it can be predicted next.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df1.iloc[4, -1] = np.nan

df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.65,16.0,26.0
44,2.0,15.0,3.0
21,8.0,8.6,30.0
2,15.0,10.0,


In [27]:
# Training features for the Marketing Spend model: the first four rows (the last
# row is the one being imputed), columns R&D Spend and Administration.
X = df1.iloc[:4, [0, 1]]
X

Unnamed: 0,R&D Spend,Administration
37,4.0,5.0
14,10.65,16.0
44,2.0,15.0
21,8.0,8.6


In [28]:
# Training target: the last column (Marketing Spend) for the first four rows.
y = df1['Marketing Spend'].iloc[:4]
y

37    20.0
14    26.0
44     3.0
21    30.0
Name: Marketing Spend, dtype: float64

In [29]:
# Fit Marketing Spend on R&D Spend and Administration, then predict the held-out last row.
lr = LinearRegression()
lr.fit(X, y)
# A list row indexer keeps the result a one-row DataFrame with the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df1.iloc[[4], 0:2])



array([47.24400526])

In [30]:
# Store the regression prediction (47.24400526, hand-rounded to two decimals) in
# the Marketing Spend gap: row position 4, column position 2 (the last of three columns).
df1.iat[4, 2] = 47.24

In [31]:
# Result of iteration 1: all three gaps now hold regression predictions.
df1

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.65,16.0,26.0
44,2.0,15.0,3.0
21,8.0,8.6,30.0
2,15.0,10.0,47.24


In [32]:
# Element-wise change from iteration 0 to iteration 1. df1 began as a copy of df0,
# so only the three imputed cells can differ from zero.

df1.sub(df0)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,0.0,0.0,0.0
14,3.4,0.0,0.0
44,0.0,0.0,0.0
21,0.0,-2.9,0.0
2,0.0,0.0,27.49


In [33]:
# Iteration 2 starts from a copy of the iteration-1 result, leaving df1 intact for comparison.
df2 = df1.copy()

# Re-open the R&D Spend gap (row position 1).
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df2.iloc[1, 0] = np.nan

df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,,16.0,26.0
44,2.0,15.0,3.0
21,8.0,8.6,30.0
2,15.0,10.0,47.24


In [34]:
# R&D Spend model for iteration 2: train on row positions 0, 2, 3, 4 using
# Administration and Marketing Spend as features.
X = df2.iloc[[0, 2, 3, 4], 1:3]
y = df2.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df2.iloc[[1], 1:])



array([9.97967398])

In [35]:
# Store the prediction in the R&D Spend gap at row position 1.
# NOTE(review): the model returned 9.97967398; this cell stores 10, whereas the
# other hand-copied values keep two decimals (9.98) - confirm this is intended.
df2.iat[1, 0] = 10

In [36]:
# df2 after the R&D Spend gap has been refilled in iteration 2.
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.0,16.0,26.0
44,2.0,15.0,3.0
21,8.0,8.6,30.0
2,15.0,10.0,47.24


In [37]:
# Re-open the Administration gap (row position 3), then model it from the other
# two columns using row positions 0, 1, 2 and 4.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df2.iloc[3, 1] = np.nan
X = df2.iloc[[0, 1, 2, 4], [0, 2]]
y = df2.iloc[[0, 1, 2, 4], 1]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df2.iloc[[3], [0, 2]])



array([6.72940507])

In [38]:
# Store the regression prediction (6.72940507, hand-rounded to two decimals) in
# the Administration gap at row position 3.
df2.iat[3, 1] = 6.73

In [39]:
# df2 after the Administration gap has been refilled in iteration 2.
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.0,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.73,30.0
2,15.0,10.0,47.24


In [40]:
# Re-open the gap in the last column (Marketing Spend, row position 4), then model
# it from R&D Spend and Administration using the first four rows.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df2.iloc[4, -1] = np.nan

X = df2.iloc[0:4, 0:2]
y = df2.iloc[0:4, -1]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df2.iloc[[4], 0:2])



array([47.52494696])

In [41]:
# Store the regression prediction (47.52494696, hand-rounded to two decimals) in
# the Marketing Spend gap: row position 4, column position 2 (the last of three columns).
df2.iat[4, 2] = 47.52

In [42]:
# Result of iteration 2: all three gaps refilled by regression.
df2

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.0,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.73,30.0
2,15.0,10.0,47.52


In [43]:
# Element-wise change from iteration 1 to iteration 2; only the imputed cells can be non-zero.
df2.sub(df1)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,0.0,0.0,0.0
14,-0.65,0.0,0.0
44,0.0,0.0,0.0
21,0.0,-1.87,0.0
2,0.0,0.0,0.28


In [44]:
# Iteration 3 starts from a copy of the iteration-2 result, leaving df2 intact for comparison.
df3 = df2.copy()

# Re-open the R&D Spend gap (row position 1).
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df3.iloc[1, 0] = np.nan

df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.73,30.0
2,15.0,10.0,47.52


In [45]:
# R&D Spend model for iteration 3: train on row positions 0, 2, 3, 4 using
# Administration and Marketing Spend as features.
X = df3.iloc[[0, 2, 3, 4], 1:3]
y = df3.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df3.iloc[[1], 1:])



array([10.02774441])

In [46]:
# Store the prediction in the R&D Spend gap at row position 1.
# NOTE(review): the model returned 10.02774441, which rounds to 10.03 at two
# decimals; 10.02 is a truncation - confirm whether that is intended.
df3.iat[1, 0] = 10.02

In [47]:
# df3 after the R&D Spend gap has been refilled in iteration 3.
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.02,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.73,30.0
2,15.0,10.0,47.52


In [48]:
# Re-open the Administration gap (row position 3), then model it from the other
# two columns using row positions 0, 1, 2 and 4.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df3.iloc[3, 1] = np.nan
X = df3.iloc[[0, 1, 2, 4], [0, 2]]
y = df3.iloc[[0, 1, 2, 4], 1]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df3.iloc[[3], [0, 2]])



array([6.82447245])

In [50]:
# Store the prediction in the Administration gap at row position 3.
# NOTE(review): the model returned 6.82447245, which rounds to 6.82 at two
# decimals, not 6.83 - confirm the hand-copied value.
df3.iat[3, 1] = 6.83

In [51]:
# df3 after the Administration gap has been refilled in iteration 3.
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.02,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.83,30.0
2,15.0,10.0,47.52


In [52]:
# Re-open the gap in the last column (Marketing Spend, row position 4), then model
# it from R&D Spend and Administration using the first four rows.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df3.iloc[4, -1] = np.nan

X = df3.iloc[0:4, 0:2]
y = df3.iloc[0:4, -1]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df3.iloc[[4], 0:2])



array([47.55171878])

In [53]:
# Store the regression prediction (47.55171878, hand-rounded to two decimals) in
# the Marketing Spend gap: row position 4, column position 2 (the last of three columns).
df3.iat[4, 2] = 47.55

In [55]:
# Result of iteration 3: all three gaps refilled by regression.
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.02,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.83,30.0
2,15.0,10.0,47.55


In [56]:
# Element-wise change from iteration 2 to iteration 3; only the imputed cells can be non-zero.
df3.sub(df2)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,0.0,0.0,0.0
14,0.02,0.0,0.0
44,0.0,0.0,0.0
21,0.0,0.1,0.0
2,0.0,0.0,0.03


In [57]:
# Iteration 4 starts from a copy of the iteration-3 result, leaving df3 intact for comparison.
df4 = df3.copy()

# Re-open the R&D Spend gap (row position 1).
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df4.iloc[1, 0] = np.nan

df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.83,30.0
2,15.0,10.0,47.55


In [58]:
# R&D Spend model for iteration 4: train on row positions 0, 2, 3, 4 using
# Administration and Marketing Spend as features.
X = df4.iloc[[0, 2, 3, 4], 1:3]
y = df4.iloc[[0, 2, 3, 4], 0]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df4.iloc[[1], 1:])



array([10.02187907])

In [63]:
# Store the regression prediction (10.02187907, hand-rounded to two decimals) in
# the R&D Spend gap at row position 1.
df4.iat[1, 0] = 10.02

In [64]:
# df4 after the R&D Spend gap has been refilled in iteration 4.
df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.02,16.0,26.0
44,2.0,15.0,3.0
21,8.0,,30.0
2,15.0,10.0,47.55


In [65]:
# Re-open the Administration gap (row position 3), then model it from the other
# two columns using row positions 0, 1, 2 and 4.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df4.iloc[3, 1] = np.nan
X = df4.iloc[[0, 1, 2, 4], [0, 2]]
y = df4.iloc[[0, 1, 2, 4], 1]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df4.iloc[[3], [0, 2]])



array([6.83465304])

In [66]:
# Store the regression prediction (6.83465304, hand-rounded to two decimals) in
# the Administration gap at row position 3.
df4.iat[3, 1] = 6.83

In [67]:
# Re-open the gap in the last column (Marketing Spend, row position 4), then model
# it from R&D Spend and Administration using the first four rows.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
df4.iloc[4, -1] = np.nan

X = df4.iloc[0:4, 0:2]
y = df4.iloc[0:4, -1]

lr = LinearRegression()
lr.fit(X, y)
# Predict the held-out row as a one-row DataFrame so it keeps the fitted column
# names, rather than an unnamed array reshaped to a hard-coded (1, 2).
lr.predict(df4.iloc[[4], 0:2])



array([47.55171878])

In [68]:
# Store the regression prediction (47.55171878, hand-rounded to two decimals) in
# the Marketing Spend gap: row position 4, column position 2 (the last of three columns).
df4.iat[4, 2] = 47.55

In [69]:
# Result of iteration 4: all three gaps refilled by regression.
df4

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.02,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.83,30.0
2,15.0,10.0,47.55


In [70]:
# Iteration 3 result shown again for comparison with df4.
df3

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,4.0,5.0,20.0
14,10.02,16.0,26.0
44,2.0,15.0,3.0
21,8.0,6.83,30.0
2,15.0,10.0,47.55


In [71]:
# Element-wise change from iteration 3 to iteration 4; an all-zero frame means the
# stored imputed values did not change between the two iterations.
df4.sub(df3)

Unnamed: 0,R&D Spend,Administration,Marketing Spend
37,0.0,0.0,0.0
14,0.0,0.0,0.0
44,0.0,0.0,0.0
21,0.0,0.0,0.0
2,0.0,0.0,0.0
