In [117]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib inline

In [118]:
df = pd.read_csv('data/Startups_Expense.csv', delimiter=',')
df.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [119]:
df.describe()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
count,50.0,50.0,50.0,50.0
mean,73721.6156,121344.6396,211025.0978,112012.6392
std,45902.256482,28017.802755,122290.310726,40306.180338
min,0.0,51283.14,0.0,14681.4
25%,39936.37,103730.875,129300.1325,90138.9025
50%,73051.08,122699.795,212716.24,107978.19
75%,101602.8,144842.18,299469.085,139765.9775
max,165349.2,182645.56,471784.1,192261.83


In [120]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [121]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [122]:
state = pd.get_dummies(df['State'], drop_first=True)
df = pd.concat([x, state], axis=1)

In [123]:
df = df.drop(columns=['State'])
df.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Florida,New York
0,165349.2,136897.8,471784.1,False,True
1,162597.7,151377.59,443898.53,False,False
2,153441.51,101145.55,407934.54,True,False
3,144372.41,118671.85,383199.62,False,True
4,142107.34,91391.77,366168.42,True,False


In [124]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

In [125]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train_scaled_data = scaler.fit_transform(x_train)
x_test_scaled_data = scaler.transform(x_test)

In [126]:
len(x_train_scaled_data)

40

In [127]:
x_train_scaled_data

array([[ 0.34202149,  0.22787678,  0.12425038,  1.36277029, -0.69388867],
       [ 1.36207849, -1.0974737 ,  1.14990688,  1.36277029, -0.69388867],
       [-0.71081297, -2.5770186 , -0.34136825, -0.73379939, -0.69388867],
       [ 0.90611438,  1.0172367 ,  0.66890185, -0.73379939,  1.44115338],
       [ 1.40997088, -0.09115403,  1.30006861, -0.73379939,  1.44115338],
       [ 1.20367103,  0.96116332, -0.95248784, -0.73379939, -0.69388867],
       [-1.05285826, -1.34392538, -0.62843389,  1.36277029, -0.69388867],
       [-1.61480906, -0.19649414,  0.54106768,  1.36277029, -0.69388867],
       [-1.642623  ,  0.52691442, -2.07854935, -0.73379939, -0.69388867],
       [ 0.77885123,  0.05437051,  0.2294954 , -0.73379939,  1.44115338],
       [ 0.96515572, -0.45976843,  0.61043134, -0.73379939, -0.69388867],
       [ 0.00687736,  0.01677049,  0.25215324, -0.73379939, -0.69388867],
       [-0.01361318, -0.80643974, -0.83912073, -0.73379939,  1.44115338],
       [-0.66099544,  1.34830937, -0.2

In [128]:
len(y_train)

40

In [129]:
y_train

12    141585.52
4     166187.94
37     89949.14
8     152211.77
3     182901.99
6     156122.51
41     77798.83
46     49490.75
47     42559.73
15    129917.04
9     149759.96
16    126992.93
24    108552.04
34     96712.80
31     97483.56
0     192261.83
44     65200.33
27    105008.31
33     96778.92
5     156991.12
29    101004.64
11    144259.40
36     90708.19
1     191792.06
21    111313.02
2     191050.39
43     69758.98
35     96479.51
23    108733.99
40     78239.91
10    146121.95
22    110352.25
18    124266.90
49     14681.40
20    118474.03
7     155752.60
42     71498.49
14    132602.65
28    103282.38
38     81229.06
Name: Profit, dtype: float64

In [133]:
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

linear = LinearRegression()
linear.fit(x_train_scaled_data,y_train)
pred_2 = linear.predict(x_test_scaled_data)


In [135]:
from sklearn.metrics import r2_score
print(r2_score(y_test,pred_2))

0.8987266414319837


In [140]:
print(pd.DataFrame(list(y_test),pred_2))

                       0
126362.879083  134307.35
84608.453836    81005.76
99677.494252    99937.59
46357.460686    64926.08
128750.482885  125370.37
50912.417419    35673.41
109741.350327  105733.54
100643.242816  107404.34
97599.275746    97427.84
113097.425244  122776.86


In [152]:
svr = SVR(kernel='linear', gamma='auto')
svr.fit(x_train_scaled_data,y_train)
pred_3 = svr.predict(x_test_scaled_data)

In [153]:
from sklearn.metrics import r2_score
print(r2_score(y_test,pred_3))

-0.17569347992313866


In [154]:
print(pd.DataFrame(list(y_test),pred_3))

                       0
109540.813588  134307.35
109476.466040   81005.76
109487.644616   99937.59
109412.192061   64926.08
109546.692535  125370.37
109399.281790   35673.41
109511.762640  105733.54
109495.636972  107404.34
109471.637588   97427.84
109476.559843  122776.86
