# Support vector machine - regression --->  PHASE 1 model creation. 
## PROBLEM STATEMENT: 
### Profit prediction for startups based on the investment rate in different departments

## read the dataset 

In [3]:
import pandas as pd
dataset = pd.read_csv("50_Startups.csv")
dataset

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


## convert the categorical data (state) into numerical data 
### - [using get_dummies() , drop first col - to avoid duplicates, dtype=int]

In [4]:
dataset = pd.get_dummies(dataset, drop_first=True, dtype=int)
dataset.head()   # only shows 1st 5 rows of data

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


## split input and output

In [5]:
dataset.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'Profit',
       'State_Florida', 'State_New York'],
      dtype='object')

In [6]:
independent = dataset[['R&D Spend', 'Administration', 'Marketing Spend','State_Florida', 'State_New York'  ]]
independent.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
0,165349.2,136897.8,471784.1,0,1
1,162597.7,151377.59,443898.53,0,0
2,153441.51,101145.55,407934.54,1,0
3,144372.41,118671.85,383199.62,0,1
4,142107.34,91391.77,366168.42,1,0


In [7]:
dependent = dataset[['Profit']]
dependent.head()

Unnamed: 0,Profit
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


## split train and test data 

In [8]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(independent, dependent, test_size=0.20, random_state=0)
""" 
TRAIN TEST RATIO - 80 : 20 
"""

' \nTRAIN TEST RATIO - 80 : 20 \n'

In [9]:
x_train # TRAIN INPUT 

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
33,55493.95,103057.49,214634.81,1,0
35,46014.02,85047.44,205517.64,0,1
26,75328.87,144135.98,134050.07,1,0
34,46426.07,157693.92,210797.67,0,0
18,91749.16,114175.79,294919.57,1,0
7,130298.13,145530.06,323876.68,1,0
14,119943.24,156547.42,256512.92,1,0
45,1000.23,124153.04,1903.93,0,1
48,542.05,51743.15,0.0,0,1
29,65605.48,153032.06,107138.38,0,1


In [10]:
y_train.head() # TRAIN OUTPUT 

Unnamed: 0,Profit
33,96778.92
35,96479.51
26,105733.54
34,96712.8
18,124266.9


In [11]:
x_test # TEST INPUT 

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State_Florida,State_New York
28,66051.52,182645.56,118148.2,1,0
11,100671.96,91790.61,249744.55,0,0
10,101913.08,110594.11,229160.95,1,0
41,27892.92,84710.77,164470.71,1,0
2,153441.51,101145.55,407934.54,1,0
27,72107.6,127864.55,353183.81,0,1
38,20229.59,65947.93,185265.1,0,1
31,61136.38,152701.92,88218.23,0,1
22,73994.56,122782.75,303319.26,1,0
4,142107.34,91391.77,366168.42,1,0


In [12]:
y_test.head() # TEST OUTPUT 

Unnamed: 0,Profit
28,103282.38
11,144259.4
10,146121.95
41,77798.83
2,191050.39


## standardization in SVM regression

In [18]:
from sklearn.preprocessing import StandardScaler       #also used to scale features 
# fit: finds mean and standard deviation values ,  transform: performs the calculaton and applies them in dataset

# PREPROCESSING THE "INPUT" DATA
sc = StandardScaler()
x_train = sc.fit_transform(x_train)       
x_test = sc.transform(x_test)

In [20]:
# PREPROCESSING THE "OUTPUT" DATA
scy = StandardScaler()
y_train = scy.fit_transform(y_train)
y_test = scy.transform(y_test)

## MODEL CREATION 

In [21]:
# from sklearn.svm import SVR
# """
# kernel - kernel type (linear/non linear kernels) , {‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’} or callable, default=’rbf’
# C - regularization parameter (used to increase model accurace by reducing errors in data points - when closer data points exist in dataset)
# """
# # create svr obj
# regressor = SVR(kernel="linear",C=3000)  
# # model creation
# regressor.fit(x_train,y_train)


## save the standardization (preprocessing technique) - to be used in the deployment phase

In [23]:
# or using joblib
# import joblib 
# joblib.dump(sc,'scaler.pkl')     # trained preprocessing object is saved 

# or using pickle 
# import pickle
# pickle.dump(sc,open('scaler.sav', 'wb'))

# Using Pipeline
import joblib
from sklearn.pipeline import Pipeline        # pipeline - connects preprocessing + model 
from sklearn.svm import SVR
# creating the pipeline
svr_pipeline = Pipeline([
    ("scaler", StandardScaler() ),
    ("model", SVR(kernel="linear",C=3000) ),
])

# run the pipeline + model creation
svr_pipeline.fit(x_train,y_train)
# save the pipeline
joblib.dump(svr_pipeline, 'svr_pipeline.pkl')


  y = column_or_1d(y, warn=True)


['svr_pipeline.pkl']

##  testing the model

In [24]:
y_predicted = svr_pipeline.predict(x_test)
y_predicted

array([-0.1402012 ,  0.60067208,  0.54966829, -0.96405228,  1.65249697,
        0.12861194, -1.05077401, -0.21478753,  0.05986832,  1.39587483])

## evaluation metrics

In [26]:

from sklearn.metrics import r2_score
r_score = r2_score(y_test,y_predicted)
r_score

# accuracy ( for preprocessed input + raw output) -> 0.863223093251272
# accuracy (for preprocessed input + preprocessed output -> 0.929558109611905

0.929558109611905

 # DEPLOYMENT PHASE

## PRE - PREPROCESSING THE INPUT DATA 

In [27]:
# sc.transform()  -- uses calculated value of the mean, std deviation from sc.fit (which is calculated previously in the code)
preproceed_input = sc.transform([[165349.20	,136897.80	,471784.10	,0,	1]]) 
preproceed_input

array([[1.65349200e+05, 1.36897800e+05, 4.71784100e+05, 4.99600361e-17,
        1.00000000e+00]])

##  model prediction

### prediction with preprocessed input data 

In [28]:
# predict the model
svr_pipeline.predict(preproceed_input)    # return original data

array([187329.0377399])