In [9]:
import pandas as pd
import numpy as np
  
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor  

In [10]:
# Read the 50-startups dataset from the CSV file in the working directory.
df = pd.read_csv('50_Startups.csv')
# Preview the first five rows.
df.head(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [11]:
# Print the frame summary: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [13]:
# Split the frame into the feature matrix and the regression target.
# `columns=` replaces the positional axis argument (`df.drop('Profit', 1)`),
# which emits a FutureWarning on recent pandas 1.x and raises a TypeError on
# pandas 2.x, where only `labels` may be passed positionally.
X = df.drop(columns='Profit')
y = df['Profit']

  X = df.drop('Profit', 1)


In [14]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [15]:
# First preprocessing stage: fill missing values column-wise.
# Output columns follow the transformer order: 'State' first, then the three
# numeric columns.
trf1 = ColumnTransformer(
    transformers=[
        # Categorical column: replace gaps with the most frequent value.
        ('cat', SimpleImputer(strategy='most_frequent'), ['State']),
        # Numeric columns: replace gaps with the per-column median.
        (
            'num',
            SimpleImputer(strategy='median'),
            ['R&D Spend', 'Administration', 'Marketing Spend'],
        ),
    ],
    remainder='passthrough',
)

In [16]:
# Fit the imputing transformer on the training features and apply it to them.
first_step = trf1.fit_transform(X_train)
# Display the resulting array.
first_step

array([['Florida', 55493.95, 103057.49, 214634.81],
       ['New York', 46014.02, 85047.44, 205517.64],
       ['Florida', 75328.87, 144135.98, 134050.07],
       ['California', 46426.07, 157693.92, 210797.67],
       ['Florida', 91749.16, 114175.79, 294919.57],
       ['Florida', 130298.13, 145530.06, 323876.68],
       ['Florida', 119943.24, 156547.42, 256512.92],
       ['New York', 1000.23, 124153.04, 1903.93],
       ['New York', 542.05, 51743.15, 0.0],
       ['New York', 65605.48, 153032.06, 107138.38],
       ['New York', 114523.61, 122616.84, 261776.23],
       ['Florida', 61994.48, 115641.28, 91131.24],
       ['California', 63408.86, 129219.61, 46085.25],
       ['California', 78013.11, 121597.55, 264346.06],
       ['California', 23640.93, 96189.63, 148001.11],
       ['California', 76253.86, 113867.3, 298664.47],
       ['New York', 15505.73, 127382.3, 35534.17],
       ['New York', 120542.52, 148718.95, 311613.29],
       ['California', 91992.39, 135495.07, 252664.93],
  

In [17]:
# Wrap the imputed array in a DataFrame to preview its first rows.
pd.DataFrame(first_step).head()

Unnamed: 0,0,1,2,3
0,Florida,55493.95,103057.49,214634.81
1,New York,46014.02,85047.44,205517.64
2,Florida,75328.87,144135.98,134050.07
3,California,46426.07,157693.92,210797.67
4,Florida,91749.16,114175.79,294919.57


In [18]:
# Count the missing values left in each column after imputation.
pd.DataFrame(first_step).isna().sum()

0    0
1    0
2    0
3    0
dtype: int64

In [19]:
# Dictionary-like mapping from each transformer's name to its fitted instance.
trf1.named_transformers_

{'cat': SimpleImputer(strategy='most_frequent'),
 'num': SimpleImputer(strategy='median')}

In [20]:
# Medians learned by the 'num' imputer, one per numeric column.
# Any fitted transformer's attributes can be reached through its name this way.
trf1.named_transformers_['num'].statistics_

array([ 71430.7  , 125604.625, 208157.655])

In [21]:
# Second preprocessing stage: one-hot encode column 0 and pass every other
# column through unchanged.
# drop='first' omits one dummy column per encoded feature; sparse=False makes
# the encoder return a dense array.
# NOTE(review): scikit-learn 1.2 renamed `sparse` to `sparse_output`; switch the
# keyword when upgrading.
trf2 = ColumnTransformer(
    transformers=[
        ('enc', OneHotEncoder(sparse=False, drop='first'), [0]),
    ],
    remainder='passthrough',
)

In [22]:
# Fit the encoding stage on the imputed training array and preview the result.
second_step = trf2.fit_transform(first_step)
pd.DataFrame(second_step).head()

Unnamed: 0,0,1,2,3,4
0,1.0,0.0,55493.95,103057.49,214634.81
1,0.0,1.0,46014.02,85047.44,205517.64
2,1.0,0.0,75328.87,144135.98,134050.07
3,0.0,0.0,46426.07,157693.92,210797.67
4,1.0,0.0,91749.16,114175.79,294919.57


In [23]:
# Full modelling pipeline: impute -> one-hot encode -> scale -> regress.
pipe = Pipeline(steps=[
    ('tf1', trf1),
    ('tf2', trf2),
    ('tf3', MinMaxScaler()),  # or StandardScaler, or any other scaler
    # random_state pins the forest's randomness so that cross-validation scores
    # and predictions are repeatable across runs.
    # Other regressors (LinearRegression, SVR, DecisionTreeRegressor, ...) can
    # be substituted here.
    ('model', RandomForestRegressor(n_estimators=200, random_state=0)),
])

In [24]:
# 5-fold cross-validation of the whole pipeline on the training split.
# No `scoring` argument is given, so the estimator's default scorer is used.
cvs = cross_val_score(pipe, X_train, y_train, cv=5)
print("All cross val scores:", cvs)
print("Mean of all scores: ", cvs.mean())

All cross val scores: [0.80510486 0.87645741 0.88977462 0.87913836 0.93133291]
All cross val scores: [0.80510486 0.87645741 0.88977462 0.87913836 0.93133291]
Mean of all scores:  0.8763616318224814


In [25]:
# Fit every pipeline step on the full training split.
pipe.fit(X_train, y_train)

Pipeline(steps=[('tf1',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  SimpleImputer(strategy='most_frequent'),
                                                  ['State']),
                                                 ('num',
                                                  SimpleImputer(strategy='median'),
                                                  ['R&D Spend',
                                                   'Administration',
                                                   'Marketing Spend'])])),
                ('tf2',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('enc',
                                                  OneHotEncoder(drop='first',
                                                                sparse=False),
                                                  [0])])),


In [26]:
preds = pipe.predict(X_test)
  
# Side-by-side comparison of the actual test-set profits and
# the pipeline's predictions for them
  
pd.DataFrame({'original test set':y_test, 'predictions': preds})

Unnamed: 0,original test set,predictions
28,103282.38,103308.41605
11,144259.4,135564.2869
10,146121.95,137085.80365
41,77798.83,80378.7636
2,191050.39,181862.3364
27,105008.31,113078.57185
38,81229.06,69733.31735
31,97483.56,99127.6017
22,110352.25,111213.50575
4,166187.94,168197.43165
