# Random Forest Regression

## Importing The Library

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk

## Importing The DataSet

In [15]:
data = pd.read_csv("insurance.csv")
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Encoding categorical data

In [16]:
obj_column = list(data.select_dtypes(['object']).columns)
data_encoded = pd.get_dummies(data=data,columns=obj_column)
data_encoded = data_encoded[["sex_female",'sex_male','smoker_no','smoker_yes',\
                             'region_northeast','region_northwest','region_southeast',\
                             'region_southwest','age','bmi','children','charges']]
data_encoded.head()

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest,age,bmi,children,charges
0,1,0,0,1,0,0,0,1,19,27.9,0,16884.924
1,0,1,1,0,0,0,1,0,18,33.77,1,1725.5523
2,0,1,1,0,0,0,1,0,28,33.0,3,4449.462
3,0,1,1,0,0,1,0,0,33,22.705,0,21984.47061
4,0,1,1,0,0,1,0,0,32,28.88,0,3866.8552


In [17]:
x = data_encoded.iloc[:, :-1].values
y = data_encoded.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [18]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 0)

## Training the Random Forest Regression model on the Training set

In [28]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=20,random_state=0)
regressor.fit(x_train,y_train)

RandomForestRegressor(n_estimators=20, random_state=0)

## Predicting the Test set results

In [29]:
y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1))

[[ 9929.89  9724.53]
 [10572.22  8547.69]
 [44175.83 45702.02]
 [12996.47 12950.07]
 [ 9365.03  9644.25]
 [10605.6   4500.34]
 [ 2400.51  2198.19]
 [10628.46 11436.74]
 [ 7891.12  7537.16]
 [ 5211.12  5425.02]
 [10647.08  6753.04]
 [17226.28 10493.95]
 [ 9788.23  7337.75]
 [ 5483.3   4185.1 ]
 [23498.82 18310.74]
 [11862.49 10702.64]
 [16639.42 12523.6 ]
 [12221.81  3490.55]
 [ 7003.5   6457.84]
 [34019.22 33475.82]
 [23472.91 23967.38]
 [12500.78 12643.38]
 [10158.2  23045.57]
 [28014.96 23065.42]
 [ 4704.53  1674.63]
 [ 9248.9   4667.61]
 [ 7166.39  3732.63]
 [ 9693.27  7682.67]
 [ 3761.62  3756.62]
 [12575.93  8413.46]
 [ 7408.85  8059.68]
 [48057.68 48970.25]
 [14091.1  12979.36]
 [11679.56 20630.28]
 [14849.7  14571.89]
 [ 3666.96  4137.52]
 [ 9167.23  8347.16]
 [37663.1  51194.56]
 [40575.15 40003.33]
 [ 8069.94  1880.49]
 [ 9645.89  5458.05]
 [ 3213.46  2867.12]
 [21222.44 20149.32]
 [46375.4  47496.49]
 [36775.72 36149.48]
 [ 8419.92 26018.95]
 [11862.49 19749.38]
 [ 6783.83  6

## Evaluating the Model Performance

In [30]:
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.8968649606488012

In [31]:
regressor.predict([[1,0,0,1,1,0,1,1,25,26,3]])

array([17013.85])