# Coil2000 dataset
[Source](https://www.openml.org/search?type=data&sort=qualities.NumberOfFeatures&status=active&qualities.NumberOfClasses=lte_1&qualities.NumberOfFeatures=between_10_100&format=ARFF&qualities.NumberOfInstances=between_1000_10000&id=298)

Can you predict who would be interested in buying a caravan insurance policy, and explain why?

A perfect challenge for a random forest regressor.

9822 instances 

86 columns (85 features plus the `CARAVAN` target)

Each instance is a customer. The features describe that customer and include product usage data and socio-demographic data derived from zip area codes.

### Import libraries

In [23]:
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
from scipy.io.arff import loadarff
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split

from random_forest_regressor import RandomForestRegressor

# Single seed for the notebook's stochastic steps (e.g. the train/test split)
# so that a fresh Restart & Run All reproduces the same partition.
RANDOM_SEED = 42


### Import data

In [None]:
# loadarff returns a (records, metadata) pair; only the records (index 0)
# are turned into the working DataFrame.
raw_data = loadarff('coil2000.arff')
df = pd.DataFrame(data=raw_data[0])

In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,MOSTYPE,MAANTHUI,MGEMOMV,MGEMLEEF,MOSHOOFD,MGODRK,MGODPR,MGODOV,MGODGE,MRELGE,...,APERSONG,AGEZONG,AWAOREG,ABRAND,AZEILPL,APLEZIER,AFIETS,AINBOED,ABYSTAND,CARAVAN
0,33.0,1.0,3.0,2.0,8.0,0.0,5.0,1.0,3.0,7.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,37.0,1.0,2.0,2.0,8.0,1.0,4.0,1.0,4.0,6.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,37.0,1.0,2.0,2.0,8.0,0.0,4.0,2.0,4.0,3.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,9.0,1.0,3.0,3.0,3.0,2.0,3.0,2.0,4.0,5.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,40.0,1.0,4.0,2.0,10.0,1.0,4.0,1.0,4.0,7.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Column dtypes and non-null counts: a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9822 entries, 0 to 9821
Data columns (total 86 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   MOSTYPE   9822 non-null   float64
 1   MAANTHUI  9822 non-null   float64
 2   MGEMOMV   9822 non-null   float64
 3   MGEMLEEF  9822 non-null   float64
 4   MOSHOOFD  9822 non-null   float64
 5   MGODRK    9822 non-null   float64
 6   MGODPR    9822 non-null   float64
 7   MGODOV    9822 non-null   float64
 8   MGODGE    9822 non-null   float64
 9   MRELGE    9822 non-null   float64
 10  MRELSA    9822 non-null   float64
 11  MRELOV    9822 non-null   float64
 12  MFALLEEN  9822 non-null   float64
 13  MFGEKIND  9822 non-null   float64
 14  MFWEKIND  9822 non-null   float64
 15  MOPLHOOG  9822 non-null   float64
 16  MOPLMIDD  9822 non-null   float64
 17  MOPLLAAG  9822 non-null   float64
 18  MBERHOOG  9822 non-null   float64
 19  MBERZELF  9822 non-null   float64
 20  MBERBOER  9822 non-null   floa

In [5]:
# Number of rows (one per customer).
df.shape[0]

9822

In [6]:
# Sanity check that the loaded frame is not empty.
df.empty

False

### Prepare the data

In [13]:
# Separate the predictors from the CARAVAN target column.
X_data = df.drop(columns=["CARAVAN"])
y_data = df["CARAVAN"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [27]:
# Custom forest under test: two trees, depth capped at 2.
# NOTE(review): no seed is passed here; if the project-local class accepts
# one, pass RANDOM_SEED so this run is reproducible. Confirm against its API.
rf = RandomForestRegressor(
    n_trees=2,
    max_features=1,
    min_samples_split=2,
    max_depth=2,
)
rf.train(X_train, y_train)

In [28]:
# Predictions of the custom forest on the held-out rows.
y_pred = rf.predict(X_test)

In [29]:
# Reference model: scikit-learn's forest with the same tree count and depth
# cap as the custom implementation. It is seeded with RANDOM_SEED so the RMSE
# comparison is reproducible; with only two trees, an unseeded forest gives
# noticeably different scores from run to run.
sklearn_rf = sklearn.ensemble.RandomForestRegressor(
    n_estimators=2,
    max_depth=2,
    random_state=RANDOM_SEED,
)
sklearn_rf.fit(X_train, y_train)

In [30]:
# Predictions of the scikit-learn forest on the same held-out rows.
sk_y_pred = sklearn_rf.predict(X_test)

In [31]:
# Score both forests against the same held-out targets (lower RMSE is better).
# "Beauty" labels the custom forest, "SK" the scikit-learn reference.
beauty_rmse, sk_rmse = (
    root_mean_squared_error(y_test, predictions)
    for predictions in (y_pred, sk_y_pred)
)
print("Beauty", beauty_rmse)
print("SK", sk_rmse)

Beauty 0.23853915651997426
SK 0.2360292221002501
