In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the car-sales CSV into a DataFrame and preview its first five rows.
df = pd.read_csv('data/car-sales-extended-missing-data.csv')
df.head(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
# Count the missing values in each column.
df.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [4]:
# Drop rows that have no 'Price' label. Rebinding `df` (instead of
# inplace=True) avoids mutating the frame behind earlier displayed output, and
# reset_index(drop=True) renumbers the rows 0..n-1 so the labels stay aligned
# with frames later rebuilt from plain arrays, which get a fresh RangeIndex.
df = df.dropna(subset=['Price']).reset_index(drop=True)

In [5]:
# Separate the feature columns from the 'Price' target.
x = df.drop(columns='Price')
y = df['Price']

In [6]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [7]:
# Group the feature columns by the imputation strategy each group needs.
obj_fet = ['Make', 'Colour']
number_fet = ['Odometer (KM)']
door_fet = ['Doors']

# Text columns are filled with the placeholder 'Missing', the odometer reading
# with the column mean, and the door count with the most frequent value.
obj_imputer = SimpleImputer(strategy='constant', fill_value='Missing')
number_imputer = SimpleImputer(strategy='mean')
door_imputer = SimpleImputer(strategy='most_frequent')

# Route each column group to its own imputer.
transformer = ColumnTransformer(
    transformers=[
        ('obj_imputer', obj_imputer, obj_fet),
        ('number_imputer', number_imputer, number_fet),
        ('door_imputer', door_imputer, door_fet),
    ]
)

# NOTE(review): the imputers are fitted on all of `x`, i.e. before any
# train/test split, so the fill statistics also see rows that later end up in
# the test set.
modifieddf = transformer.fit_transform(x)

In [8]:
# ColumnTransformer emits its outputs in the order the transformers were
# listed, so derive the column labels from the same feature lists instead of
# repeating the names by hand (which could silently drift out of order).
modifiedx = pd.DataFrame(modifieddf, columns=obj_fet + number_fet + door_fet)

In [9]:
# Preview the first rows of the imputed feature frame.
modifiedx.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [10]:
# Preview the first values of the target series.
y.head()

0    15323.0
1    19943.0
2    28343.0
3    13434.0
4    14043.0
Name: Price, dtype: float64

In [11]:
# Re-count the missing values per column in the imputed frame.
modifiedx.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

# Convert the DataFrame to numerical features

In [12]:
# Inspect the dtype of each feature column in x.
x.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
dtype: object

In [13]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Columns to one-hot encode; 'Doors' is handled as a category here as well.
fet = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()

# Encode the listed columns and pass every other column through unchanged.
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, fet)],
    remainder='passthrough',
)

# NOTE(review): this rebinds `x` from the raw feature frame to the encoded
# matrix, so cells that expect the original frame must not be re-run after it.
x = transformer.fit_transform(modifiedx)

In [14]:
# Display the encoded feature matrix.
x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

# Fit a model

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore every score computed from this split, repeatable across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, random_state=42
)

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that refitting on the same training data yields the same
# model; without it every run produces a different score.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train);

In [17]:
# Score the regressor on the held-out test split (scikit-learn regressors
# report the R^2 coefficient of determination from score()).
rf.score(x_test,y_test)

0.3152140279962015

In [18]:
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): y_train appears to still be the car-price target from the split
# above, so this classifier is asked to predict prices as if each one were a
# separate class label. TODO confirm this is an intentional demonstration; a
# regressor is the appropriate estimator for a continuous target.
clf=RandomForestClassifier(n_estimators=100)
clf.fit(x_train,y_train);


In [19]:
# Score the classifier on the test split (scikit-learn classifiers report mean
# accuracy from score()).
clf.score(x_test,y_test)

0.0

In [21]:

# Read the heart-disease CSV into a DataFrame and preview its first five rows.
# NOTE(review): this rebinds `df`, which previously held the car-sales data.
df = pd.read_csv('data/heart-disease.csv')
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [22]:
# Count the missing values in each column.
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [23]:
# Inspect the dtype of each column.
df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [24]:
# Separate the feature columns from the 'target' label.
x = df.drop(columns='target')
y = df['target']

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle,
# and therefore every score computed from this split, repeatable across re-runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=.2, random_state=42
)

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
# Seed the forest so that refitting on the same training data yields the same
# model and the same downstream scores.
clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [44]:
# Train on the training split, then show the score on the training split and
# on the test split as a (train, test) pair.
clf.fit(x_train, y_train)
(clf.score(x_train, y_train), clf.score(x_test, y_test))

(1.0, 0.8360655737704918)

In [30]:
# Predict a label for every row of the test split.
result=clf.predict(x_test)

In [31]:
# Display the predicted labels.
result

array([0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1], dtype=int64)

In [32]:
# Per-class probability estimates for every row of the test split.
clf.predict_proba(x_test)

array([[0.5 , 0.5 ],
       [0.35, 0.65],
       [0.67, 0.33],
       [0.92, 0.08],
       [0.15, 0.85],
       [0.96, 0.04],
       [0.14, 0.86],
       [0.42, 0.58],
       [0.29, 0.71],
       [0.87, 0.13],
       [0.99, 0.01],
       [0.35, 0.65],
       [0.98, 0.02],
       [0.6 , 0.4 ],
       [0.89, 0.11],
       [0.64, 0.36],
       [0.12, 0.88],
       [0.12, 0.88],
       [0.16, 0.84],
       [0.52, 0.48],
       [0.02, 0.98],
       [0.83, 0.17],
       [0.2 , 0.8 ],
       [0.01, 0.99],
       [0.31, 0.69],
       [0.01, 0.99],
       [0.8 , 0.2 ],
       [0.04, 0.96],
       [0.26, 0.74],
       [0.93, 0.07],
       [0.58, 0.42],
       [0.86, 0.14],
       [1.  , 0.  ],
       [0.39, 0.61],
       [0.81, 0.19],
       [0.39, 0.61],
       [0.8 , 0.2 ],
       [0.18, 0.82],
       [0.26, 0.74],
       [0.13, 0.87],
       [0.53, 0.47],
       [0.42, 0.58],
       [0.9 , 0.1 ],
       [0.17, 0.83],
       [0.04, 0.96],
       [0.2 , 0.8 ],
       [0.37, 0.63],
       [0.83,

In [34]:
from sklearn.metrics import mean_absolute_error
# NOTE(review): `result` holds 0/1 class labels; assuming `target` is 0/1 as
# well, this mean absolute error is the fraction of test rows that were
# misclassified rather than a regression-style error.
mean_absolute_error(y_test,result)

0.18032786885245902

In [39]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the classifier over the full feature/target set.
cval = cross_val_score(estimator=clf, X=x, y=y, cv=5)

In [40]:
# Display the score obtained on each cross-validation fold.
cval

array([0.81967213, 0.90163934, 0.78688525, 0.81666667, 0.76666667])

In [45]:
# Compare the mean cross-validated score with the single test-split score.
(cval.mean(), clf.score(x_test, y_test))

(0.8183060109289617, 0.8360655737704918)