# 1. using pandas

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
car_sales_missing = pd.read_csv('Data/car-sales-extended-missing-data.csv')

In [6]:
len(car_sales_missing)

1000

In [7]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


## 1. Fill the data with pandas

In [8]:
# check how the columns have missing data
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [9]:
car_sales_missing['Make'].fillna('missing', inplace=True)

In [10]:
car_sales_missing.isna().sum()

Make              0
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [11]:
car_sales_missing['Colour'].fillna('missing', inplace=True)

In [14]:
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True)

In [15]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors            50
Price            50
dtype: int64

In [16]:
car_sales_missing['Doors'].fillna(4, inplace=True)

In [17]:
# This is regression problem, we will divide our feature and labels and Price should be our label coloumn
car_sales_missing.dropna(inplace=True)

In [18]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [19]:
# now length is decrease to 50
len(car_sales_missing)


950

## Once missing data is filled
1. create features and labels
2. convert into training and test sets
3. fit into model

### 1. Create features and labels

In [20]:
x = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

### 2. Convert into training and lest sets

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2) #using 20% data for testing

### 3. Choose the model and fit data into model

In [24]:
# this is regression problem
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

In [25]:
model.fit(x_train, y_train)

ValueError: could not convert string to float: 'Honda'

## Problem is that Model can't read the strings it only understand numaric data
### Now convet these strings into numeric

In [28]:
car_sales_missing.dtypes

Make              object
Colour            object
Odometer (KM)    float64
Doors            float64
Price            float64
dtype: object

In [31]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

features = ['Make', 'Colour', 'Doors']
one_hot = OneHotEncoder()

transformer = ColumnTransformer([('one_hot',
                                  one_hot,
                                  features)],
                                remainder='passthrough')

In [35]:
transformed_x = transformer.fit_transform(car_sales_missing)

transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        3.54310e+04, 1.53230e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        1.92714e+05, 1.99430e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        8.47140e+04, 2.83430e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        6.66040e+04, 3.15700e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.15883e+05, 4.00100e+03],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        2.48360e+05, 1.27320e+04]])

In [37]:
# let's do convert our x and y again with numeric features
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

In [38]:
model = RandomForestRegressor()

In [39]:
model.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [40]:
model.score(x_test, y_test)

0.9995603681034669

In [41]:
import pickle

pickle.dump(model, open('car_sales_missing_extended.pkl', 'wb'))

In [42]:
load_model = pickle.load(open('car_sales_missing_extended.pkl', 'rb'))

In [43]:
load_model.score(x_test, y_test)

0.9995603681034669

# 2. Use sklearn to fill the missing values as well as convert into numbers

In [3]:
# basics import

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [5]:
car_sales_missing = pd.read_csv('Data/car-sales-extended-missing-data.csv')

In [7]:
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [10]:
# checking how many missing values in data set
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [11]:
# let's drop the label columns that have no value
car_sales_missing.dropna(subset=['Price'], inplace=True)

In [33]:
x = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [12]:
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [13]:
# our data contain missing values and now let's do fill the features missing values with sklean

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer


In [16]:
# create possible (required) imputer
cat_imputer = SimpleImputer(strategy='constant', fill_value='missing')
num_imputer = SimpleImputer(strategy='mean')
door_imputer = SimpleImputer(strategy='constant', fill_value=4)

In [17]:
# create saperate feature that are required to be impute so it will fill the missing values
cat_features = ['Make', 'Colour']
num_features = ['Odometer (KM)']
door_features = ['Doors']

In [21]:
# now create a imputer that will fill the missing values using imputer

imputer = ColumnTransformer([('cat_features', cat_imputer, cat_features),
                             ('num_features', num_imputer, num_features),
                             ('door_features', door_imputer, door_features)])

imputer

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('cat_features',
                                 SimpleImputer(add_indicator=False, copy=True,
                                               fill_value='missing',
                                               missing_values=nan,
                                               strategy='constant', verbose=0),
                                 ['Make', 'Colour']),
                                ('num_features',
                                 SimpleImputer(add_indicator=False, copy=True,
                                               fill_value=None,
                                               missing_values=nan,
                                               strategy='mean', verbose=0),
                                 ['Odometer (KM)']),
                                ('door_features',
                                 SimpleImputer(ad

In [23]:
# Now convert this data into DataFrame to check how many rows contain still missing values

features = pd.DataFrame(imputer)

ValueError: DataFrame constructor not properly called!

In [24]:
pd.DataFrame(imputer)

ValueError: DataFrame constructor not properly called!

In [25]:
np.random.seed(33)
pd.DataFrame(np.random.random((4, 4)))

Unnamed: 0,0,1,2,3
0,0.24851,0.449975,0.410941,0.2603
1,0.870396,0.18504,0.019661,0.953252
2,0.680451,0.486588,0.965027,0.393399
3,0.079558,0.351407,0.163635,0.983167


In [31]:
type(imputer)

sklearn.compose._column_transformer.ColumnTransformer

In [34]:
# Problem is that we create the imputer butt not pass the features to be filled
# now put the data into imputrer

filled_x = imputer.fit_transform(x)
filled_x

array([['Honda', 'White', 35431.0, 4.0],
       ['BMW', 'Blue', 192714.0, 5.0],
       ['Honda', 'White', 84714.0, 4.0],
       ...,
       ['Nissan', 'Blue', 66604.0, 4.0],
       ['Honda', 'White', 215883.0, 4.0],
       ['Toyota', 'Blue', 248360.0, 4.0]], dtype=object)

In [38]:
# now convert this filled_x (that is actually our feature ) into dataframe

filled_data = pd.DataFrame(filled_x,
                           columns=['Make', 'Colour', 'Odometer (KM)', 'Doors'])
filled_data

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
...,...,...,...,...
945,Toyota,Black,35820,4
946,missing,White,155144,3
947,Nissan,Blue,66604,4
948,Honda,White,215883,4


In [39]:
filled_data.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [41]:
# data is filled

# now its time to fit into model

y.head(), filled_data.head()

(0    15323.0
 1    19943.0
 2    28343.0
 3    13434.0
 4    14043.0
 Name: Price, dtype: float64,
      Make Colour Odometer (KM) Doors
 0   Honda  White         35431     4
 1     BMW   Blue        192714     5
 2   Honda  White         84714     4
 3  Toyota  White        154365     4
 4  Nissan   Blue        181577     3)

In [42]:
# select a model and fit this into our model
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(filled_data, y)

ValueError: could not convert string to float: 'Honda'

In [44]:
# our features contain string values and first we need to convert this into number

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
one_hot = OneHotEncoder()

features = ['Make', 'Colour', 'Doors']

transformer = ColumnTransformer([('one_hot',
                                  one_hot,
                                  features)], remainder='passthrough')

transformed_x = transformer.fit_transform(filled_data)
transformed_x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [45]:
# convert these features and labels into testing and training set

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

In [46]:
# now fit into model
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.18016091763752173

## Alhamdulillah Done Well but its time to revised both

# 1. Fill the missing values with Pandas and save this model as pandas_model

In [47]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [49]:
car_sales_missing = pd.read_csv('Data/car-sales-extended-missing-data.csv')

In [50]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [51]:
# 1. getting our data ready
x = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [53]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [54]:
# pick a model and fit the data into model
from sklearn.ensemble import RandomForestRegressor
pandas_model = RandomForestRegressor()

In [56]:
pandas_model.fit(x_train, y_train)

ValueError: could not convert string to float: 'Toyota'

In [57]:
### First Error is convert the string into float, so let's solve this problem

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot = OneHotEncoder()
features = ['Make', 'Colour', 'Doors']

transformer = ColumnTransformer([('one_hot', one_hot, features)], remainder='passthrough')

transformed_x = transformer.fit_transform(x)

ValueError: Input contains NaN

In [58]:
### So the Data contain missing values

x.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
dtype: int64

In [60]:
y.isna().sum()

50

In [61]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

### Error no. 1 Fill the main data then reproceed all the steps

In [62]:
# fill out make column
car_sales_missing['Make'].fillna('missing', inplace=True)

In [63]:
car_sales_missing.isna().sum()

Make              0
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [64]:
# fill all the other remaining features columns in this cell
car_sales_missing['Colour'].fillna('missing', inplace=True)
car_sales_missing['Odometer (KM)'].fillna(car_sales_missing['Odometer (KM)'].mean(), inplace=True)
car_sales_missing['Doors'].fillna(4, inplace=True)



In [65]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [67]:
# its time for labels column
car_sales_missing['Price'].fillna(car_sales_missing['Price'].mean(), inplace=True)
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [69]:
# 1. data set contain's no missing values then getting the data ready

x = car_sales_missing.drop('Price', axis=1)
y = car_sales_missing['Price']

In [70]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)


In [71]:
len(car_sales_missing)

1000

In [74]:
# fit this into model
pandas_model.fit(x_train, y_train)

ValueError: could not convert string to float: 'Toyota'

### Error no. 2  Convert the string into float

In [76]:
# convert the features columns from string to integer except ( Odometer (KM) ) using one hot technique

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot = OneHotEncoder()
features = ['Make', 'Colour', 'Doors']

transformer = ColumnTransformer([('one_hot', one_hot, features)], remainder = 'passthrough')

transformed_x = transformer.fit_transform(x)
transformed_x

<1000x15 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [77]:
# fit this transformed_x into our model

pandas_model.fit(transformed_x, y_train)

ValueError: Number of labels=750 does not match number of samples=1000

### This Error rise because we use x (features includes testing and training data)

In [81]:
# To solve this error let's quikly rebuilt our training and testing sets

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.25)

In [82]:
# now fit this into our model
pandas_model.fit(x_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [83]:
# save this model 
import pickle 

pickle.dump(pandas_model, open('pandas_model.pkl', 'wb'))

In [85]:
# load the model and check accuracy on test data
pandas_model = pickle.load(open('pandas_model.pkl', 'rb'))



In [86]:
pandas_model.score(x_test, y_test)

0.1668609784893803

In [87]:
pandas_model.score(x_train, y_train)

0.8813699229697374

# 2. Fill the missing values with sklearn and save this model as sklearn_model

In [88]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [89]:
car_sales_missing = pd.read_csv('Data/car-sales-extended-missing-data.csv')
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [112]:
# 1. Fill the missing values with sklearn

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# create the imputers
cat_imputor = SimpleImputer(strategy='constant', fill_value='missing')
num_imputor = SimpleImputer(strategy='mean')
door_imputor = SimpleImputer(strategy='constant', fill_value=4)

# create the features on which the imputers will apply to fill accordingly

cat_features = ['Make', 'Colour']
int_features = ['Odometer (KM)', 'Price']
door_features = ['Doors']

# create an imputer that will operation column wise

imputer = ColumnTransformer([('cat_features', cat_imputor, cat_features),
                             ('num_features', num_imputor, num_features),
                             ('door_features', door_imputor, door_features)], remainder='passthrough')

In [113]:
imputer_output = imputer.fit_transform(car_sales_missing)

In [114]:
imputer_output

array([['Honda', 'White', 35431.0, 4.0, 15323.0],
       ['BMW', 'Blue', 192714.0, 5.0, 19943.0],
       ['Honda', 'White', 84714.0, 4.0, 28343.0],
       ...,
       ['Nissan', 'Blue', 66604.0, 4.0, 31570.0],
       ['Honda', 'White', 215883.0, 4.0, 4001.0],
       ['Toyota', 'Blue', 248360.0, 4.0, 12732.0]], dtype=object)

In [115]:
# convert this into dataframe 

car_sales_filled = pd.DataFrame(imputer_output, columns=['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'])

car_sales_filled

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
...,...,...,...,...,...
995,Toyota,Black,35820,4,32042
996,missing,White,155144,3,5716
997,Nissan,Blue,66604,4,31570
998,Honda,White,215883,4,4001


In [108]:
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [109]:
car_sales_missing.isna().sum(), car_sales_filled.isna().sum()

(Make             49
 Colour           50
 Odometer (KM)    50
 Doors            50
 Price            50
 dtype: int64,
 Make             0
 Colour           0
 Odometer (KM)    0
 Doors            0
 dtype: int64)

In [110]:
pd.DataFrame(car_sales_missing.isna().sum(), car_sales_filled.isna().sum())

Unnamed: 0,0
0,
0,
0,
0,


In [116]:
car_sales_filled.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [117]:
car_sales_filled['Price'].fillna(car_sales_filled['Price'].mean(), inplace=True)

In [118]:
car_sales_filled.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [119]:
len(car_sales_filled)

1000

In [120]:
#2 Getting the Data Ready - Split this into x and y then training and testing data
x = car_sales_filled.drop('Price', axis=1)
y = car_sales_filled['Price']

In [121]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.22)

In [122]:
# 3 pick the model and train it
from sklearn.ensemble import RandomForestRegressor
sklearn_model = RandomForestRegressor()

sklearn_model.fit(x_train, y_train)

ValueError: could not convert string to float: 'Toyota'

In [124]:
# convert the strings into integers

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

one_hot = OneHotEncoder()
features = ['Make', 'Colour', 'Doors']

transformer = ColumnTransformer([('one_hot', one_hot, features)], remainder='passthrough')

transformed_x = transformer.fit_transform(x)


In [125]:
transformed_x

<1000x15 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [130]:
# lets  re create our test and train data

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.24)


In [131]:
# let's refit into our model
sklearn_model.fit(transformed_x, y)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [132]:
# 4 save and load the model and then check accuracy at the end

import pickle 
pickle.dump(sklearn_model, open('sklearn_model.pkl', 'wb'))

In [133]:
sklearn_model = pickle.load(open('sklearn_model.pkl', 'rb'))

In [134]:
sklearn_model.score(x_test, y_test)

0.8625129382266828

In [135]:
sklearn_model.score(x_train, y_train)

0.8675562783409443

In [136]:
sklearn_model.score(x_test, y_test)

0.8625129382266828

In [137]:
pandas_model.score(x_test, y_test)

0.630990244438239

In [138]:
sklearn_model.score(x_test, y_test)


0.8625129382266828