In [1]:
# Importing all the necessary libraries

import numpy as np
import pandas as pd

In [2]:
# Loading the dataset

df_car = pd.read_csv('datasets/car-data.csv')

In [3]:
# Viewing first few rows of data

df_car.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [4]:
# Viewing the columns present in the dataset

df_car.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [5]:
# Datatypes of columns present in the dataset

df_car.dtypes

symboling              int64
normalized-losses    float64
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders       int64
engine-size            int64
fuel-system           object
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [6]:
# General Information regarding the dataset

df_car.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
symboling            205 non-null int64
normalized-losses    164 non-null float64
make                 205 non-null object
fuel-type            205 non-null object
aspiration           205 non-null object
num-of-doors         203 non-null object
body-style           205 non-null object
drive-wheels         205 non-null object
engine-location      205 non-null object
wheel-base           205 non-null float64
length               205 non-null float64
width                205 non-null float64
height               205 non-null float64
curb-weight          205 non-null int64
engine-type          205 non-null object
num-of-cylinders     205 non-null int64
engine-size          205 non-null int64
fuel-system          205 non-null object
bore                 201 non-null float64
stroke               201 non-null float64
compression-ratio    205 non-null float64
horsepower           203 non-n

In [7]:
# Dimensions of the dataset

df_car.shape

(205, 26)

In [12]:
# Including columns which are of object datatype in modified dataframe

df_car_mod = df_car.select_dtypes(include=['object'])

# Viewing first few rows of data

df_car_mod.head()

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,fuel-system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,mpfi


In [13]:
# Checking for any null values present in the dataset

df_car_mod.isnull().sum()

make               0
fuel-type          0
aspiration         0
num-of-doors       2
body-style         0
drive-wheels       0
engine-location    0
engine-type        0
fuel-system        0
dtype: int64

In [14]:
# Value counts for num-of-doors column

df_car_mod['num-of-doors'].value_counts()

four    114
two      89
Name: num-of-doors, dtype: int64

In [15]:
# Replacing null values with most frequent value

df_car_mod['num-of-doors'] = df_car_mod['num-of-doors'].fillna(df_car_mod['num-of-doors'].value_counts().index[0])

In [9]:
# pd.options.mode.chained_assignment = None

In [16]:
# Checking for null values

df_car_mod['num-of-doors'].isnull().sum()

0

In [18]:
# Using Pandas
# Value counts for fuel_type

df_car_mod['fuel-type'].value_counts()

gas       185
diesel     20
Name: fuel-type, dtype: int64

In [19]:
# Encoding fuel_type using get_dummies

df_car_mod = pd.get_dummies(df_car_mod, columns=['fuel-type'], drop_first=True)

In [20]:
# Few rows of encoded data

df_car_mod.sample(10)

Unnamed: 0,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,fuel-system,fuel-type_gas
50,mazda,std,two,hatchback,fwd,front,ohc,2bbl,1
127,porsche,std,two,hardtop,rwd,rear,ohcf,mpfi,1
149,subaru,turbo,four,wagon,4wd,front,ohcf,mpfi,1
71,mercedes-benz,std,four,sedan,rwd,front,ohcv,mpfi,1
156,toyota,std,four,sedan,fwd,front,ohc,2bbl,1
3,audi,std,four,sedan,fwd,front,ohc,mpfi,1
30,honda,std,two,hatchback,fwd,front,ohc,1bbl,1
96,nissan,std,four,sedan,fwd,front,ohc,2bbl,1
59,mazda,std,two,hatchback,fwd,front,ohc,2bbl,1
0,alfa-romero,std,two,convertible,rwd,front,dohc,mpfi,1


In [21]:
# Value counts for fuel-type_gas

df_car_mod['fuel-type_gas'].value_counts()

1    185
0     20
Name: fuel-type_gas, dtype: int64

In [22]:
## Find and Replace value in the dataset
# Value counts for num-of-doors column

df_car_mod['num-of-doors'].value_counts()

four    116
two      89
Name: num-of-doors, dtype: int64

In [23]:
# Replacing values in num-of-doors column

df_car_mod['num-of-doors'] = df_car_mod['num-of-doors'].replace('four', 4)
df_car_mod['num-of-doors'] = df_car_mod['num-of-doors'].replace('two', 2)

In [24]:
# Value counts for num-of-doors column

df_car_mod['num-of-doors'].value_counts()

4    116
2     89
Name: num-of-doors, dtype: int64

In [10]:
# Create a dictionary to find and replace values

dic_to_replace = {"num-of-doors": {"four": 4, "two": 2}}

df_car_mod.replace(dic_to_replace, inplace=True)

In [11]:
# Viewing few rows of num-of-doors column

df_car_mod['num-of-doors'].head()

0    2.0
1    2.0
2    2.0
3    4.0
4    4.0
Name: num-of-doors, dtype: float64

In [25]:
# Using SciKit Learn
# Encoding make column using LabelEncoder

from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()

df_car_mod['make_encoded'] = labelencoder.fit_transform(df_car_mod['make'])

In [26]:
# Viewing few rows of make and its encoded columns

df_car_mod[['make', 'make_encoded']].sample(20)

Unnamed: 0,make,make_encoded
100,nissan,12
50,mazda,8
74,mercedes-benz,9
179,toyota,19
94,nissan,12
81,mitsubishi,11
93,nissan,12
88,mitsubishi,11
12,bmw,2
28,dodge,4


In [29]:
# Value counts for make column

df_car_mod['make'].value_counts()

toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
volvo            11
peugot           11
dodge             9
bmw               8
mercedes-benz     8
plymouth          7
audi              7
saab              6
porsche           5
isuzu             4
alfa-romero       3
chevrolet         3
jaguar            3
renault           2
mercury           1
Name: make, dtype: int64

In [30]:
# Value counts for make_encoded column

df_car_mod['make_encoded'].value_counts()

19    32
12    18
8     17
5     13
11    13
18    12
20    12
13    11
21    11
4      9
9      8
2      8
14     7
1      7
17     6
15     5
6      4
3      3
0      3
7      3
16     2
10     1
Name: make_encoded, dtype: int64

In [31]:
# Viewing first few rows of data

df_car_mod.head()

Unnamed: 0,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,fuel-system,fuel-type_gas,make_encoded
0,alfa-romero,std,2,convertible,rwd,front,dohc,mpfi,1,0
1,alfa-romero,std,2,convertible,rwd,front,dohc,mpfi,1,0
2,alfa-romero,std,2,hatchback,rwd,front,ohcv,mpfi,1,0
3,audi,std,4,sedan,fwd,front,ohc,mpfi,1,1
4,audi,std,4,sedan,4wd,front,ohc,mpfi,1,1


In [32]:
# Enoding make column using LabelBinarizer

from sklearn.preprocessing import LabelBinarizer

labelbinarizer = LabelBinarizer()

make_encoded_results = labelbinarizer.fit_transform(df_car_mod['make'])

In [33]:
# Classes created in make column after encoding

labelbinarizer.classes_

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'mazda', 'mercedes-benz', 'mercury',
       'mitsubishi', 'nissan', 'peugot', 'plymouth', 'porsche', 'renault',
       'saab', 'subaru', 'toyota', 'volkswagen', 'volvo'], dtype='<U13')

In [34]:
# Converting an numpy array into a pandas dataframe

df_make_encoded = pd.DataFrame(make_encoded_results, columns=labelbinarizer.classes_)

# Viewing few rows of data

df_make_encoded.sample(10)

Unnamed: 0,alfa-romero,audi,bmw,chevrolet,dodge,honda,isuzu,jaguar,mazda,mercedes-benz,...,nissan,peugot,plymouth,porsche,renault,saab,subaru,toyota,volkswagen,volvo
74,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
201,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
83,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
120,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
88,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
39,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
90,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
125,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [35]:
# Creating an MultiLabel Array

multilabel_feature = [("New Delhi", "New York"),
                     ("New York", "Sydney", "Hyderabad", "Bangalore"),
                     ("Hyderabad", "Sydney", "Chennai"),
                      ("Chennai", "New Delhi", "Bangalore"),
                     ("Bangalore", "Chennai")]

# Printing the MultiLabel Array

print(multilabel_feature)

[('New Delhi', 'New York'), ('New York', 'Sydney', 'Hyderabad', 'Bangalore'), ('Hyderabad', 'Sydney', 'Chennai'), ('Chennai', 'New Delhi', 'Bangalore'), ('Bangalore', 'Chennai')]


In [36]:
# Encoding MultiLabel data using MultiLabel Binarizer

from sklearn.preprocessing import MultiLabelBinarizer

multilabelbinarizer = MultiLabelBinarizer()

multilabel_encoded_results = multilabelbinarizer.fit_transform(multilabel_feature)

In [37]:
# Classes created in MultiLabel data after Encoding

multilabelbinarizer.classes_

array(['Bangalore', 'Chennai', 'Hyderabad', 'New Delhi', 'New York',
       'Sydney'], dtype=object)

In [38]:
# Converting an Numpy Array into a pandas dataframe

df_multilabel_data = pd.DataFrame(multilabel_encoded_results, columns=multilabelbinarizer.classes_)

# Viewing few rows of data

df_multilabel_data.head()

Unnamed: 0,Bangalore,Chennai,Hyderabad,New Delhi,New York,Sydney
0,0,0,0,1,1,0
1,1,0,1,0,1,1
2,0,1,1,0,0,1
3,1,1,0,1,0,0
4,1,1,0,0,0,0


In [39]:
# Ordinal Data can be encoded using OrdinalEncoder

# Creating an Pandas dataframe

data = {'Employee Id' : [112, 113, 114, 115], 'Income Range' : ['Low', 'High', 'Medium', 'High']}

df_ordinal = pd.DataFrame(data)

# Viewing few rows of created dataframe

df_ordinal.head()

Unnamed: 0,Employee Id,Income Range
0,112,Low
1,113,High
2,114,Medium
3,115,High


In [40]:
# Encoding above ordinal data using OrdinalEncoder

from sklearn.preprocessing import OrdinalEncoder

ordinalencoder = OrdinalEncoder()

ordinalencoder.fit_transform(df_ordinal[['Income Range']])

array([[1.],
       [0.],
       [2.],
       [0.]])

In [41]:
# Using pandas factorize method for ordinal data

categories = pd.Categorical(df_ordinal['Income Range'], categories=['Low', 'Medium', 'High'], ordered=True)

In [42]:
# Order of labels set for data

categories

[Low, High, Medium, High]
Categories (3, object): [Low < Medium < High]

In [43]:
# Factorizing the column data

labels, unique = pd.factorize(categories, sort=True)
df_ordinal['Income Range'] = labels

In [44]:
# Encoded Income Range Data

df_ordinal['Income Range']

0    0
1    2
2    1
3    2
Name: Income Range, dtype: int64

In [45]:
# DictVectorizer for encoding data

# Creating a dictionary for sample data

data_prices = [
    {'price': 850000, 'rooms': 4, 'neighborhood': 'Queen Anne'},
    {'price': 700000, 'rooms': 3, 'neighborhood': 'Fremont'},
    {'price': 650000, 'rooms': 3, 'neighborhood': 'Wallingford'},
    {'price': 600000, 'rooms': 2, 'neighborhood': 'Fremont'}
]


from sklearn.feature_extraction import DictVectorizer

dictvectorizer = DictVectorizer(sparse=False, dtype=int)

data_prices_encoded = dictvectorizer.fit_transform(data_prices)

In [46]:
# Features names of encoded data

dictvectorizer.get_feature_names()

['neighborhood=Fremont',
 'neighborhood=Queen Anne',
 'neighborhood=Wallingford',
 'price',
 'rooms']

In [47]:
# Converting encoded data into pandas dataframe

df_prices = pd.DataFrame(data_prices_encoded, columns=dictvectorizer.get_feature_names())

# viewing few rows of data

df_prices.head()

Unnamed: 0,neighborhood=Fremont,neighborhood=Queen Anne,neighborhood=Wallingford,price,rooms
0,0,1,0,850000,4
1,1,0,0,700000,3
2,0,0,1,650000,3
3,1,0,0,600000,2


In [48]:
# Encoding drive-wheels and engine-location columns using ColumnTransformer and OneHotEncoder

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ctransformer = ColumnTransformer([("encoded_data", OneHotEncoder(sparse=False), [4,5]),])
#ctransformer = ColumnTransformer([("encoded_data", OneHotEncoder(sparse=False), [4, 5]),], remainder='passthrough')

ct_encoded_results = ctransformer.fit_transform(df_car_mod)

In [49]:
# Get Feature Names of Encoded columns

ctransformer.get_feature_names()

['encoded_data__x0_4wd',
 'encoded_data__x0_fwd',
 'encoded_data__x0_rwd',
 'encoded_data__x1_front',
 'encoded_data__x1_rear']

In [50]:
# Converting the numpy array into a pandas dataframe

df_ct_encoded_data = pd.DataFrame(ct_encoded_results, columns=ctransformer.get_feature_names())

# Viewing first few rows of data

df_ct_encoded_data.head()

Unnamed: 0,encoded_data__x0_4wd,encoded_data__x0_fwd,encoded_data__x0_rwd,encoded_data__x1_front,encoded_data__x1_rear
0,0.0,0.0,1.0,1.0,0.0
1,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,1.0,0.0
3,0.0,1.0,0.0,1.0,0.0
4,1.0,0.0,0.0,1.0,0.0


In [51]:
# Dropping dummy variables to avoid multicollinearity

df_ct_encoded_data.drop(['encoded_data__x0_4wd', 'encoded_data__x1_front'], inplace=True, axis=1)

In [52]:
# Viewing few rows of data after dropping dummy varibles

df_ct_encoded_data.head()

Unnamed: 0,encoded_data__x0_fwd,encoded_data__x0_rwd,encoded_data__x1_rear
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,0.0,1.0,0.0
3,1.0,0.0,0.0
4,0.0,0.0,0.0


In [53]:
# Concatenating the encoded dataframe with the original dataframe

df = pd.concat([df_car_mod.reset_index(drop=True), df_ct_encoded_data.reset_index(drop=True)], axis=1)

# Viewing few rows of data

df.head()

Unnamed: 0,make,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,fuel-system,fuel-type_gas,make_encoded,encoded_data__x0_fwd,encoded_data__x0_rwd,encoded_data__x1_rear
0,alfa-romero,std,2,convertible,rwd,front,dohc,mpfi,1,0,0.0,1.0,0.0
1,alfa-romero,std,2,convertible,rwd,front,dohc,mpfi,1,0,0.0,1.0,0.0
2,alfa-romero,std,2,hatchback,rwd,front,ohcv,mpfi,1,0,0.0,1.0,0.0
3,audi,std,4,sedan,fwd,front,ohc,mpfi,1,1,1.0,0.0,0.0
4,audi,std,4,sedan,4wd,front,ohc,mpfi,1,1,0.0,0.0,0.0


In [54]:
# Dropping drive-wheels, make and engine-location columns

df.drop(['drive-wheels', 'engine-location', 'make'], inplace=True, axis=1)

In [55]:
# Viewing few rows of data

df.head()

Unnamed: 0,aspiration,num-of-doors,body-style,engine-type,fuel-system,fuel-type_gas,make_encoded,encoded_data__x0_fwd,encoded_data__x0_rwd,encoded_data__x1_rear
0,std,2,convertible,dohc,mpfi,1,0,0.0,1.0,0.0
1,std,2,convertible,dohc,mpfi,1,0,0.0,1.0,0.0
2,std,2,hatchback,ohcv,mpfi,1,0,0.0,1.0,0.0
3,std,4,sedan,ohc,mpfi,1,1,1.0,0.0,0.0
4,std,4,sedan,ohc,mpfi,1,1,0.0,0.0,0.0


In [56]:
from sklearn.preprocessing import LabelEncoder

lenc = LabelEncoder()

df['aspiration'] = lenc.fit_transform(df['aspiration'])

In [57]:
lenc.classes_

array(['std', 'turbo'], dtype=object)

In [58]:
df.head()

Unnamed: 0,aspiration,num-of-doors,body-style,engine-type,fuel-system,fuel-type_gas,make_encoded,encoded_data__x0_fwd,encoded_data__x0_rwd,encoded_data__x1_rear
0,0,2,convertible,dohc,mpfi,1,0,0.0,1.0,0.0
1,0,2,convertible,dohc,mpfi,1,0,0.0,1.0,0.0
2,0,2,hatchback,ohcv,mpfi,1,0,0.0,1.0,0.0
3,0,4,sedan,ohc,mpfi,1,1,1.0,0.0,0.0
4,0,4,sedan,ohc,mpfi,1,1,0.0,0.0,0.0


In [59]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(categorical_features=[0], sparse=False)

ohe_results = ohe.fit_transform(df[['aspiration']])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [60]:
# Converting OneHotEncoded results into an dataframe

df_ohe_results = pd.DataFrame(ohe_results, columns=lenc.classes_)

# Viewing first few rows of data

df_ohe_results.head()

Unnamed: 0,std,turbo
0,1.0,0.0
1,1.0,0.0
2,1.0,0.0
3,1.0,0.0
4,1.0,0.0


In [61]:
# Categorical columns present in the dataframe

categorical_cols = df.columns[df.dtypes==object].tolist()

categorical_cols

['body-style', 'engine-type', 'fuel-system']

In [62]:
# Performing LabelEncoding for remaining all categorical features

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

df[categorical_cols] = df[categorical_cols].apply(lambda col: le.fit_transform(col))

# Viewing first few rows of data

df[categorical_cols].head(10)

Unnamed: 0,body-style,engine-type,fuel-system
0,0,0,5
1,0,0,5
2,2,5,5
3,3,3,5
4,3,3,5
5,3,3,5
6,3,3,5
7,4,3,5
8,3,3,5
9,2,3,5


In [63]:
from sklearn.preprocessing import OneHotEncoder

onehotencoder = OneHotEncoder(sparse=False)

onehotencoder.fit_transform(df[categorical_cols])

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])