# Experiment 8

##### Aim: Handling missing values in the dataset

#### Importing libraries

In [20]:
import os 
import pandas as pd 
import numpy as np

#### Reading Dataset

In [21]:
# Load the raw Toyota dataset from the working directory and preview it.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was saved into the CSV -- confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer="Toyota.csv")
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


#### Figuring out missing values

In [22]:
# Number of NaN entries in each column (isnull is an alias of isna).
df.isnull().sum()

Unnamed: 0      0
Price           0
Age           100
KM              0
FuelType      100
HP              0
MetColor      150
Automatic       0
CC              0
Doors           0
Weight          0
dtype: int64

In [23]:
# Keep only the rows in which at least one column is NaN.
missing = df.loc[df.isna().any(axis="columns")]

#### Inspecting the rows that contain missing values

In [24]:
# Summarise the rows that contain at least one NaN: per-column non-null
# counts and dtypes. Columns reported as `object` hold non-numeric entries.
missing.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 325 entries, 2 to 1434
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  325 non-null    int64  
 1   Price       325 non-null    int64  
 2   Age         225 non-null    float64
 3   KM          325 non-null    object 
 4   FuelType    225 non-null    object 
 5   HP          325 non-null    object 
 6   MetColor    175 non-null    float64
 7   Automatic   325 non-null    int64  
 8   CC          325 non-null    int64  
 9   Doors       325 non-null    object 
 10  Weight      325 non-null    int64  
dtypes: float64(2), int64(5), object(4)
memory usage: 30.5+ KB


#### Statistical analysis of numeric data

In [25]:
# Summary statistics for the numeric columns only; by default describe()
# leaves out object-dtype columns when numeric columns are present.
df.describe()

Unnamed: 0.1,Unnamed: 0,Price,Age,MetColor,Automatic,CC,Weight
count,1436.0,1436.0,1336.0,1286.0,1436.0,1436.0,1436.0
mean,717.5,10730.824513,55.672156,0.674961,0.05571,1566.827994,1072.45961
std,414.681806,3626.964585,18.589804,0.468572,0.229441,187.182436,52.64112
min,0.0,4350.0,1.0,0.0,0.0,1300.0,1000.0
25%,358.75,8450.0,43.0,0.0,0.0,1400.0,1040.0
50%,717.5,9900.0,60.0,1.0,0.0,1600.0,1070.0
75%,1076.25,11950.0,70.0,1.0,0.0,1600.0,1085.0
max,1435.0,32500.0,80.0,1.0,1.0,2000.0,1615.0


#### Handling missing values in the 'Age', 'KM', 'HP', 'FuelType' and 'MetColor' columns

In [26]:
# Impute missing ages with the column mean (NaN is skipped when averaging).
age_mean = df['Age'].mean()
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on df['Age']: the in-place call acts on an intermediate Series (chained
# assignment), which newer pandas warns about and which does not update df
# when Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(age_mean)

In [27]:
# KM is stored as text (object dtype), so coerce it to numbers first; entries
# that cannot be parsed become NaN.
km_numeric = pd.to_numeric(df['KM'], errors='coerce')
# Impute with the median of the valid readings *before* casting to int64.
# Filling with 0 at this point would turn every unparseable entry into a fake
# zero-kilometre reading and leave no NaN for any later median fill to act on.
df['KM'] = km_numeric.fillna(km_numeric.median()).round().astype(np.int64)
df['KM'].dtype

dtype('int64')

In [28]:
# The previous cell already casts KM to int64, so no NaN can be present here;
# this fill is kept only as a safeguard.
km_median = df['KM'].median()
# Assign the result back rather than using fillna(..., inplace=True) on
# df['KM'], which is a chained in-place call that newer pandas warns about
# and that does not update df under Copy-on-Write.
df['KM'] = df['KM'].fillna(km_median)

In [29]:
# HP is stored as text (object dtype), so coerce it to numbers first; entries
# that cannot be parsed become NaN.
hp_numeric = pd.to_numeric(df['HP'], errors='coerce')
# Impute with the mean of the valid values *before* casting to int64.
# Filling with 0 at this point would record a horsepower of 0 for every
# unparseable entry and leave no NaN for any later mean fill to act on.
# The mean is rounded so the integer cast does not silently truncate it.
df['HP'] = hp_numeric.fillna(hp_numeric.mean()).round().astype(np.int64)
df['HP'].dtype

dtype('int64')

In [30]:
# The previous cell already casts HP to int64, so no NaN can be present here;
# this fill is kept only as a safeguard.
# Assign the result back rather than using fillna(..., inplace=True) on
# df['HP'], which is a chained in-place call that newer pandas warns about
# and that does not update df under Copy-on-Write.
df['HP'] = df['HP'].fillna(df['HP'].mean())

In [31]:
# Frequency of each fuel type. value_counts() excludes NaN by default, so
# the counts cover only the rows where FuelType is present.
df['FuelType'].value_counts()

Petrol    1177
Diesel     144
CNG         15
Name: FuelType, dtype: int64

In [32]:
# Impute missing fuel types with the most frequent category; value_counts()
# sorts by descending frequency, so index[0] is the most common value.
most_common_fuel = df['FuelType'].value_counts().index[0]
# Assign the result back rather than using fillna(..., inplace=True) on
# df['FuelType'], which is a chained in-place call that newer pandas warns
# about and that does not update df under Copy-on-Write.
df['FuelType'] = df['FuelType'].fillna(most_common_fuel)

In [33]:
# Filling again with the second most frequent category is not a valid
# imputation step: once the mode has been filled in there is nothing left to
# fill, and if NaNs did remain they would receive the wrong category.
# Report how many FuelType values are still missing instead (read-only).
df['FuelType'].isna().sum()

In [34]:
# Filling with the third most frequent category is likewise redundant and
# would impute the wrong value if NaNs were still present.
# Show the FuelType distribution instead (read-only); dropna=False makes any
# remaining NaN visible as its own row.
df['FuelType'].value_counts(dropna=False)

In [35]:
# mode() returns a Series of the most frequent value(s); take the first one.
metMode = df['MetColor'].mode()
# Assign the result back rather than using fillna(..., inplace=True) on
# df['MetColor'], which is a chained in-place call that newer pandas warns
# about and that does not update df under Copy-on-Write.
df['MetColor'] = df['MetColor'].fillna(metMode[0])

#### Missing values are gone!

In [36]:
# Final check: count the NaN entries remaining in each column.
df.isna().sum(axis=0)

Unnamed: 0    0
Price         0
Age           0
KM            0
FuelType      0
HP            0
MetColor      0
Automatic     0
CC            0
Doors         0
Weight        0
dtype: int64