## **Import necessary libraries**

In [20]:
import io

import numpy as np
import pandas as pd
from sklearn import preprocessing


## 1. Load the above Dataset

In [21]:
# Read the CSV into a DataFrame. index_col=False tells pandas not to
# promote the first column of the file to the row index.
dataset = pd.read_csv(
    'data.csv',
    index_col=False,
)
# Bare trailing expression so the notebook renders the frame.
dataset

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
506,0.98765,0.0,12.50,0,0.561,6.980,89.0,2.0980,3,320,23.0,396.00,12.00,12.0
507,0.23456,0.0,12.50,0,0.561,6.980,76.0,2.6540,3,320,23.0,343.00,25.00,32.0
508,0.44433,0.0,12.50,0,0.561,6.123,98.0,2.9870,3,320,23.0,343.00,21.00,54.0
509,0.77763,0.0,12.70,0,0.561,6.222,34.0,2.5430,3,329,23.0,343.00,76.00,67.0


## 2. Get the number of rows and columns

In [22]:
# DataFrame.shape is a (row count, column count) tuple, so both
# dimensions can be unpacked in one step.
num_rows, num_columns = dataset.shape
print('Number of rows: %s' % num_rows)
print('Number of columns: %s' % num_columns)

Number of rows: 511
Number of columns: 14


## 3. Use the `describe` function on the dataset

In [23]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# A column whose count is lower than the number of rows contains missing values.
dataset_describe = dataset.describe()
# Bare trailing expression so the notebook renders the summary table.
dataset_describe

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,511.0,511.0,511.0,511.0,511.0,506.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0,511.0
mean,3.584139,11.252446,11.151096,0.068493,0.554757,6.287589,68.616243,3.783876,9.485323,407.440313,18.5,356.6009,12.87955,22.682192
std,8.564433,23.234838,6.828175,0.252838,0.11531,0.703802,28.09913,2.098631,8.688469,167.903532,2.200348,90.882679,7.797416,9.484262
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082325,0.0,5.19,0.0,0.449,5.8855,45.05,2.10035,4.0,279.5,17.4,374.71,7.065,17.05
50%,0.26169,0.0,9.69,0.0,0.538,6.209,77.3,3.1523,5.0,330.0,19.1,391.34,11.45,21.2
75%,3.621175,12.5,18.1,0.0,0.624,6.62975,94.05,5.118,24.0,666.0,20.2,396.21,17.105,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,23.0,396.9,76.0,67.0


## 4. Use the `info` function on the dataset

In [24]:
# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so assigning its return value would leave dataset_info empty.
# Capture the report in a StringIO instead so dataset_info holds the text.
info_buffer = io.StringIO()
dataset.info(buf=info_buffer)
dataset_info = info_buffer.getvalue()
# print() rather than a bare expression: the report is multi-line text and
# the notebook would otherwise show its repr with escaped newlines.
print(dataset_info)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 511 entries, 0 to 510
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   CRIM     511 non-null    float64
 1   ZN       511 non-null    float64
 2   INDUS    511 non-null    float64
 3   CHAS     511 non-null    int64  
 4   NOX      511 non-null    float64
 5   RM       506 non-null    float64
 6   AGE      511 non-null    float64
 7   DIS      511 non-null    float64
 8   RAD      511 non-null    int64  
 9   TAX      511 non-null    int64  
 10  PTRATIO  511 non-null    float64
 11  B        511 non-null    float64
 12  LSTAT    511 non-null    float64
 13  MEDV     511 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 56.0 KB


## 5. Count the number of missing values

In [25]:
# isna() flags every missing cell as True; summing the booleans down each
# column yields the number of missing values per column.
missing_values_count = dataset.isna().sum()
print(missing_values_count)

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         5
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64


## 6. Replace the missing values with the average of non-null values.

In [26]:
# Series.mean() skips NaN by default, so this is the average of the
# non-null RM values; use it to fill the gaps in that column.
dataset['RM'] = dataset['RM'].fillna(dataset['RM'].mean())
# Re-count the missing values per column to confirm none remain.
new_missing_values_count = dataset.isna().sum()
new_missing_values_count

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

## 7. Perform data normalization on all the features.

In [27]:
# Convert to a plain 2-D NumPy array. Note that this rebinds `dataset`,
# so column-name access such as dataset['RM'] no longer works after this cell.
dataset = np.array(dataset)
# Take a real, independent copy for the standardization step. A plain
# assignment would only create a second name for the same array, so any
# in-place change to `dataset` would silently alter `dataset_copy` as well.
dataset_copy = dataset.copy()
# normalize() returns a new array and leaves its input untouched (copy=True).
# NOTE(review): with the defaults (norm='l2', axis=1) each row (sample) is
# scaled to unit L2 norm. If per-feature scaling was intended, axis=0 or a
# min-max scaler would be needed -- confirm the intended behavior.
normalized_dataset = preprocessing.normalize(dataset)
normalized_dataset

array([[1.26243020e-05, 3.59552904e-02, 4.61426227e-03, ...,
        7.92814154e-01, 9.94763036e-03, 4.79403873e-02],
       [5.77925202e-05, 0.00000000e+00, 1.49613006e-02, ...,
        8.39906675e-01, 1.93417662e-02, 4.57092068e-02],
       [5.84112194e-05, 0.00000000e+00, 1.51325512e-02, ...,
        8.40809063e-01, 8.62576820e-03, 7.42715029e-02],
       ...,
       [9.19027982e-04, 0.00000000e+00, 2.58543195e-02, ...,
        7.09442527e-01, 4.34352567e-02, 1.11690660e-01],
       [1.59379730e-03, 0.00000000e+00, 2.60293786e-02, ...,
        7.02998177e-01, 1.55766360e-01, 1.37320344e-01],
       [1.36460903e-03, 0.00000000e+00, 2.66948825e-02, ...,
        6.69457601e-01, 9.38491964e-02, 5.00529047e-02]])

## 8. Perform data standardization on all the features.

In [28]:
# Standardize every column to zero mean and unit variance.
standard_scaler = preprocessing.StandardScaler()
# Learn each column's mean and standard deviation first ...
standard_scaler.fit(dataset_copy)
# ... then apply the learned scaling to the same data.
scaled_dataset = standard_scaler.transform(dataset_copy)
scaled_dataset

array([[-0.41816246,  0.29069132, -1.29606519, ...,  0.44385344,
        -1.01409118,  0.13908299],
       [-0.41570923, -0.48476656, -0.59827044, ...,  0.44385344,
        -0.48005831, -0.11421576],
       [-0.41571156, -0.48476656, -0.59827044, ...,  0.39902654,
        -1.13604581,  1.26837325],
       ...,
       [-0.36696944, -0.48476656,  0.19774332, ..., -0.14980003,
         1.04244887,  3.30531737],
       [-0.32801455, -0.48476656,  0.22706243, ..., -0.14980003,
         8.10297964,  4.67735227],
       [-0.34242657, -0.48476656,  0.24172198, ..., -0.39210757,
         4.12340775,  0.13908299]])