# Chapter 4: Dimension Reduction

> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck
>
> Code included in
>
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition)
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.

## Import required packages

Make sure the `dmba` package is available.

In [1]:
pip install dmba #install the dmba package

Collecting dmba
  Downloading dmba-0.2.4-py3-none-any.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dmba
Successfully installed dmba-0.2.4


In [2]:
#import required libraries
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pylab as plt

import dmba

%matplotlib inline


Colab environment detected.


## Table 4.3

In [3]:
# Load the Boston Housing data through dmba and, in the same step, rename the
# 'CAT. MEDV' column to 'CAT_MEDV' so the name contains no dot or space.
bostonHousing_df = (
    dmba.load_data('BostonHousing.csv')
    .rename(columns={'CAT. MEDV': 'CAT_MEDV'})
)
bostonHousing_df.head(n=9)  # preview the first nine rows

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,CAT_MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2,1
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,5.21,28.7,0
6,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,12.43,22.9,0
7,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,19.15,27.1,0
8,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,29.93,16.5,0


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for every column of the dataset
bostonHousing_df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,CAT_MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,12.653063,22.532806,0.166008
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,7.141062,9.197104,0.372456
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,1.73,5.0,0.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,6.95,17.025,0.0
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,11.36,21.2,0.0
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,16.955,25.0,0.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,37.97,50.0,1.0


Compute mean, standard deviation, min, max, median, length, and missing values of CRIM

In [None]:
# Summary statistics of the single variable CRIM, printed one per line.
crim = bostonHousing_df['CRIM']
crim_summary = [
    ('Mean : ', crim.mean()),
    ('Std. dev : ', crim.std()),
    ('Min : ', crim.min()),
    ('Max : ', crim.max()),
    ('Median : ', crim.median()),
    ('Length : ', len(crim)),  # total number of observations
    ('Number of missing values : ', crim.isnull().sum()),
]
for label, value in crim_summary:
    print(label, value)

Mean :  3.613523557312254
Std. dev :  8.60154510533249
Min :  0.00632
Max :  88.9762
Median :  0.25651
Length :  506
Number of missing values :  0


Compute mean, standard dev., min, max, median, length, and missing values for all variables

In [None]:
# One row per variable, one column per summary statistic.
# The per-variable statistics are collected first; the row count (identical for
# every variable) and the per-variable missing-value counts are appended after.
pd.DataFrame({
    'mean': bostonHousing_df.mean(),
    'sd': bostonHousing_df.std(),
    'min': bostonHousing_df.min(),
    'max': bostonHousing_df.max(),
    'median': bostonHousing_df.median(),
}).assign(**{
    'length': len(bostonHousing_df),
    'miss.val': bostonHousing_df.isnull().sum(),
})

Unnamed: 0,mean,sd,min,max,median,length,miss.val
CRIM,3.613524,8.601545,0.00632,88.9762,0.25651,506,0
ZN,11.363636,23.322453,0.0,100.0,0.0,506,0
INDUS,11.136779,6.860353,0.46,27.74,9.69,506,0
CHAS,0.06917,0.253994,0.0,1.0,0.0,506,0
NOX,0.554695,0.115878,0.385,0.871,0.538,506,0
RM,6.284634,0.702617,3.561,8.78,6.2085,506,0
AGE,68.574901,28.148861,2.9,100.0,77.5,506,0
DIS,3.795043,2.10571,1.1296,12.1265,3.20745,506,0
RAD,9.549407,8.707259,1.0,24.0,5.0,506,0
TAX,408.237154,168.537116,187.0,711.0,330.0,506,0


## Table 4.4

In [None]:
# Pairwise correlation matrix of all variables, rounded to two decimals
bostonHousing_df.corr().round(2)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,CAT_MEDV
CRIM,1.0,-0.2,0.41,-0.06,0.42,-0.22,0.35,-0.38,0.63,0.58,0.29,0.46,-0.39,-0.15
ZN,-0.2,1.0,-0.53,-0.04,-0.52,0.31,-0.57,0.66,-0.31,-0.31,-0.39,-0.41,0.36,0.37
INDUS,0.41,-0.53,1.0,0.06,0.76,-0.39,0.64,-0.71,0.6,0.72,0.38,0.6,-0.48,-0.37
CHAS,-0.06,-0.04,0.06,1.0,0.09,0.09,0.09,-0.1,-0.01,-0.04,-0.12,-0.05,0.18,0.11
NOX,0.42,-0.52,0.76,0.09,1.0,-0.3,0.73,-0.77,0.61,0.67,0.19,0.59,-0.43,-0.23
RM,-0.22,0.31,-0.39,0.09,-0.3,1.0,-0.24,0.21,-0.21,-0.29,-0.36,-0.61,0.7,0.64
AGE,0.35,-0.57,0.64,0.09,0.73,-0.24,1.0,-0.75,0.46,0.51,0.26,0.6,-0.38,-0.19
DIS,-0.38,0.66,-0.71,-0.1,-0.77,0.21,-0.75,1.0,-0.49,-0.53,-0.23,-0.5,0.25,0.12
RAD,0.63,-0.31,0.6,-0.01,0.61,-0.21,0.46,-0.49,1.0,0.91,0.46,0.49,-0.38,-0.2
TAX,0.58,-0.31,0.72,-0.04,0.67,-0.29,0.51,-0.53,0.91,1.0,0.46,0.54,-0.47,-0.27


## Table 4.5

In [None]:
# Number of observations in each category (0 / 1) of the variable CHAS
bostonHousing_df.CHAS.value_counts()

CHAS
0    471
1     35
Name: count, dtype: int64

## Table 4.6
Create bins of size 1 for the variable `RM` using the method `pd.cut`. By default, the method creates a categorical variable with interval labels, e.g. `(6, 7]`. The argument `labels=False` returns integer bin indicators instead, e.g. `6`.

In [None]:
# Bin RM into unit-width intervals with edges 0, 1, ..., 9; labels=False stores
# the integer position of the bin instead of an interval label.
bostonHousing_df['RM_bin'] = pd.cut(
    bostonHousing_df['RM'],
    bins=range(0, 10),
    labels=False,
)
bostonHousing_df.head()  # first five rows, now including RM_bin

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,CAT_MEDV,RM_bin
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0,0,6
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6,0,6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7,1,7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4,1,6
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2,1,7


## Table 4.10
Compute principal components on two dimensions

In [5]:
# Load the cereals data and fit a two-component PCA on calories and rating.
cereals_df = dmba.load_data('Cereals.csv')
# fit() returns the fitted estimator itself, so it can be bound to pcs directly.
pcs = PCA(n_components=2).fit(cereals_df[['calories', 'rating']])
pcs

The importance of components can be assessed using the explained variance.

In [6]:
# Importance of the components, one row per measure and one column per component:
#   - standard deviation: square root of each component's explained variance
#   - proportion of variance: explained variance ratio of each component
#   - cumulative proportion: running total of those ratios
pcsSummary = pd.DataFrame(
    [
        np.sqrt(pcs.explained_variance_),
        pcs.explained_variance_ratio_,
        np.cumsum(pcs.explained_variance_ratio_),
    ],
    index=['Standard deviation', 'Proportion of variance', 'Cumulative proportion'],
    columns=['PC1', 'PC2'],
)
pcsSummary.round(4)  # display with four decimals

Unnamed: 0,PC1,PC2
Standard deviation,22.3165,8.8844
Proportion of variance,0.8632,0.1368
Cumulative proportion,0.8632,1.0


The `components_` field of `pcs` gives the individual components, one per row, with one column per input variable. After transposing, the columns of the matrix are the principal components `PC1`, `PC2`, and the rows are the variables in the order they are found in the input matrix, `calories` and `rating`.

In [None]:
# Component weights (loadings): pcs.components_ is transposed so that each row
# is an input variable and each column is a principal component.
pcsComponents_df = pd.DataFrame(
    data=pcs.components_.T,
    index=['calories', 'rating'],
    columns=['PC1', 'PC2'],
)
pcsComponents_df

Unnamed: 0,PC1,PC2
calories,-0.847053,0.531508
rating,0.531508,0.847053


Use the `transform` method to get the scores.

In [7]:
# Project the two input variables onto the fitted components to get the
# principal component scores, one row per cereal.
scores = pd.DataFrame(
    data=pcs.transform(cereals_df[['calories', 'rating']]),
    columns=['PC1', 'PC2'],
)
scores.head(n=5)

Unnamed: 0,PC1,PC2
0,44.921528,2.197183
1,-15.725265,-0.382416
2,40.149935,-5.407212
3,75.310772,12.999126
4,-7.041508,-5.357686
