# Feature Selection and Dimensionality Reduction Project

About the dataset: A retail company “ABC Private Limited” wants to understand the customer purchase behaviour 
(specifically, purchase amount) against various products of different categories. They have 
shared purchase summaries of various customers for selected high volume products from last 
month.

In [1]:
# importing libraries
import pandas as pd
import numpy as np

In [2]:
# loading dataset
data = pd.read_csv('C:\\Users\\RKO\\Downloads\\DS C4-Project\\Problem Statement - Dimentionality Reduction-Dataset.csv')

In [3]:
data.head()

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969


In [4]:
# shape of the dataset
data.shape

(550068, 12)

In [5]:
# checking the datatypes of variables
data.dtypes

User_ID                         int64
Product_ID                     object
Gender                         object
Age                            object
Occupation                      int64
City_Category                  object
Stay_In_Current_City_Years     object
Marital_Status                  int64
Product_Category_1              int64
Product_Category_2            float64
Product_Category_3            float64
Purchase                        int64
dtype: object

In [6]:
# checking the missing values
data.isnull().sum()

User_ID                            0
Product_ID                         0
Gender                             0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Purchase                           0
dtype: int64

In [7]:
# percentage of missing values
data.isna().sum()/data.shape[0]

User_ID                       0.000000
Product_ID                    0.000000
Gender                        0.000000
Age                           0.000000
Occupation                    0.000000
City_Category                 0.000000
Stay_In_Current_City_Years    0.000000
Marital_Status                0.000000
Product_Category_1            0.000000
Product_Category_2            0.315666
Product_Category_3            0.696727
Purchase                      0.000000
dtype: float64

Product_Category_3 has almost 70% missing values

### Calculating Correlation Matrix

In [8]:
# dropping the target variable
df = data.drop(['Purchase'],axis=1)

In [9]:
# encoding the Age variable using map function
df['Age'] = df['Age'].map({'0-17':1,'18-25':2,'26-35':3,'36-45':4,'46-50':5,'51-55':6,'55+':6})

In [10]:
# encoding the City_Category variable using map function
df['City_Category'] = df['City_Category'].map({'A':1,'B':2,'C':3})

In [11]:
# encoding the Stay_In_Current_City_Years variable using map function
df['Stay_In_Current_City_Years'] = df['Stay_In_Current_City_Years'].map({'0':0,'1':1,'2':2,'3':3,'4+':4})

In [12]:
# dropping the categorical varibales
df = df.drop(['Product_ID'],axis=1)

In [13]:
# encoding the Gender variable
df = pd.get_dummies(df,drop_first=True)

In [14]:
df.head()

Unnamed: 0,User_ID,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Gender_M
0,1000001,1,10,1,2,0,3,,,0
1,1000001,1,10,1,2,0,1,6.0,14.0,0
2,1000001,1,10,1,2,0,12,,,0
3,1000001,1,10,1,2,0,12,14.0,,0
4,1000002,6,16,3,4,0,8,,,1


In [15]:
# correlation matrix for independent variabls
df.corr()

Unnamed: 0,User_ID,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Gender_M
User_ID,1.0,0.035797,-0.023971,0.022859,-0.030737,0.020443,0.003825,0.001529,0.003419,-0.033474
Age,0.035797,1.0,0.091237,0.116207,-0.006269,0.319946,0.060368,0.055039,0.057868,-0.005322
Occupation,-0.023971,0.091237,1.0,0.034479,0.030005,0.02428,-0.007618,-0.000384,0.013263,0.117291
City_Category,0.022859,0.116207,0.034479,1.0,0.019946,0.03979,-0.014364,-0.011822,-0.002347,-0.004515
Stay_In_Current_City_Years,-0.030737,-0.006269,0.030005,0.019946,1.0,-0.012819,-0.004213,-0.001657,0.002093,0.01466
Marital_Status,0.020443,0.319946,0.02428,0.03979,-0.012819,1.0,0.019888,0.015138,0.019473,-0.011603
Product_Category_1,0.003825,0.060368,-0.007618,-0.014364,-0.004213,0.019888,1.0,0.540583,0.229678,-0.045594
Product_Category_2,0.001529,0.055039,-0.000384,-0.011822,-0.001657,0.015138,0.540583,1.0,0.543649,-0.018591
Product_Category_3,0.003419,0.057868,0.013263,-0.002347,0.002093,0.019473,0.229678,0.543649,1.0,0.028069
Gender_M,-0.033474,-0.005322,0.117291,-0.004515,0.01466,-0.011603,-0.045594,-0.018591,0.028069,1.0


In [16]:
# printing the absolute values and the upper diagonal of the correlation matrix
corr_matrix = df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool))
upper

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape),k=1).astype(np.bool))


Unnamed: 0,User_ID,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Gender_M
User_ID,,0.035797,0.023971,0.022859,0.030737,0.020443,0.003825,0.001529,0.003419,0.033474
Age,,,0.091237,0.116207,0.006269,0.319946,0.060368,0.055039,0.057868,0.005322
Occupation,,,,0.034479,0.030005,0.02428,0.007618,0.000384,0.013263,0.117291
City_Category,,,,,0.019946,0.03979,0.014364,0.011822,0.002347,0.004515
Stay_In_Current_City_Years,,,,,,0.012819,0.004213,0.001657,0.002093,0.01466
Marital_Status,,,,,,,0.019888,0.015138,0.019473,0.011603
Product_Category_1,,,,,,,,0.540583,0.229678,0.045594
Product_Category_2,,,,,,,,,0.543649,0.018591
Product_Category_3,,,,,,,,,,0.028069
Gender_M,,,,,,,,,,


We can see the features Product_Category_2 and Product_Category_3 are highly correlated with correlation value 0.543649. And also Product_Category_1 and Product_Category_2 are highly correlated with correlation value 0.540583.

### Finding the Covariance Matrix

In [17]:
df.isna().sum()

User_ID                            0
Age                                0
Occupation                         0
City_Category                      0
Stay_In_Current_City_Years         0
Marital_Status                     0
Product_Category_1                 0
Product_Category_2            173638
Product_Category_3            383247
Gender_M                           0
dtype: int64

In [18]:
df.head()

Unnamed: 0,User_ID,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Gender_M
0,1000001,1,10,1,2,0,3,,,0
1,1000001,1,10,1,2,0,1,6.0,14.0,0
2,1000001,1,10,1,2,0,12,,,0
3,1000001,1,10,1,2,0,12,14.0,,0
4,1000002,6,16,3,4,0,8,,,1


In [19]:
# Standardise the dataset
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [20]:
data_scaled = scaler.fit_transform(df)

In [21]:
data_scaled = pd.DataFrame(data_scaled,columns=df.columns)

In [22]:
data_scaled.head()

Unnamed: 0,User_ID,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Gender_M
0,-1.752639,-1.94516,0.294864,-1.371516,0.109801,-0.833018,-0.610809,,,-1.746513
1,-1.752639,-1.94516,0.294864,-1.371516,0.109801,-0.833018,-1.118912,-0.755385,0.322825,-1.746513
2,-1.752639,-1.94516,0.294864,-1.371516,0.109801,-0.833018,1.675656,,,-1.746513
3,-1.752639,-1.94516,0.294864,-1.371516,0.109801,-0.833018,1.675656,0.81738,,-1.746513
4,-1.752061,2.012703,1.214734,1.259336,1.660861,-0.833018,0.659449,,,0.57257


In [23]:
data_scaled.cov()

Unnamed: 0,User_ID,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Gender_M
User_ID,1.000002,0.035797,-0.023971,0.022859,-0.030737,0.020443,0.003825,0.001532,0.003429,-0.033475
Age,0.035797,1.000002,0.091237,0.116207,-0.006269,0.319947,0.060368,0.054903,0.057345,-0.005322
Occupation,-0.023971,0.091237,1.000002,0.034479,0.030005,0.02428,-0.007618,-0.000383,0.013192,0.117291
City_Category,0.022859,0.116207,0.034479,1.000002,0.019946,0.039791,-0.014364,-0.011819,-0.002346,-0.004515
Stay_In_Current_City_Years,-0.030737,-0.006269,0.030005,0.019946,1.000002,-0.012819,-0.004213,-0.001654,0.002085,0.01466
Marital_Status,0.020443,0.319947,0.02428,0.039791,-0.012819,1.000002,0.019888,0.015119,0.019421,-0.011603
Product_Category_1,0.003825,0.060368,-0.007618,-0.014364,-0.004213,0.019888,1.000002,0.451724,0.150192,-0.045594
Product_Category_2,0.001532,0.054903,-0.000383,-0.011819,-0.001654,0.015119,0.451724,1.000003,0.480988,-0.018495
Product_Category_3,0.003429,0.057345,0.013192,-0.002346,0.002085,0.019421,0.150192,0.480988,1.000006,0.027198
Gender_M,-0.033475,-0.005322,0.117291,-0.004515,0.01466,-0.011603,-0.045594,-0.018495,0.027198,1.000002


In [24]:
# printing the upper diagonal of the covariance matrix
cov_matrix = data_scaled.cov()
upper = cov_matrix.where(np.triu(np.ones(cov_matrix.shape),k=1).astype(np.bool))
upper

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = cov_matrix.where(np.triu(np.ones(cov_matrix.shape),k=1).astype(np.bool))


Unnamed: 0,User_ID,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Gender_M
User_ID,,0.035797,-0.023971,0.022859,-0.030737,0.020443,0.003825,0.001532,0.003429,-0.033475
Age,,,0.091237,0.116207,-0.006269,0.319947,0.060368,0.054903,0.057345,-0.005322
Occupation,,,,0.034479,0.030005,0.02428,-0.007618,-0.000383,0.013192,0.117291
City_Category,,,,,0.019946,0.039791,-0.014364,-0.011819,-0.002346,-0.004515
Stay_In_Current_City_Years,,,,,,-0.012819,-0.004213,-0.001654,0.002085,0.01466
Marital_Status,,,,,,,0.019888,0.015119,0.019421,-0.011603
Product_Category_1,,,,,,,,0.451724,0.150192,-0.045594
Product_Category_2,,,,,,,,,0.480988,-0.018495
Product_Category_3,,,,,,,,,,0.027198
Gender_M,,,,,,,,,,


### Finding the Eigen values and Eigen vectors

In [25]:
# calculating eigen values and eigen vectors from the covariance matrix
from numpy.linalg import eig

eigen_values, eigen_vectors = eig(cov_matrix)

In [26]:
# printing eigen values and eigen vectors
print('Eigen values : ',eigen_values)
print('\nEigen vectors : ',eigen_vectors)

Eigen values :  [1.76183431 0.41031795 1.36343442 0.66153881 1.14490499 1.00771155
 0.99149596 0.94453221 0.82731826 0.88693474]

Eigen vectors :  [[ 1.90738502e-02  3.16164898e-03 -9.44013348e-02  3.32897049e-02
  -3.39671635e-01  2.44610431e-01  7.30483040e-01 -5.27917567e-01
  -3.66566295e-02 -2.73707068e-02]
 [ 1.82421444e-01  2.59220199e-02 -6.48443477e-01 -7.25571000e-01
  -6.51315301e-02  4.28451428e-03 -1.15462427e-01 -3.86790722e-02
  -5.09905684e-03  2.03219887e-03]
 [ 2.48966490e-02 -2.84020673e-04 -2.41952870e-01  1.45866825e-01
   5.84563021e-01  1.46263681e-01  1.45780392e-01 -3.46546545e-02
   2.45782549e-01 -6.88180816e-01]
 [ 1.45196547e-02 -2.96301089e-03 -3.11843899e-01  1.58751720e-01
   1.24651624e-02 -3.85652760e-01  5.47451992e-01  6.36711143e-01
  -8.79509317e-02  1.24932436e-01]
 [-6.20474320e-03  8.82305018e-04  1.03423411e-04 -6.29262983e-03
   2.80429439e-01 -7.97726124e-01  2.02869043e-02 -5.25351988e-01
  -2.66282865e-02  8.83070812e-02]
 [ 1.18637492e-01 

### Selecting the Principal Components

In [27]:
# arrange the magnitudes of eigen values in decreasing order
sorted_index = np.argsort(eigen_values)[::-1]
sorted_eigen_values = eigen_values[sorted_index]
sorted_eigen_values

array([1.76183431, 1.36343442, 1.14490499, 1.00771155, 0.99149596,
       0.94453221, 0.88693474, 0.82731826, 0.66153881, 0.41031795])

In [28]:
# arranginf the eigen vectors
sorted_eigen_vectors = eigen_vectors[:,sorted_index]
sorted_eigen_vectors

array([[ 1.90738502e-02, -9.44013348e-02, -3.39671635e-01,
         2.44610431e-01,  7.30483040e-01, -5.27917567e-01,
        -2.73707068e-02, -3.66566295e-02,  3.32897049e-02,
         3.16164898e-03],
       [ 1.82421444e-01, -6.48443477e-01, -6.51315301e-02,
         4.28451428e-03, -1.15462427e-01, -3.86790722e-02,
         2.03219887e-03, -5.09905684e-03, -7.25571000e-01,
         2.59220199e-02],
       [ 2.48966490e-02, -2.41952870e-01,  5.84563021e-01,
         1.46263681e-01,  1.45780392e-01, -3.46546545e-02,
        -6.88180816e-01,  2.45782549e-01,  1.45866825e-01,
        -2.84020673e-04],
       [ 1.45196547e-02, -3.11843899e-01,  1.24651624e-02,
        -3.85652760e-01,  5.47451992e-01,  6.36711143e-01,
         1.24932436e-01, -8.79509317e-02,  1.58751720e-01,
        -2.96301089e-03],
       [-6.20474320e-03,  1.03423411e-04,  2.80429439e-01,
        -7.97726124e-01,  2.02869043e-02, -5.25351988e-01,
         8.83070812e-02, -2.66282865e-02, -6.29262983e-03,
         8.

In [29]:
# Calculating the percentage of variance explained by each eigen vector
for i in sorted_eigen_values:
    print(i/sum(sorted_eigen_values))

0.17618302191576762
0.13634312589635092
0.11449023338807485
0.1007709216583989
0.09914936562672154
0.09445300189093488
0.08869326836409132
0.08273163374638277
0.0661537273816092
0.041031700131668146


In [30]:
# Selecting the two most important principal components
k_components = 2
eigen_vector_subset = sorted_eigen_vectors[:,0:k_components]

In [31]:
eigen_vector_subset

array([[ 1.90738502e-02, -9.44013348e-02],
       [ 1.82421444e-01, -6.48443477e-01],
       [ 2.48966490e-02, -2.41952870e-01],
       [ 1.45196547e-02, -3.11843899e-01],
       [-6.20474320e-03,  1.03423411e-04],
       [ 1.18637492e-01, -6.07654558e-01],
       [ 5.06132013e-01,  1.07190434e-01],
       [ 6.47397098e-01,  1.49473483e-01],
       [ 5.24749539e-01,  9.58940450e-02],
       [-2.75661441e-02, -5.04953866e-02]])

### Selecting 8 features using PCA

In [32]:
# Selecting 8 principal components
k_components = 8
eigen_vector_subset = sorted_eigen_vectors[:,0:k_components]

In [33]:
eigen_vector_subset

array([[ 1.90738502e-02, -9.44013348e-02, -3.39671635e-01,
         2.44610431e-01,  7.30483040e-01, -5.27917567e-01,
        -2.73707068e-02, -3.66566295e-02],
       [ 1.82421444e-01, -6.48443477e-01, -6.51315301e-02,
         4.28451428e-03, -1.15462427e-01, -3.86790722e-02,
         2.03219887e-03, -5.09905684e-03],
       [ 2.48966490e-02, -2.41952870e-01,  5.84563021e-01,
         1.46263681e-01,  1.45780392e-01, -3.46546545e-02,
        -6.88180816e-01,  2.45782549e-01],
       [ 1.45196547e-02, -3.11843899e-01,  1.24651624e-02,
        -3.85652760e-01,  5.47451992e-01,  6.36711143e-01,
         1.24932436e-01, -8.79509317e-02],
       [-6.20474320e-03,  1.03423411e-04,  2.80429439e-01,
        -7.97726124e-01,  2.02869043e-02, -5.25351988e-01,
         8.83070812e-02, -2.66282865e-02],
       [ 1.18637492e-01, -6.07654558e-01, -1.58164788e-01,
         6.41019082e-02, -3.27442224e-01, -1.74637905e-01,
         1.65677691e-01, -2.41390113e-02],
       [ 5.06132013e-01,  1.071904