# Dimensionality Reduction

Robin Burke
March 12, 2020

### Imports

In [80]:
import pandas as pd
from sklearn.decomposition import PCA
from prince import MCA

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score

import warnings

warnings.filterwarnings("ignore", category=FutureWarning)

import matplotlib.pyplot as plt

### Read the data

In [81]:
data = pd.read_csv('Data/TrainingData.csv')

cols = data.columns

print (data.shape)

data.head()


(44102, 130)


Unnamed: 0,UNIQUE_ID,Overall_Rating,Technical_Skills,Teamwork,Customer_Service,Hire_Again,High_Performer,Protected_Group,Retained,SJ_Most_1,...,PScale12_Q1,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5,split
0,245021089,3.0,3.0,4.0,4.0,4.0,0.0,0.0,1,3.0,...,1.0,1.0,3.0,4.0,1.0,2.0,3.0,2.0,1.0,train
1,245181465,5.0,5.0,5.0,5.0,5.0,1.0,1.0,0,3.0,...,1.0,1.0,4.0,4.0,2.0,1.0,4.0,4.0,4.0,train
2,229682665,3.0,3.0,3.0,3.0,4.0,0.0,1.0,0,2.0,...,1.0,1.0,4.0,4.0,1.0,1.0,4.0,4.0,4.0,train
3,245174982,4.0,4.0,4.0,4.0,5.0,1.0,0.0,1,2.0,...,1.0,1.0,2.0,4.0,1.0,1.0,3.0,3.0,3.0,train
4,244979030,2.0,2.0,3.0,2.0,3.0,0.0,0.0,1,3.0,...,2.0,1.0,4.0,4.0,2.0,1.0,4.0,3.0,2.0,train


### Extract the situational judgement columns

Right now am ignoring the time. This could be an interesting variable, too. We will come back and look at the rows with missing data later.

In [83]:
sj_cols = cols[cols.str.contains('SJ_[Most|Least]')]
sj_df = data.loc[:, sj_cols].copy()
sj_df.dropna(inplace=True)
sj_df = sj_df.astype('int')

sj_df.head()

Unnamed: 0,SJ_Most_1,SJ_Least_1,SJ_Most_2,SJ_Least_2,SJ_Most_3,SJ_Least_3,SJ_Most_4,SJ_Least_4,SJ_Most_5,SJ_Least_5,SJ_Most_6,SJ_Least_6,SJ_Most_7,SJ_Least_7,SJ_Most_8,SJ_Least_8,SJ_Most_9,SJ_Least_9
0,3,4,3,2,1,4,1,2,2,4,2,4,4,3,1,4,1,3
1,3,4,3,2,1,4,1,2,2,3,3,4,1,4,1,3,2,4
2,2,4,1,4,1,3,1,2,1,3,2,4,1,3,1,3,1,4
3,2,1,1,2,1,4,1,2,2,4,4,1,1,2,2,3,3,4
4,3,2,3,2,1,4,2,3,2,3,2,4,1,2,1,4,1,3


Convert to dummy variables because these are different options that employees picked

In [84]:
mca = MCA(n_components=5)
mca.fit(sj_df)

MCA(check_input=True, copy=True, engine='auto', n_components=5, n_iter=10,
    random_state=None)

In [85]:
mca.explained_inertia_

[0.042490579748388456,
 0.036074806458672046,
 0.03414276018714455,
 0.029040037555755002,
 0.02616090548282249]

Doesn't seem very promising

### Extract the biodata columns

In [86]:
bio_cols = cols[cols.str.contains('Biodata_')]
bio_df = data.loc[:, bio_cols].copy()
bio_df.dropna(inplace=True)
bio_df = bio_df.astype('int')

bio_df.head()

Unnamed: 0,Biodata_01,Biodata_02,Biodata_03,Biodata_04,Biodata_05,Biodata_06,Biodata_07,Biodata_08,Biodata_09,Biodata_10,Biodata_11,Biodata_12,Biodata_13,Biodata_14,Biodata_15,Biodata_16,Biodata_17,Biodata_18,Biodata_19,Biodata_20
0,2,3,2,2,2,1,3,7,2,5,2,2,4,1,2,5,2,1,4,1
1,2,5,5,2,3,1,1,5,2,6,1,1,4,3,2,5,2,4,6,1
2,2,3,1,1,2,2,2,8,5,7,1,2,1,2,2,5,2,5,1,1
3,2,1,2,7,1,1,2,1,7,1,1,2,4,3,2,5,7,7,2,1
4,3,1,6,6,1,3,2,1,1,1,1,2,4,4,3,1,3,1,3,1


In [87]:
pca = PCA(n_components=8)
pca.fit(bio_df)

PCA(copy=True, iterated_power='auto', n_components=8, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [88]:
pca.explained_variance_ratio_.sum()

0.8593032608326032

8 components gives us 86% of the variance. 60% reduction in the number of variables. Could be useful. 10 components is 95%.

### Extract the personality scale columns

In [54]:
ps_cols = cols[cols.str.contains('PScale')]
ps_df = data.loc[:, ps_cols].copy()
ps_df.dropna(inplace=True)
ps_df = ps_df.astype('int')

ps_df.head()

Unnamed: 0,PScale01_Q1,PScale01_Q2,PScale01_Q3,PScale01_Q4,PScale02_Q1,PScale02_Q2,PScale02_Q3,PScale02_Q4,PScale03_Q1,PScale03_Q2,...,PScale11_Q4,PScale12_Q1,PScale12_Q2,PScale12_Q3,PScale12_Q4,PScale13_Q1,PScale13_Q2,PScale13_Q3,PScale13_Q4,PScale13_Q5
0,4,1,1,4,3,1,4,4,1,3,...,1,1,1,3,4,1,2,3,2,1
1,4,1,1,4,4,1,4,4,1,4,...,1,1,1,4,4,2,1,4,4,4
2,4,1,1,4,4,1,4,4,1,4,...,4,1,1,4,4,1,1,4,4,4
3,3,3,3,3,4,1,4,3,1,4,...,2,1,1,2,4,1,1,3,3,3
4,2,3,3,2,3,2,3,2,1,3,...,4,2,1,4,4,2,1,4,3,2


## PCA on the personality scales

In [96]:
pca = PCA(n_components=30)
pca.fit(ps_df)

PCA(copy=True, iterated_power='auto', n_components=30, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [97]:
pca.explained_variance_ratio_.sum()

0.8500290755036111

In [98]:
ps_df.shape

(39259, 55)

OK. It is possible to reduce the dimensionality by 40% or so.

## Looking at individual scales

In [56]:
sc1 = ps_df.iloc[:, 0:4]

sc1.head()

Unnamed: 0,PScale01_Q1,PScale01_Q2,PScale01_Q3,PScale01_Q4
0,4,1,1,4
1,4,1,1,4
2,4,1,1,4
3,3,3,3,3
4,2,3,3,2


## Clustering the first scale

### Two clusters

In [64]:
kmeans = KMeans(n_clusters=2)
kmeans.fit(sc1)

kmeans.cluster_centers_

array([[3.30481731, 1.4208983 , 1.57654177, 3.39390272],
       [1.67576235, 2.74243277, 2.91133116, 1.62687071]])

In [65]:
cluster_labels = kmeans.fit_predict(sc1)
silhouette_score(sc1, cluster_labels)


0.42959542469031886

### Three clusters

In [61]:
kmeans = KMeans(n_clusters=3)
kmeans.fit(sc1)

kmeans.cluster_centers_

array([[3.74870119, 1.14069046, 1.27333669, 3.80668678],
       [1.56874108, 3.16880826, 3.29369279, 1.52280171],
       [2.42398495, 1.81138929, 2.02386821, 2.48235828]])

In [62]:
cluster_labels = kmeans.fit_predict(sc1)
silhouette_score(sc1, cluster_labels)

In [63]:
silhouette_avg

0.3504041872726267

### Four clusters

In [66]:
kmeans = KMeans(n_clusters=4)
kmeans.fit(sc1)

kmeans.cluster_centers_

array([[1.68442889, 1.73201089, 2.0210035 , 1.52975496],
       [2.84799113, 2.01808411, 2.28746908, 3.02772328],
       [3.75837254, 1.06245964, 1.14632346, 3.81585017],
       [1.52192787, 3.42152716, 3.42709261, 1.46716385]])

In [67]:
cluster_labels = kmeans.fit_predict(sc1)
silhouette_score(sc1, cluster_labels)

0.364890618770901

As expected, there is decent clustering of the individual personality scales. Another option would to try to cluster the whole set at once. 

### Need to do similar for the other questions

In [None]:
ps_cols = cols[cols.str.contains('PScale')]
ps_df = data.loc[:, ps_cols].copy()
ps_df.dropna(inplace=True)
ps_df = ps_df.astype('int')

ps_df.head()