# Educational Process Mining

https://archive-beta.ics.uci.edu/dataset/346/educational+process+mining+epm+a+learning+analytics+data+set

Educational Process Mining (EPM): A Learning Analytics Data Set. (2015). UCI Machine Learning Repository.

## PCA

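This notebook contains the PCA preprocessing steps and is meant to be included from other notebooks. The code cells below use `features_df` and `feature_names`, plus the usual libraries (`pd`, `np`, `sns`, `plt`, `StandardScaler`, `PCA`); none of these are defined in the cells shown here, so they must already be available when this notebook is run.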

### Standardize the numeric inputs

In [None]:
# Standardize every column of features_df with scikit-learn's StandardScaler
# (default settings: centre each column and scale it to unit variance).
# NOTE(review): features_df is not defined in the visible cells; it must already exist in the namespace.
Xtimepoints = StandardScaler().fit_transform( features_df )

In [None]:
# Display the dimensions of the standardized array as a quick sanity check.
Xtimepoints.shape

In [None]:
# One box per standardized feature; the array is wrapped in a DataFrame so that
# seaborn can label each box with its feature name.
sns.catplot(
    data=pd.DataFrame(Xtimepoints, columns=feature_names),
    kind='box',
    aspect=3.5,
)
plt.show()

### PCA

In [None]:
# Create the PCA estimator with default arguments; no n_components is passed,
# so scikit-learn decides how many components are kept.
pca_object = PCA()

In [None]:
# Display the class of the estimator object (sanity check).
type( pca_object )

In [None]:
# Learn the principal axes from the standardized features.
pca_object.fit( Xtimepoints )

In [None]:
# Project the standardized features onto the fitted components to obtain the PC scores.
timepoint_pcs = pca_object.transform( Xtimepoints )

In [None]:
# Display the dimensions of the score array; its second entry is used below as the number of PCs.
timepoint_pcs.shape

In [None]:
# Display the type of the object returned by transform().
type(timepoint_pcs)

In [None]:
# Column labels pc_01, pc_02, ... - one per column of the score array, zero-padded to two digits.
pc_names = [f'pc_{d:02d}' for d in range(1, timepoint_pcs.shape[1] + 1)]

In [None]:
# Display the generated column labels.
pc_names

In [None]:
# Wrap the PC scores in a DataFrame, using the pc_XX labels as column names.
timepoint_pcs_df = pd.DataFrame( timepoint_pcs, columns=pc_names )

In [None]:
# Display the PC-score table.
timepoint_pcs_df

In [None]:
# One box per principal-component score column.
sns.catplot(
    data=timepoint_pcs_df,
    kind='box',
    aspect=3.5,
)
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# Standard deviation of the scores of each PC. DataFrame.std() gives the same
# values as the 'std' row of describe() without computing the other summary
# statistics that were being discarded.
timepoint_pcs_df.std().plot(ax=ax)

# Label the axes so the figure can be read on its own.
ax.set_xlabel('PC')
ax.set_ylabel('Standard deviation of PC scores')

plt.show()

##### Correlation heatmap of the PC scores (off-diagonal values should be close to zero, since principal components are uncorrelated)

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# Pairwise correlations between the PC score columns, drawn on a diverging
# colour scale that is fixed to [-1, 1] and centred on zero.
sns.heatmap(
    timepoint_pcs_df.corr(),
    ax=ax,
    cmap='coolwarm',
    center=0,
    vmin=-1,
    vmax=1,
)
plt.show()

##### PCA Scree plot and total variance explained

In [None]:
fig, ax = plt.subplots(figsize=(12, 8))

# Scree plot: explained-variance ratio of each component against its 1-based PC number.
ax.plot(
    np.arange(1, len(pca_object.explained_variance_ratio_) + 1),
    pca_object.explained_variance_ratio_,
    'bo-',
)
ax.set(xlabel='PC', ylabel='Variance Explained Ratio')
plt.show()

In [None]:
# Running total of the per-component explained-variance ratios:
# entry k is the share of variance captured by the first k+1 PCs together.
total_var_explain = pca_object.explained_variance_ratio_.cumsum()

In [None]:
fig, ax = plt.subplots(figsize=(12, 6))

# Cumulative explained variance against the 1-based number of PCs retained.
ax.plot(
    np.arange(1, len(total_var_explain) + 1),
    total_var_explain,
    'bo-',
)
ax.set(xlabel='PC', ylabel='Total variance explained')
plt.show()

In [None]:
# Same PC scores as a DataFrame, with columns named PC01, PC02, ... so they can
# be reached by attribute access (pc_scores_df.PC01) in the biplots below.
pc_scores_df = pd.DataFrame(
    timepoint_pcs,
    columns=[f'PC{k:02d}' for k in range(1, timepoint_pcs.shape[1] + 1)],
)

In [None]:
# Print a summary of pc_scores_df (column names, non-null counts, dtypes).
pc_scores_df.info()

##### PCA Contribution

In [None]:
def calc_pca_contrib(load_mat):
    """Return each loading's share of its column's total squared loading.

    Every element of ``load_mat`` is squared and then divided by the sum of
    the squared elements in the same column (``axis=0``), so each column of
    the result with a non-zero sum adds up to 1. Columns are the unit being
    normalised: pass the matrix with one principal component per column to
    obtain per-PC contributions.

    Parameters
    ----------
    load_mat : object supporting ``** 2`` and ``.sum(axis=0)``
        Loadings matrix (for example a 2-D NumPy array).

    Returns
    -------
    Same kind of object as ``load_mat``
        Squared loadings normalised column by column.
    """
    squared = load_mat ** 2
    # Column totals broadcast across the rows during the division.
    return squared / squared.sum(axis=0)

In [None]:
fig, ax = plt.subplots(figsize=(36, 18))

# pca_object.components_ is laid out as (n_components, n_features), while
# calc_pca_contrib normalises each *column*. Transpose first so that columns
# are PCs and every PC's contributions sum to 1. (Normalising the untransposed
# matrix only gives the same numbers when PCA keeps a full square set of
# components.)
pca_contrib = calc_pca_contrib(pca_object.components_.T)

# A feature is highlighted when it contributes more than it would if all
# features contributed equally, i.e. more than 1 / n_features.
n_features = pca_contrib.shape[0]

sns.heatmap(data=pca_contrib > (1 / n_features),
            xticklabels=pc_scores_df.columns.tolist(),  # one label per PC column
            yticklabels=feature_names,                   # one label per feature row
            vmin=0, vmax=1,
            ax=ax)

plt.show()

##### Biplot for the first two PCs

In [None]:
fig, ax = plt.subplots(figsize=(27, 27))

ax.scatter(pc_scores_df.PC01, pc_scores_df.PC02, s=11)

# Stretch the loadings by the range of each score so the arrows are visible
# on the same axes as the scatter.
x_span = pc_scores_df.PC01.max() - pc_scores_df.PC01.min()
y_span = pc_scores_df.PC02.max() - pc_scores_df.PC02.min()

# Rows 0 and 1 of components_ hold the loadings of PC01 and PC02.
for feature, x_load, y_load in zip(features_df.columns,
                                   pca_object.components_[0],
                                   pca_object.components_[1]):
    ax.arrow(0, 0, x_load * x_span, y_load * y_span, color='r')
    # The label sits 15% beyond the arrow tip.
    ax.text(x_load * 1.15 * x_span,
            y_load * 1.15 * y_span,
            feature,
            color='black', ha='center', va='center')

ax.set(xlabel='PC01', ylabel='PC02')
plt.show()

##### Biplot for PC02 and PC03

In [None]:
fig, ax = plt.subplots(figsize=(27, 27))

ax.scatter(pc_scores_df.PC02, pc_scores_df.PC03, s=11)

# Row k of pca_object.components_ holds the loadings of PC k+1, so the arrows
# for a PC02-vs-PC03 plot must come from rows 1 and 2 (rows 0 and 1 are the
# PC01/PC02 loadings and would not match the plotted scores).
for i in range(pca_object.components_.shape[1]):
    # Arrow from the origin, stretched by the range of each score so it is visible.
    ax.arrow(0, 0,
             pca_object.components_[1, i] * (pc_scores_df.PC02.max() - pc_scores_df.PC02.min()),
             pca_object.components_[2, i] * (pc_scores_df.PC03.max() - pc_scores_df.PC03.min()),
             color='r')

    # Feature label placed 15% beyond the arrow tip.
    ax.text(pca_object.components_[1, i] * 1.15 * (pc_scores_df.PC02.max() - pc_scores_df.PC02.min()),
            pca_object.components_[2, i] * 1.15 * (pc_scores_df.PC03.max() - pc_scores_df.PC03.min()),
            features_df.columns[i],
            color = 'black', ha = 'center', va = 'center')

ax.set_xlabel('PC02')
ax.set_ylabel('PC03')

plt.show()

##### Biplot for PC03 and PC04

In [None]:
fig, ax = plt.subplots(figsize=(27, 27))

ax.scatter(pc_scores_df.PC03, pc_scores_df.PC04, s=11)

# Row k of pca_object.components_ holds the loadings of PC k+1, so the arrows
# for a PC03-vs-PC04 plot must come from rows 2 and 3 (rows 0 and 1 are the
# PC01/PC02 loadings and would not match the plotted scores).
for i in range(pca_object.components_.shape[1]):
    # Arrow from the origin, stretched by the range of each score so it is visible.
    ax.arrow(0, 0,
             pca_object.components_[2, i] * (pc_scores_df.PC03.max() - pc_scores_df.PC03.min()),
             pca_object.components_[3, i] * (pc_scores_df.PC04.max() - pc_scores_df.PC04.min()),
             color='r')

    # Feature label placed 15% beyond the arrow tip.
    ax.text(pca_object.components_[2, i] * 1.15 * (pc_scores_df.PC03.max() - pc_scores_df.PC03.min()),
            pca_object.components_[3, i] * 1.15 * (pc_scores_df.PC04.max() - pc_scores_df.PC04.min()),
            features_df.columns[i],
            color = 'black', ha = 'center', va = 'center')

ax.set_xlabel('PC03')
ax.set_ylabel('PC04')

plt.show()

##### Biplot for PC04 and PC05

In [None]:
fig, ax = plt.subplots(figsize=(27, 27))

ax.scatter(pc_scores_df.PC04, pc_scores_df.PC05, s=11)

# Row k of pca_object.components_ holds the loadings of PC k+1, so the arrows
# for a PC04-vs-PC05 plot must come from rows 3 and 4 (rows 0 and 1 are the
# PC01/PC02 loadings and would not match the plotted scores).
for i in range(pca_object.components_.shape[1]):
    # Arrow from the origin, stretched by the range of each score so it is visible.
    ax.arrow(0, 0,
             pca_object.components_[3, i] * (pc_scores_df.PC04.max() - pc_scores_df.PC04.min()),
             pca_object.components_[4, i] * (pc_scores_df.PC05.max() - pc_scores_df.PC05.min()),
             color='r')

    # Feature label placed 15% beyond the arrow tip.
    ax.text(pca_object.components_[3, i] * 1.15 * (pc_scores_df.PC04.max() - pc_scores_df.PC04.min()),
            pca_object.components_[4, i] * 1.15 * (pc_scores_df.PC05.max() - pc_scores_df.PC05.min()),
            features_df.columns[i],
            color = 'black', ha = 'center', va = 'center')

ax.set_xlabel('PC04')
ax.set_ylabel('PC05')

plt.show()