In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
# from sklearn.metrics import adjusted_rand_score
# from sklearn.metrics import adjusted_mutual_info_score as mi
# from sklearn.metrics import homogeneity_score as hs
from sklearn.metrics import confusion_matrix
import matplotlib
import seaborn as sns
from scipy import stats
from imblearn.over_sampling import RandomOverSampler 
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE


In [2]:
# Load all required data
from google.colab import drive
drive.mount('/content/gdrive')
% cd '/content/gdrive/MyDrive/Colab Notebooks/Unsupervised'
essayEncode_file = 'essayEncodings_BERT.npz'
essayClass_file = 'essayClass.csv'
essayRating_file = 'essayRating.csv'
#os.chdir(npz_loc)

essayEncode = np.load(essayEncode_file)
essayEncode = essayEncode['arr_0']
essayClass = pd.read_csv(essayClass_file)
essayRating = pd.read_csv(essayRating_file)
print(essayEncode.shape)
print(essayClass.shape)
print(essayRating.shape)


Mounted at /content/gdrive
/content/gdrive/MyDrive/Colab Notebooks/Unsupervised
(36765, 768)
(36765, 1)
(36765, 1)


In [3]:
essayClassOneHot = pd.get_dummies(essayClass.discourse_type).to_numpy()
essayClassOneHot_1 = pd.get_dummies(essayClass.discourse_type)
essayClassDummies = pd.factorize(essayClass.discourse_type)
essayClassDummies = essayClassDummies[0]
essayRatingDummies = pd.factorize(essayRating.discourse_effectiveness)
essayRatingDummies = essayRatingDummies[0]
print(essayClassOneHot.shape)
print(essayClassDummies.shape)
print(essayRatingDummies.shape)

(36765, 7)
(36765,)
(36765,)


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(np.concatenate((essayEncode,essayClassOneHot),axis=1),
                                                    essayRatingDummies, test_size=0.2, random_state=42, shuffle = True)

In [7]:
print(X_train.shape)
print(X_test.shape)
X_super=np.concatenate((X_train, X_test), axis=0)
print(X_super.shape)


(29412, 775)
(7353, 775)
(36765, 775)


In [11]:
tsne = TSNE(n_components=2, verbose=1)
xTSNE = tsne.fit_transform(X_super)





[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 36765 samples in 0.039s...
[t-SNE] Computed neighbors for 36765 samples in 88.120s...
[t-SNE] Computed conditional probabilities for sample 1000 / 36765
[t-SNE] Computed conditional probabilities for sample 2000 / 36765
[t-SNE] Computed conditional probabilities for sample 3000 / 36765
[t-SNE] Computed conditional probabilities for sample 4000 / 36765
[t-SNE] Computed conditional probabilities for sample 5000 / 36765
[t-SNE] Computed conditional probabilities for sample 6000 / 36765
[t-SNE] Computed conditional probabilities for sample 7000 / 36765
[t-SNE] Computed conditional probabilities for sample 8000 / 36765
[t-SNE] Computed conditional probabilities for sample 9000 / 36765
[t-SNE] Computed conditional probabilities for sample 10000 / 36765
[t-SNE] Computed conditional probabilities for sample 11000 / 36765
[t-SNE] Computed conditional probabilities for sample 12000 / 36765
[t-SNE] Computed conditional probabilities for sa

In [12]:
xTSNE_train, xTSNE_test = train_test_split(xTSNE, test_size=0.2,shuffle = False)


In [13]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=3, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))
print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


56.88494492044064
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.57      1.00      0.73     16731
  Inadequate       0.00      0.00      0.00      5180
   Effective       0.00      0.00      0.00      7501

    accuracy                           0.57     29412
   macro avg       0.19      0.33      0.24     29412
weighted avg       0.32      0.57      0.41     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.58      1.00      0.73      4246
  Inadequate       0.00      0.00      0.00      1282
   Effective       0.00      0.00      0.00      1825

    accuracy                           0.58      7353
   macro avg       0.19      0.33      0.24      7353
weighted avg       0.33      0.58      0.42      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=5, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


56.88494492044064
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.57      1.00      0.73     16731
  Inadequate       0.00      0.00      0.00      5180
   Effective       0.00      0.00      0.00      7501

    accuracy                           0.57     29412
   macro avg       0.19      0.33      0.24     29412
weighted avg       0.32      0.57      0.41     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.58      1.00      0.73      4246
  Inadequate       0.00      0.00      0.00      1282
   Effective       0.00      0.00      0.00      1825

    accuracy                           0.58      7353
   macro avg       0.19      0.33      0.24      7353
weighted avg       0.33      0.58      0.42      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=10, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


56.88494492044064
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.57      1.00      0.73     16731
  Inadequate       0.00      0.00      0.00      5180
   Effective       0.00      0.00      0.00      7501

    accuracy                           0.57     29412
   macro avg       0.19      0.33      0.24     29412
weighted avg       0.32      0.57      0.41     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.58      1.00      0.73      4246
  Inadequate       0.00      0.00      0.00      1282
   Effective       0.00      0.00      0.00      1825

    accuracy                           0.58      7353
   macro avg       0.19      0.33      0.24      7353
weighted avg       0.33      0.58      0.42      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [16]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=15, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


58.69713042295662
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.60      0.90      0.72     16731
  Inadequate       0.00      0.00      0.00      5180
   Effective       0.53      0.30      0.39      7501

    accuracy                           0.59     29412
   macro avg       0.38      0.40      0.37     29412
weighted avg       0.48      0.59      0.51     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.60      0.88      0.72      4246
  Inadequate       0.00      0.00      0.00      1282
   Effective       0.51      0.30      0.38      1825

    accuracy                           0.59      7353
   macro avg       0.37      0.40      0.36      7353
weighted avg       0.47      0.59      0.51      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

    Adequate       0.60      0.88      0.72      4246
  Inadequate       0.00      0.00      0.00      1282
   Effective       0.51      0.30      0.38      1825

    accuracy                           0.59      7353
   macro avg       0.37      0.40      0.36      7353
weighted avg       0.47      0.59      0.51      7353

Confusion matrix - validation
[[3755    0  491]
 [1230    0   52]
 [1271    0  554]]
Confusion matrix - training
[[14987     0  1730]
 [ 4932     0   245]
 [ 5237     0  2263]]


In [17]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=15, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


58.69713042295662
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.60      0.90      0.72     16731
  Inadequate       0.00      0.00      0.00      5180
   Effective       0.53      0.30      0.39      7501

    accuracy                           0.59     29412
   macro avg       0.38      0.40      0.37     29412
weighted avg       0.48      0.59      0.51     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.60      0.88      0.72      4246
  Inadequate       0.00      0.00      0.00      1282
   Effective       0.51      0.30      0.38      1825

    accuracy                           0.59      7353
   macro avg       0.37      0.40      0.36      7353
weighted avg       0.47      0.59      0.51      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=25, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


58.67673058615531
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.59      0.94      0.72     16731
  Inadequate       0.00      0.00      0.00      5180
   Effective       0.58      0.20      0.30      7501

    accuracy                           0.59     29412
   macro avg       0.39      0.38      0.34     29412
weighted avg       0.48      0.59      0.49     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.60      0.94      0.73      4246
  Inadequate       0.00      0.00      0.00      1282
   Effective       0.56      0.20      0.30      1825

    accuracy                           0.59      7353
   macro avg       0.39      0.38      0.34      7353
weighted avg       0.48      0.59      0.50      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=50, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


60.30191758465932
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.61      0.90      0.72     16731
  Inadequate       0.49      0.04      0.08      5180
   Effective       0.60      0.33      0.43      7501

    accuracy                           0.60     29412
   macro avg       0.57      0.42      0.41     29412
weighted avg       0.58      0.60      0.53     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.61      0.89      0.73      4246
  Inadequate       0.45      0.04      0.08      1282
   Effective       0.57      0.34      0.42      1825

    accuracy                           0.60      7353
   macro avg       0.54      0.42      0.41      7353
weighted avg       0.57      0.60      0.54      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

In [20]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=75, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


60.30191758465932
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.61      0.87      0.72     16731
  Inadequate       0.51      0.08      0.13      5180
   Effective       0.58      0.36      0.45      7501

    accuracy                           0.60     29412
   macro avg       0.57      0.44      0.43     29412
weighted avg       0.58      0.60      0.55     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.62      0.87      0.72      4246
  Inadequate       0.48      0.07      0.12      1282
   Effective       0.56      0.37      0.45      1825

    accuracy                           0.61      7353
   macro avg       0.55      0.44      0.43      7353
weighted avg       0.58      0.61      0.55      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

In [21]:
# PCA for for testing data
# K-Means on PCA-reduced data
kmeans = KMeans(n_clusters=100, random_state=0)
xLabs = kmeans.fit_predict(xTSNE_train)
xLabs_Test = kmeans.predict(xTSNE_test)
nLabs = np.unique(xLabs)
yPred = np.zeros(xLabs.shape[0])
yPred_Test = np.zeros(xLabs_Test.shape[0])
# Assign label to each cluster
for lab in nLabs:
    inds = np.where(xLabs==lab)[0]
    inds_test = np.where(xLabs_Test==lab)[0]
    trueLabels = y_train[inds]
    clusterLabel = stats.mode(trueLabels)[0]
    yPred[inds] = clusterLabel
#     print(clusterLabel)
    yPred_Test[inds_test] = clusterLabel
print(100*sum(yPred == y_train)/len(yPred))

print('Training: Metrics with no oversampling')
print(classification_report(y_train,yPred,target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(y_test,yPred_Test,target_names=['Adequate','Inadequate','Effective']))

# Remove duplicates arising from oversampling in training data
dfTrain = np.concatenate((xTSNE_train,yPred[:,np.newaxis],y_train[:,np.newaxis]),axis = 1)
dfTest = np.concatenate((xTSNE_test,yPred_Test[:,np.newaxis],y_test[:,np.newaxis]),axis = 1)
dfTrain = pd.DataFrame(dfTrain)
dfTest = pd.DataFrame(dfTest)
print('Train data before removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
dfTrain.drop_duplicates(inplace=True)
# dfTest.drop_duplicates(inplace=True)
print('Train data after removing duplicates')
print(dfTrain.shape)
# print(dfTest.shape)
print('Training: Metrics with duplicates removed')
print(classification_report(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Validation: Metrics with no oversampling')
print(classification_report(dfTest.iloc[:,-1],dfTest.iloc[:,-2],target_names=['Adequate','Inadequate','Effective']))
print('Confusion matrix - validation')
print(confusion_matrix(dfTest.iloc[:,-1],dfTest.iloc[:,-2]))
print('Confusion matrix - training')
print(confusion_matrix(dfTrain.iloc[:,-1],dfTrain.iloc[:,-2]))


61.45110839113287
Training: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.62      0.88      0.73     16731
  Inadequate       0.54      0.10      0.16      5180
   Effective       0.61      0.39      0.47      7501

    accuracy                           0.61     29412
   macro avg       0.59      0.45      0.45     29412
weighted avg       0.60      0.61      0.56     29412

Validation: Metrics with no oversampling
              precision    recall  f1-score   support

    Adequate       0.62      0.87      0.73      4246
  Inadequate       0.53      0.09      0.15      1282
   Effective       0.59      0.39      0.47      1825

    accuracy                           0.62      7353
   macro avg       0.58      0.45      0.45      7353
weighted avg       0.60      0.62      0.56      7353

Train data before removing duplicates
(29412, 4)
Train data after removing duplicates
(29394, 4)
Training: Metrics with duplicates removed
 

In [21]:
from sklearn import mixture
#kmeans = mixture.GaussianMixture(n_components=3, covariance_type='full')