In [None]:
import multiprocessing
import os

import gensim
import keras
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import tensorflow
from gensim.models import Word2Vec
from gensim.models.phrases import Phrases, Phraser
from keras import layers
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding, BatchNormalization
from keras.models import Sequential, Model
from nltk.tokenize import RegexpTokenizer
from pandas import read_csv
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# Set working directory. Leave `dirname` empty to run from the current
# directory: os.chdir('') raises FileNotFoundError, so only change directory
# when a path has actually been supplied.
dirname = ''
if dirname:
    os.chdir(dirname)

# Read course description data (path is relative to the working directory).
# Later cells read the 'desc' and 'blom_group' columns of this frame.
X = read_csv('input/courses_nested.csv')

# DEFINE CONSTANTS
N = 5000           # Maximum number of terms kept by the text vectorizer
TEST_SIZE = 0.05   # Fraction of all records held out for test (and again for dev)
LAM = 0.01         # L1 regularization strength applied to the hidden layers

In [None]:
# Process outcome data y: string/category labels -> integer codes -> one-hot rows.
le = LabelEncoder()

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and 1.4 removed the old spelling; try the new name first and fall back so
# the cell runs on either side of the rename.
try:
    onehot_encoder = OneHotEncoder(sparse_output = False)
except TypeError:
    onehot_encoder = OneHotEncoder(sparse = False)

# Integer class codes, reshaped to a single column because OneHotEncoder
# expects 2-D input (one row per record).
y_v2 = le.fit_transform(np.array(X['blom_group']))
y_v2 = y_v2.reshape(-1, 1)

# Dense one-hot matrix: one row per record, one column per distinct label.
y_onehot = onehot_encoder.fit_transform(y_v2)

In [None]:
# TF-IDF features from the course descriptions, using unigrams and bigrams
# with the vocabulary capped at N terms.
# TfidfVectorizer is imported explicitly: a bare `import sklearn` does not
# guarantee that the `sklearn.feature_extraction.text` submodule is loaded.
# ALLOW LARGER NGRAM: ngram_range = (1,2)?
vectorizer = TfidfVectorizer(max_features = N, ngram_range = (1, 2))
x = vectorizer.fit_transform(X['desc'])

# Densify the sparse TF-IDF matrix for the downstream split and model cells.
x_dense = x.toarray()

In [None]:
##########################################
# PREPARE DATASETS FOR LEARNING
##########################################

# EVENTUALLY - NEED TO NORMALIZE THE X DATA

# Fixed seed so the train/dev/test partition is the same on every run;
# without it each Restart & Run All evaluates on a different test set.
SPLIT_SEED = 42

# Hold out TEST_SIZE of all records as the test set, stratified by label.
train_X, test_X, train_Y, test_Y = train_test_split(
    x_dense, y_onehot,
    test_size = TEST_SIZE,
    shuffle = True,
    stratify = y_onehot,
    random_state = SPLIT_SEED,
)

# Carve the dev set out of what remains. Dividing by (1 - TEST_SIZE) rescales
# the fraction so that dev is also TEST_SIZE of the *original* record count.
train_X, dev_X, train_Y, dev_Y = train_test_split(
    train_X, train_Y,
    test_size = TEST_SIZE / (1 - TEST_SIZE),
    shuffle = True,
    stratify = train_Y,
    random_state = SPLIT_SEED,
)

In [None]:
##########################################
# DEFINE MODEL - multilayer sequential model with
# regularization to avoid overfitting on individual words.
# The final layer uses a softmax activation with one node per
# distinct label, i.e. one per column of the one-hot targets.
##########################################

# Widths of the fully connected hidden layers, first to last.
HIDDEN_UNITS = (1152, 576, 288, 144, 72, 72)

model = Sequential()
for layer_index, units in enumerate(HIDDEN_UNITS):
    # Only the first layer declares the input width (number of text features).
    extra_kwargs = {'input_dim': train_X.shape[1]} if layer_index == 0 else {}
    # Each hidden layer: ReLU units with an L1 weight penalty of strength LAM,
    # followed by batch normalization. The regularizer comes from the same
    # `keras` package the layers are imported from.
    model.add(Dense(units, activation = 'relu', kernel_regularizer = keras.regularizers.l1(LAM), **extra_kwargs))
    model.add(BatchNormalization())

# Output width follows the one-hot target matrix rather than a hard-coded
# class count, so the model stays valid if the label set changes.
model.add(Dense(train_Y.shape[1], activation = 'softmax'))

# model.summary()

In [None]:
# Configure training for the model defined above: Adam optimizer,
# categorical cross-entropy loss (the targets are one-hot rows), and
# accuracy as the reported metric.
model.compile(
    loss = 'categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy'],
)

In [None]:
# Fit on the training split in mini-batches of 64 for 40 epochs, passing the
# dev split as validation data. The returned history object is kept in
# `test_fit` for the loss plot.
test_fit = model.fit(
    train_X,
    train_Y,
    batch_size = 64,
    epochs = 40,
    validation_data = (dev_X, dev_Y),
)

In [None]:
# Score the trained model on the held-out test split.
# NOTE(review): despite its name, `predictions` here holds evaluate()'s return
# value (metric scores, not per-row predictions); the later predict cell
# reassigns it.
predictions = model.evaluate(test_X, test_Y)

In [None]:
# Plot loss function: training and dev loss per epoch.
# The training-loss curve is drawn first so that the curves line up with the
# ['train', 'dev'] legend entries below.
plt.plot(test_fit.history['loss'])
plt.plot(test_fit.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'dev'], loc='upper left')
# NOTE(review): fixed y-range; loss values outside 10-14 will not be visible.
plt.ylim((10, 14))
plt.show()

In [None]:
# Predict on the test split: one row of softmax outputs per test record.
predictions = model.predict(test_X)
print(predictions)

In [None]:
# Convert softmax predictions to labels 
def undo_onehot(df, df_prime, encoder=None):
    """Map one-hot targets and softmax outputs back to their original labels.

    Parameters
    ----------
    df : 2-D array
        True targets, one row per record; the arg-max of each row is taken
        as the true class index.
    df_prime : 2-D array
        Model outputs with the same layout as `df`; the arg-max of each row
        is taken as the predicted class index.
    encoder : object with ``inverse_transform``, optional
        Encoder used to turn class indices back into labels. Defaults to the
        notebook-level label encoder ``le`` when omitted.

    Returns
    -------
    pandas.DataFrame
        Columns ``y`` (true label), ``y_prime`` (predicted label) and
        ``match`` (1 where the two agree, 0 otherwise).
    """
    # Resolve the encoder at call time so the notebook-level `le` is only
    # required when the caller does not pass one explicitly.
    if encoder is None:
        encoder = le

    # Collapse each row to the index of its largest entry.
    predicted_idx = np.argmax(df_prime, axis = 1)
    true_idx = np.argmax(df, axis = 1)

    # Inverse transform indices to the original labels.
    predicted_labels = encoder.inverse_transform(predicted_idx)
    true_labels = encoder.inverse_transform(true_idx)

    # Return as data frame; `match` is stored as 0/1 so it can be averaged.
    return pd.DataFrame({
        'y': true_labels,
        'y_prime': predicted_labels,
        'match': (true_labels == predicted_labels) + 0,
    })

# Label-level comparison of true vs. predicted classes on the test split.
test_out = undo_onehot(df = test_Y, df_prime = model.predict(test_X))

# Per-class accuracy: mean of the 0/1 `match` flag within each true label.
# Only `match` is aggregated: `y_prime` holds labels, and averaging it either
# raises a TypeError (string labels on pandas >= 2.0) or yields a meaningless
# number (numeric labels).
test_out2 = test_out.groupby('y')[['match']].mean()
print(test_out2.to_latex())