In [3]:
import pandas as pd
import numpy as np
import os
import sys

In [31]:
data_dir = './data/'

student_log_paths = [os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.startswith('student_log')]
training_label_path = os.path.join(data_dir, 'training_label.csv')
validation_test_label = os.path.join(data_dir, 'validation_test_label.csv')

dfs = []
for path in student_log_paths:
    temp = pd.read_csv(path)
    dfs.append(temp)
student_df = pd.concat(dfs)

training_label_df = pd.read_csv(training_label_path)
validation_test_label_df = pd.read_csv(validation_test_label)

  interactivity=interactivity, compiler=compiler, result=result)


In [32]:
print("student_df.shape:", student_df.shape) 
print("training_label_df.shape:", training_label_df.shape)
print("validation_test_label_df.shape:", validation_test_label_df.shape)

student_df.shape: (942816, 77)
training_label_df.shape: (514, 5)
validation_test_label_df.shape: (172, 4)


In [33]:
student_specific_columns = ["AveKnow",
                            "AveCarelessness",
                            "AveCorrect",
                            "NumActions",
                            "AveResBored",
                            "AveResEngcon",
                            "AveResConf",
                            "AveResFrust",
                            "AveResOfftask",
                            "AveResGaming"]

In [34]:
required_cols = ['ITEST_id'] + student_specific_columns
student_specific_df = student_df[required_cols].drop_duplicates()

In [35]:
student_specific_df.head()

Unnamed: 0,ITEST_id,AveKnow,AveCarelessness,AveCorrect,NumActions,AveResBored,AveResEngcon,AveResConf,AveResFrust,AveResOfftask,AveResGaming
0,8,0.352416,0.183276,0.483902,1056,0.208389,0.679126,0.115905,0.112408,0.156503,0.196561
1056,35,0.255164,0.158848,0.379658,993,0.222796,0.650079,0.069987,0.164347,0.153147,0.2368
2049,39,0.281693,0.152227,0.454545,418,0.2747,0.628075,0.11639,0.143827,0.221926,0.019817
2467,64,0.157938,0.098357,0.334038,1419,0.198394,0.670062,0.085895,0.132328,0.115773,0.303426
3886,77,0.191948,0.094195,0.413249,317,0.261455,0.638636,0.063971,0.129858,0.234331,0.045096


In [50]:
combined_df = pd.merge(left=training_label_df, right=student_specific_df, how='left')
X = combined_df[student_specific_columns].values
y = combined_df['isSTEM'].values

In [60]:
combined_df = pd.merge(left=validation_test_label_df, right=student_specific_df, how='left')
X_target = combined_df[student_specific_columns].values

# Scikit-learn method

In [57]:
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve, mean_squared_error

sss = StratifiedShuffleSplit(n_splits=5, random_state=42)
for train_index, test_index in sss.split(X, y):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    
    model = GradientBoostingClassifier()
    model.fit(X_train, y_train)
    
    # test set evaluation
    y_pred = model.predict(X_test)
    fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)
    auc_test = auc(fpr, tpr)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    
    # train set evaluation
    y_pred = model.predict(X_train)
    fpr, tpr, thresholds = roc_curve(y_train, y_pred, pos_label=1)
    auc_train = auc(fpr, tpr)
    rmse_train = np.sqrt(mean_squared_error(y_train, y_pred))
    
    print('Test: AUC: {:.5f}, RMSE: {:.5f}'.format(auc_test, rmse_test))
    print('Train:  AUC: {:.5f}, RMSE: {:.5f}'.format(auc_train, rmse_train))
    print("="*30)

Train: AUC: 0.62017, RMSE: 0.55470
Test:  AUC: 0.96599, RMSE: 0.14712
Train: AUC: 0.61933, RMSE: 0.53709
Test:  AUC: 0.94898, RMSE: 0.18019
Train: AUC: 0.63529, RMSE: 0.55470
Test:  AUC: 0.94580, RMSE: 0.19182
Train: AUC: 0.75210, RMSE: 0.45993
Test:  AUC: 0.95578, RMSE: 0.16775
Train: AUC: 0.72185, RMSE: 0.45993
Test:  AUC: 0.94898, RMSE: 0.18019


In [67]:
model = GradientBoostingClassifier()
model.fit(X, y)
y_target = model.predict_proba(X_target)

In [75]:
# prediction submit result
result = ','.join(["{:.5f}".format(i[1]) for i in y_target])
print(result)

'0.61921,0.59038,0.32891,0.23993,0.63220,0.74379,0.34435,0.57852,0.58853,0.09773,0.19914,0.33141,0.11149,0.51188,0.62870,0.52403,0.48054,0.18729,0.45806,0.06627,0.81072,0.05268,0.80350,0.31453,0.18760,0.32789,0.11479,0.04370,0.17218,0.31965,0.35144,0.15714,0.07110,0.61839,0.16736,0.57225,0.27351,0.22354,0.25955,0.71901,0.27875,0.41441,0.08022,0.26384,0.26927,0.08260,0.12545,0.21938,0.55250,0.04789,0.52754,0.08911,0.49590,0.10723,0.19107,0.82243,0.17529,0.40207,0.26001,0.62066,0.16531,0.51456,0.39432,0.08400,0.53511,0.68250,0.24000,0.63259,0.75695,0.30456,0.44533,0.07078,0.74525,0.14375,0.66972,0.16094,0.08068,0.68565,0.28065,0.43642,0.73147,0.80384,0.10196,0.25253,0.21948,0.45372,0.23051,0.06396,0.20761,0.15556,0.38898,0.09758,0.68496,0.24765,0.47147,0.69151,0.30053,0.59636,0.24219,0.50756,0.69032,0.66535,0.19947,0.52824,0.47725,0.48708,0.47691,0.05420,0.95628,0.20905,0.23647,0.12839,0.32019,0.09295,0.84258,0.81318,0.15375,0.52271,0.21582,0.09825,0.07791,0.33452,0.51064,0.03593,0.29691

# Neural Network

In [59]:
from keras.models import load_model
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Reshape, Flatten
from keras import regularizers
from keras.optimizers import Adam
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
set_session(tf.Session(config=config))

def hyper_parameter_search(input_shape, num_classes):
    num_hidden_layers = np.random.choice([3, 4], p=[0.5, 0.5])
    reg_lambda = np.random.uniform(low=0.001, high=0.01)
    hidden_layer_units = []
    for i in range(num_hidden_layers):
        # discrete uniform
        units = np.random.randint(low=50, high=200)
        hidden_layer_units.append(units)

    print("num_hidden_layers:", num_hidden_layers)
    print("lambda", reg_lambda)
    print("hidden_layer_units", hidden_layer_units)

    # create model
    model = Sequential()
    for units in hidden_layer_units:
        model.add(Dense(units, input_dim=input_shape,
                        kernel_regularizer=regularizers.l2(reg_lambda),
                        activation='relu'))
        input_shape = units

    assert(num_classes == 1)
    # output layer
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss="binary_crossentropy",
                  optimizer='Adam',
                  metrics=['accuracy'])
    return model

ModuleNotFoundError: No module named 'keras'