In [1]:
# Import packages
import pandas as pd
import numpy as np
import os
import random
import gc
import pickle
import math

from sklearn.decomposition import PCA

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import uniform

In [2]:
# Load final exam and assignment data
data = pd.DataFrame(columns=['SubjectID', 'Assignment', 'CodeStateID'])
grades = {}
scores = {}
for directory in ['F19_All', 'S19_All']:
  base_path = os.path.join('data', directory, 'Train' if directory == 'F19_All' else '')
  grade_data = pd.read_csv(os.path.join(base_path, 'Data', 'LinkTables', 'Subject.csv'))
  for row in grade_data.itertuples():
    grades[row.SubjectID] = row._2 if directory == 'S19_All' else row._2 / 100
  main = pd.read_csv(os.path.join(base_path, 'Data', 'MainTable.csv'))
  for row in main[main['EventType'] == 'Run.Program'].itertuples():
    scores[row.CodeStateID] = row.Score
  main = main[(main['EventType'] == 'Compile') & (main['Compile.Result'] == 'Success')]
  main['Assignment'] = [f'{x[0]}-{x[1]}' for x in list(zip(main['AssignmentID'], main['ProblemID']))]
  data = pd.concat([data, main[['SubjectID', 'Assignment', 'CodeStateID']]], ignore_index=True)

data['Score'] = [scores[x] for x in data['CodeStateID']]
data['ExamGrade'] = [grades[x] if x in grades else 0 for x in data['SubjectID']]
data.head()

Unnamed: 0,SubjectID,Assignment,CodeStateID,Score,ExamGrade
0,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439-1,79c9e2da9bc703116d8768297056db22866c43d583e529...,1.0,0.97
1,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439-3,0e91ae5696da0d6a724380360147b8699a25c14ff46227...,1.0,0.97
2,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439-5,1b38c7b1c7ebca281bbbc270d1b896484de663045c715f...,1.0,0.97
3,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439-12,2a850f5de4aaf278f5752896aaee2df171a4a5f284ec8b...,1.0,0.97
4,014604ba54339d4b1266cf78e125053a5ac11dd861ef3c...,439-13,7ec7412b1bf528ea84551d5ef407d4b1682132332245e4...,1.0,0.97


In [3]:
# Find the Code IDs for the submissions with the highest score for each student for each assignment
students = list(data['SubjectID'].unique())
assignments = data['Assignment'].unique()
best_score_data = {}
for student in students:
  best_ids = [None] * len(assignments)
  s_filter = data[data['SubjectID'] == student]
  a_idx = 0
  for assignment in assignments:
    a_filter = s_filter[s_filter['Assignment'] == assignment].reset_index()
    if len(a_filter.index) > 0:
      best_ids[a_idx] = a_filter['CodeStateID'][np.argmax(a_filter['Score'])]
    a_idx += 1
  best_score_data[student] = best_ids
best_score_data[students[0]][0:5]

# Free some memory
del scores
del data
gc.collect()

44

In [4]:
# Divide data into 80% training data, 20% testing data
random.seed(1)
num_students = len(students)
test_students = set(random.sample(students, int(num_students * 0.2)))
train_students = set([x for x in students if x not in test_students])

test_grades = [grades[x] if x in grades else 0 for x in test_students]
train_grades = [grades[x] if x in grades else 0 for x in train_students]
print(f'Train avg: {np.mean(train_grades):0.2f} Test avg: {np.mean(test_grades):0.2f}')
print(f'Train dev: {np.std(train_grades):0.2f} Test dev: {np.std(test_grades):0.2f}')

# test_ids = set([x in test_students for x in students])
# train_ids = set([x in train_students for x in students])

Train avg: 0.59 Test avg: 0.63
Train dev: 0.25 Test dev: 0.23


In [5]:
# Given a dataframe, a vector length, and a number of components,
# Divide all the vectors into a train and test set and transform them using PCA with the given number of components
def train_test_vectors(df, vector_length):
  train_X = []
  train_y = []
  test_X = []
  test_y = []
  vector_lookup = {}
  for idx, row in df.iterrows():
    vector = np.array(row[0:vector_length])
    vector_lookup[row['CodeStateID']] = vector
  for student in students:
    all_assign = np.concatenate([vector_lookup[x] if x in vector_lookup else np.zeros(vector_length) for x in best_score_data[student]])
    if student in train_students:
      train_X.append(all_assign)
      train_y.append(grades[student] if student in grades else 0)
    elif student in test_students:
      test_X.append(all_assign)
      test_y.append(grades[student] if student in grades else 0)
#   pca = PCA(n_components=n_components, random_state=1)
#   train_X = pca.fit_transform(train_X)
#   test_X = pca.transform(test_X)
  return train_X, train_y, test_X, test_y

In [6]:
# code_states = {}
# for directory in ['F19_All', 'S19_All']:
#   base_path = os.path.join('data', directory, 'Train' if directory == 'F19_All' else '')
#   code_data = pd.read_csv(os.path.join(base_path, 'Data', 'CodeStates', 'CodeStates.csv'))
#   for row in code_data.itertuples():
#     code_states[row.CodeStateID] = row.Code

In [7]:
# random.seed(1)
# validation_set = set(random.sample(list(code_states.keys()), int(len(code_states) / 10)))
# for c_id in train_data['CodeStateID'].unique():
#   directory = 'Validate' if c_id in validation_set else 'Train' 
#   with open(os.path.join(directory, c_id + '.java'), 'w', encoding="utf-8") as f:
#     print(code_states[c_id], file=f)

In [8]:
# for c_id in test_data['CodeStateID'].unique():
#   with open(os.path.join('Test', c_id + '.java'), 'w', encoding="utf-8") as f:
#     print(code_states[c_id], file=f)

In [9]:
code2vec_vectors = pd.read_csv('data/Vectors/code2vec.csv')
train_X, train_y, test_X, test_y = train_test_vectors(code2vec_vectors, 384)

In [10]:
with open('saved_models', 'rb') as f:
  saved_models = pickle.load(f)

In [11]:
def tune_model(name, model, parameters, n_iter):
  pipeline = Pipeline(steps=[('pca', PCA(random_state=1)), ('model', model)])
  if name in saved_models:
    pipeline.set_params(**saved_models[name])
  else:
    new_params = {'pca__n_components': parameters['n_components']}
    del parameters['n_components']
    for key in parameters:
      new_params['model__' + key] = parameters[key]
    cv = RandomizedSearchCV(pipeline, new_params, random_state=1, scoring='neg_mean_absolute_error', n_iter=n_iter, verbose=1)
    cv.fit(train_X, train_y)
    saved_models[name] = cv.best_params_
    with open('saved_models', 'wb') as f:
      pickle.dump(saved_models, f)
    pipeline.set_params(**cv.best_params_)
#   print("Variance captured: %0.3f" % np.sum(pipeline.steps[0].explained_variance_ratio_))
  return pipeline

In [12]:
ridge = tune_model('code2vec_ridge', Ridge(random_state=1), {'n_components': list(range(10, 380, 10)),'alpha': uniform(scale=10)}, n_iter=10)
ridge.fit(train_X, train_y)

dummy = DummyRegressor()
dummy.fit(train_X, train_y)

DummyRegressor()

In [13]:
pred_y = dummy.predict(test_X)
print(f'MAE: {mean_absolute_error(test_y, pred_y):0.3f}\nRMSE: {math.sqrt(mean_squared_error(test_y, pred_y)):0.3f}')
pred_y = ridge.predict(test_X)
print(f'MAE: {mean_absolute_error(test_y, pred_y):0.3f}\nRMSE: {math.sqrt(mean_squared_error(test_y, pred_y)):0.3f}')

MAE: 0.185
RMSE: 0.229
MAE: 0.162
RMSE: 0.204


In [14]:
deepwalk_vectors = pd.read_csv('data/Vectors/deepwalk.csv')
train_X, train_y, test_X, test_y = train_test_vectors(deepwalk_vectors, 50)
ridge = tune_model('deepwalk_ridge', Ridge(random_state=1), {'n_components': list(range(5, 45, 5)),'alpha': uniform(scale=10)}, n_iter=10)
ridge.fit(train_X, train_y)

Pipeline(steps=[('pca', PCA(n_components=40, random_state=1)),
                ('model', Ridge(alpha=3.0233257263183977, random_state=1))])

In [15]:
pred_y = ridge.predict(test_X)
print(f'MAE: {mean_absolute_error(test_y, pred_y):0.3f}\nRMSE: {math.sqrt(mean_squared_error(test_y, pred_y)):0.3f}')

MAE: 0.145
RMSE: 0.193


-0.20431378