# Preprocessing

In [1]:
from os import path
import bz2
import pandas as pd
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras import regularizers
from operator import itemgetter as it
from itertools import repeat

Using TensorFlow backend.


In [2]:
%pwd

'/Users/hpitawela/resumes/projects'

In [3]:
fname_train = 'training.psv'
fname_eval = 'eval.psv'

In [4]:
# Reading files and populate dataframes
df_train = pd.read_csv(fname_train, delimiter="|")
df_eval = pd.read_csv(fname_eval, delimiter="|")

In [5]:
df_train.columns

Index(['student_id', 'level', 'course', 'grade', 'major'], dtype='object')

In [6]:
df_eval.columns

Index(['student_id', 'level', 'course', 'grade', 'major1', 'major2', 'major3'], dtype='object')

In [7]:
df_train.head()

Unnamed: 0,student_id,level,course,grade,major
0,ppVGBRKhtqqyxnVO,Freshman,SPAN:100,A,Business
1,PiPkSgMGbFIu5RwR,Freshman,CSI:160,S,International Relations
2,PiPkSgMGbFIu5RwR,Sophomore,EES:107,C,International Relations
3,PiPkSgMGbFIu5RwR,Senior,SPAN:201,B,International Relations
4,PiPkSgMGbFIu5RwR,Junior,ENTR:200,B+,International Relations


In [8]:
# grouping by student_id to extract majors
df2_train = df_train.groupby('student_id', as_index=False).last()
df2_train = df2_train[['student_id','major']]

In [9]:
# one hot encoding of majors in the training set
encoded_majors = pd.get_dummies(df2_train.major)
df2_train_st_major_headers = pd.concat([df2_train, encoded_majors], axis=1)
df2_train_major_headers = df2_train_st_major_headers.drop(columns=['major'])

# set student_id as index
df2_train_majors_with_st_index = df2_train_major_headers.set_index('student_id')
df2_train_majors_with_st_index.head()

Unnamed: 0_level_0,Accounting,Actuarial Science,Actuarial Sciences,African American Studies,Anthropology,Art,Asian Studies,Astronomy,Biochemistry,Biology,...,Religion,Russian,Social Work,Sociology,Spanish,Speech And Hearing Science,Statistics,Theatre,Therapeutic Recreation,Veterinary Medicine
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
042Rmpv5B2kXdfBR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# grade values in training set
df1_train_grade = df_train.pivot(index='student_id',columns='course',values='grade')

In [11]:
df1_train_grade.shape

(10000, 2721)

In [12]:
# level values in training set
df1_train_level = df_train.pivot(index='student_id',columns='course',values='level')

In [13]:
df1_train_level.shape

(10000, 2721)

In [14]:
# grade values in eval set
df3_eval_grade = df_eval.pivot('student_id','course','grade')

In [16]:
# level values in eval set
df3_eval_level = df_eval.pivot('student_id','course','level')

## Approach 1 - One hot encoding of grade and level values

In [17]:
pd.unique(df1_train_grade[df1_train_grade.columns].values.ravel('K'))

array([nan, 'B', 'C', 'R', 'B+', 'S', 'B-', 'C-', 'A', 'A+', 'A-', 'C+',
       'WX', 'D', 'D+', 'D-', 'AUS', 'U', 'P', 'I', 'AUU', 'N'],
      dtype=object)

In [18]:
# One hot encoding for grades in training set
df1_train_grade_course_headers = pd.concat([pd.get_dummies(df1_train_grade[col]) for col in df1_train_grade], axis=1, keys=df1_train_grade.columns)

In [19]:
# One hot encoding for grades in eval set
df3_eval_grade_course_headers = pd.concat([pd.get_dummies(df3_eval_grade[col]) for col in df3_eval_grade], axis=1, keys=df3_eval_grade.columns)
df3_eval_grade_course_headers.head()

course,006:100,ABRD:302,ABRD:304,ABRD:306,ABRD:308,ABRD:309,ABRD:314,ABRD:321,ABRD:322,ABRD:324,...,WRIT:140,WRIT:140,WRIT:140,WRIT:160,WRIT:160,WRIT:160,WRIT:160,WRIT:310,WRIT:474,WRIT:476
Unnamed: 0_level_1,R,R,R,R,R,R,R,R,R,R,...,B,B+,C+,A,A-,B+,C+,A,A-,A+
student_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01BVNwmXUXsoVHLd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03TWkCsakXIVrOtA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
067YeH4Acv00Bdvf,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
06TWQIIh4SaEAnOu,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
08dMDGiuTXbojMV0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Aligining courses columns of training and eval sets on grades
df3_eval_grade_course_headers_aligned, df1_train_grade_course_headers_aligned = df3_eval_grade_course_headers.align(df1_train_grade_course_headers, axis=1, fill_value=0)
df1_train_grade_course_headers_aligned, df3_eval_grade_course_headers_aligned = df1_train_grade_course_headers.align(df3_eval_grade_course_headers, axis=1, fill_value=0)

In [21]:
# check if aligning has worked
compare = df1_train_grade_course_headers_aligned.columns == df3_eval_grade_course_headers_aligned.columns
pd.unique(compare)

array([ True])

In [22]:
# One hot encoding of course levels in training set
df1_train_level_course_headers = pd.concat([pd.get_dummies(df1_train_level[col]) for col in df1_train_level], axis=1, keys=df1_train_level.columns)

# One hot encoding of course levels in eval set
df3_eval_level_course_headers = pd.concat([pd.get_dummies(df3_eval_level[col]) for col in df3_eval_level], axis=1, keys=df3_eval_level.columns)

In [23]:
# Aligining courses columns of training and eval sets on levels
df3_eval_level_course_headers_aligned, df1_train_level_course_headers_aligned = df3_eval_level_course_headers.align(df1_train_level_course_headers, axis=1, fill_value=0)
df1_train_level_course_headers_aligned, df3_eval_level_course_headers_aligned = df1_train_level_course_headers.align(df3_eval_level_course_headers, axis=1, fill_value=0)

In [24]:
# check if aligning has worked
compare = df1_train_level_course_headers_aligned.columns == df3_eval_level_course_headers_aligned.columns
pd.unique(compare)

array([ True])

In [25]:
# Transposing to get student headers for concatenation of grades and levels
df3_eval_level_st_headers = df3_eval_level_course_headers_aligned.T
df1_train_level_st_headers = df1_train_level_course_headers_aligned.T

# Transposing grades and levels for concatenation
df1_train_grade_st_headers = df1_train_grade_course_headers_aligned.T
df3_eval_grade_st_headers = df3_eval_grade_course_headers_aligned.T

In [26]:
# Merging grade and levels in training set
df1_train_st_headers_merged =  pd.concat([df1_train_grade_st_headers, df1_train_level_st_headers])

# Merging grade and levels in eval set
df3_eval_st_headers_merged = pd.concat([df3_eval_grade_st_headers, df3_eval_level_st_headers])

In [27]:
# check shape after merging
df3_eval_st_headers_merged.shape

(18434, 2000)

In [29]:
# set input features to feed the model
train = df1_train_st_headers_merged.T

In [30]:
train.head()

course,006:100,ABRD:301,ABRD:302,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,...,WRIT:310,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:400,WRIT:474,WRIT:474,WRIT:476
Unnamed: 0_level_1,R,R,A+,R,R,R,R,R,R,R,...,Senior,Sophomore,Senior,Sophomore,Junior,Junior,Senior,Junior,Senior,Senior
student_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
01DiJuoJAB395ucJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
042Rmpv5B2kXdfBR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
df2_train_majors_with_st_index.head()

Unnamed: 0_level_0,Accounting,Actuarial Science,Actuarial Sciences,African American Studies,Anthropology,Art,Asian Studies,Astronomy,Biochemistry,Biology,...,Religion,Russian,Social Work,Sociology,Spanish,Speech And Hearing Science,Statistics,Theatre,Therapeutic Recreation,Veterinary Medicine
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
042Rmpv5B2kXdfBR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
#create model
model1 = Sequential()

#get number of columns in training data
n_cols = train.values.shape[1]

#add model layers
model1.add(Dropout(0.5, input_shape=(n_cols,)))
model1.add(Dense(200, activation='relu'))
model1.add(Dense(df2_train_majors_with_st_index.values.shape[1], activation='softmax'))

#compile model using mse as a measure of model performance
model1.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=3)
#train model
model1.fit(train, df2_train_majors_with_st_index, epochs=5, batch_size=64, validation_split=0.2, callbacks=[early_stopping_monitor])

Train on 8000 samples, validate on 2000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0xb33aa5c50>

## Approach 2 - Ignoring level values

In [33]:
#training set grades with student_id headers and one_hot encoded
df1_train_grade_st_headers.head()

Unnamed: 0_level_0,student_id,01DiJuoJAB395ucJ,01MhxeQl5FhRsf3f,01W7KB8TDNWNx4YW,042Rmpv5B2kXdfBR,04DuzbneGqk0o0jY,04TIITMjjPIVIkES,04vxIYe6guefIhGD,04yPdcfnDzHbSIsS,04zKgh2DJS9owZNA,059Ssc6DVmDrBM7o,...,ztxqzzstFQAUqFF5,zvxS0iHJO7zrsT4z,zw6BlxT3IPULAtyN,zwtG2OaFnK3NPeGm,zx6fskg55IgkzEDA,zzL9tMAJh28lzEb7,zzTJwgTritjISIOS,zzWgxK0AfTmln8wS,zzX9IzDFcBfprIhm,zzmL0Yv0ksPiCaaq
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
006:100,R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:301,R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:302,A+,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:302,R,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:303,R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [34]:
#eval set grades with student_id headers and one_hot encoded
df3_eval_grade_st_headers.head()

Unnamed: 0_level_0,student_id,01BVNwmXUXsoVHLd,03TWkCsakXIVrOtA,067YeH4Acv00Bdvf,06TWQIIh4SaEAnOu,08dMDGiuTXbojMV0,0AZBfmHd9FV2FnKx,0Ag6mrfoNbm6galR,0CLEOGQtzygq3vMv,0EkR8DbOEjK78l59,0GNXLTwD25oyMmaM,...,zkDtTPIFME95aIVW,zq9QA2E0VeDT8CxN,zqm0Sv36yNKjRGVJ,zrU4Qu4QsTkl52kN,zs10WJ4zIy9p1Rz4,zw6ON6poVhhrnGG8,zx4TjtzsmEHeULeo,zz1XYEbqcaXz5ICW,zzOFhJF3YHyXmwbV,zznsrO2NfjMPqg4I
course,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
006:100,R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:301,R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:302,A+,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:302,R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ABRD:303,R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [35]:
#training set labels with major headers and one_hot encoded
df2_train_majors_with_st_index

Unnamed: 0_level_0,Accounting,Actuarial Science,Actuarial Sciences,African American Studies,Anthropology,Art,Asian Studies,Astronomy,Biochemistry,Biology,...,Religion,Russian,Social Work,Sociology,Spanish,Speech And Hearing Science,Statistics,Theatre,Therapeutic Recreation,Veterinary Medicine
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
042Rmpv5B2kXdfBR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04TIITMjjPIVIkES,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04vxIYe6guefIhGD,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04yPdcfnDzHbSIsS,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04zKgh2DJS9owZNA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
059Ssc6DVmDrBM7o,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
#create model
model2 = Sequential()

#get number of columns in training data
n_cols = df1_train_grade_st_headers.T.values.shape[1]

#add model layers
model2.add(Dropout(0.5, input_shape=(n_cols,)))
model2.add(Dense(250, activation='relu'))
model2.add(Dense(df2_train_majors_with_st_index.values.shape[1], activation='softmax'))

#compile model using mse as a measure of model performance
model2.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=3)
#train model
model2.fit(df1_train_grade_st_headers.T, df2_train_majors_with_st_index, epochs=10, batch_size=64, validation_split=0.2, callbacks=[early_stopping_monitor])

Train on 8000 samples, validate on 2000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1a35098da0>

## Approach 3 - With Integer encoding ignoring level values

In [38]:
#training set grades with student_id headers
df1_train_grade.head()

course,006:100,ABRD:301,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,ABRD:311,...,WRIT:100,WRIT:140,WRIT:160,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,,,,,,,,,,,...,,,,,,,,,,
01MhxeQl5FhRsf3f,,,,,,,,,,,...,,,,,,,,,,
01W7KB8TDNWNx4YW,,,,,,,,,,,...,,,,,,,,,,
042Rmpv5B2kXdfBR,,,,,,,,,,,...,,,,,,,,,,
04DuzbneGqk0o0jY,,,,,,,,,,,...,,,,,,,,,,


In [39]:
# unique values in above dataframe
pd.unique(df1_train_grade[df1_train_grade.columns].values.ravel('K'))

array([nan, 'B', 'C', 'R', 'B+', 'S', 'B-', 'C-', 'A', 'A+', 'A-', 'C+',
       'WX', 'D', 'D+', 'D-', 'AUS', 'U', 'P', 'I', 'AUU', 'N'],
      dtype=object)

In [40]:
# no of unique values in above dataframe
pd.unique(df1_train_grade[df1_train_grade.columns].values.ravel('K')).size

22

In [41]:
# fill NaN wirh zeros
df1_train_grade_nan_0 = df1_train_grade.fillna(0)

# check if NaNs are filled with zeros
df1_train_grade_nan_0.head()

course,006:100,ABRD:301,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,ABRD:311,...,WRIT:100,WRIT:140,WRIT:160,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
042Rmpv5B2kXdfBR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [42]:
# eval set with course headers
df3_eval_grade.head()

course,006:100,ABRD:302,ABRD:304,ABRD:306,ABRD:308,ABRD:309,ABRD:314,ABRD:321,ABRD:322,ABRD:324,...,ULIB:301,URES:399,URP:201,URP:300,URP:620,WRIT:140,WRIT:160,WRIT:310,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BVNwmXUXsoVHLd,,,,,,,,,,,...,,,,,,,,,,
03TWkCsakXIVrOtA,,,,,,,,,,,...,,,,,,,C+,,,
067YeH4Acv00Bdvf,,,,,,,,,,,...,,,,,,,,,,
06TWQIIh4SaEAnOu,,,,,,,,,,,...,,,,,,,,,,
08dMDGiuTXbojMV0,,,,,,,,,,,...,,,,,,,,,,


In [43]:
# unique values in eval set
pd.unique(df3_eval_grade[df3_eval_grade.columns].values.ravel('K'))

array([nan, 'A', 'C+', 'B', 'B-', 'C-', 'S', 'A-', 'B+', 'C', 'D+', 'A+',
       'D', 'R', 'U', 'D-', 'I', 'AUS', 'P', 'AUU', 'N', 'WX'],
      dtype=object)

In [44]:
# no of unique values in eval set
pd.unique(df3_eval_grade[df3_eval_grade.columns].values.ravel('K')).size

22

In [45]:
# filling NaN with zeros
df3_eval_grade_nan_0 = df3_eval_grade.fillna(0)

# check if NaNs are filled with zeros
df3_eval_grade_nan_0.head()

course,006:100,ABRD:302,ABRD:304,ABRD:306,ABRD:308,ABRD:309,ABRD:314,ABRD:321,ABRD:322,ABRD:324,...,ULIB:301,URES:399,URP:201,URP:300,URP:620,WRIT:140,WRIT:160,WRIT:310,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BVNwmXUXsoVHLd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03TWkCsakXIVrOtA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,C+,0,0,0
067YeH4Acv00Bdvf,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
06TWQIIh4SaEAnOu,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
08dMDGiuTXbojMV0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Integer encoding

In [None]:
#Since both eval and training sets have same 22 distinct values, let's integer encode them as per following:
cleanup_nums = {'A':20, 'C+':15, 'B':17, 'B-':16, 'C-':13, 'S':5, 'A-':19, 'B+':18, 'C':14, 'D+':12, 'A+':21,
       'D':11, 'R':9, 'U':2, 'D-':10, 'I':1, 'AUS':6, 'P':7, 'AUU':4, 'N':3, 'WX':8}

In [47]:
# interger encodng of training set
df1_train_grade_int_encoded = df1_train_grade_nan_0.replace(cleanup_nums)

In [48]:
# check if integer encoding has worked
pd.unique(df1_train_grade_int_encoded[df1_train_grade_int_encoded.columns].values.ravel('K'))

array([ 0,  9, 21,  6,  5, 15, 16, 17, 10, 13, 18, 14, 11, 19, 20,  7, 12,
        1,  8,  2,  4,  3])

In [49]:
# interger encodng of eval set
df3_eval_grade_int_encoded = df3_eval_grade_nan_0.replace(cleanup_nums)

In [50]:
# check if integer encoding has worked
pd.unique(df3_eval_grade_int_encoded[df3_eval_grade_int_encoded.columns].values.ravel('K'))

array([ 0,  9, 19, 20, 21, 17, 18, 15, 14, 13, 16, 11, 10, 12,  5,  7,  8,
        1,  6,  4,  2,  3])

### Aligning features in training and eval sets

In [51]:
# Aligining courses columns of training and eval sets on grades
df3_eval_grade_int_encoded_aligned, df1_train_grade_int_encoded_aligned = df3_eval_grade_int_encoded.align(df1_train_grade_int_encoded, axis=1, fill_value=0)
df1_train_grade_int_encoded_aligned, df3_eval_grade_int_encoded_aligned = df1_train_grade_int_encoded.align(df3_eval_grade_int_encoded, axis=1, fill_value=0)

In [52]:
# check if aligning has worked
compare = df1_train_grade_int_encoded_aligned.columns == df3_eval_grade_int_encoded_aligned.columns
pd.unique(compare)

array([ True])

### Normalization

This is not necessary as all input features come from same integer range 0-21. However this can improve the speed of learning via fast convergence.

In [53]:
# Normalizing training set
df1_train_grade_norm_transposed = (df1_train_grade_int_encoded_aligned.T - df1_train_grade_int_encoded_aligned.T.mean()) / (df1_train_grade_int_encoded_aligned.T.max() - df1_train_grade_int_encoded_aligned.T.min())
df1_train_grade_norm = df1_train_grade_norm_transposed.T

In [54]:
# check if normalization has worked
df1_train_grade_norm.head()

course,006:100,ABRD:301,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,ABRD:311,...,WRIT:100,WRIT:140,WRIT:160,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,...,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355
01MhxeQl5FhRsf3f,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,...,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355
01W7KB8TDNWNx4YW,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,...,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737,-0.003737
042Rmpv5B2kXdfBR,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,...,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355
04DuzbneGqk0o0jY,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,...,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355


In [55]:
# Normalizing eval set
df3_eval_grade_norm_transposed = (df3_eval_grade_int_encoded_aligned.T - df3_eval_grade_int_encoded_aligned.T.mean()) / (df3_eval_grade_int_encoded_aligned.T.max() - df3_eval_grade_int_encoded_aligned.T.min())
df3_eval_grade_norm = df3_eval_grade_norm_transposed.T

In [56]:
# check if normalization has worked
df3_eval_grade_norm.head()

course,006:100,ABRD:301,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,ABRD:311,...,WRIT:100,WRIT:140,WRIT:160,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BVNwmXUXsoVHLd,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,...,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355
03TWkCsakXIVrOtA,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,...,-0.000355,-0.000355,0.999645,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355
067YeH4Acv00Bdvf,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,...,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355,-0.000355
06TWQIIh4SaEAnOu,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,...,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954,-0.004954
08dMDGiuTXbojMV0,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,...,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883,-0.004883


In [57]:
#training set labels with major headers and one_hot encoded
df2_train_majors_with_st_index.head()

Unnamed: 0_level_0,Accounting,Actuarial Science,Actuarial Sciences,African American Studies,Anthropology,Art,Asian Studies,Astronomy,Biochemistry,Biology,...,Religion,Russian,Social Work,Sociology,Spanish,Speech And Hearing Science,Statistics,Theatre,Therapeutic Recreation,Veterinary Medicine
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
042Rmpv5B2kXdfBR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
#create model
model3 = Sequential()

#get number of columns in training data
n_cols = df1_train_grade_norm.values.shape[1]

#add model layers
model3.add(Dropout(0.5, input_shape=(n_cols,)))
model3.add(Dense(250, activation='relu'))
model3.add(Dense(df2_train_majors_with_st_index.values.shape[1], activation='softmax'))

#compile model using mse as a measure of model performance
model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=3)
#train model
model3.fit(df1_train_grade_norm, df2_train_majors_with_st_index, epochs=20, batch_size=16, validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a364acd30>

## Approach 4 - With Integer encoding without ignoring level values

In [59]:
#training set levels with course headers
df1_train_level.head()

course,006:100,ABRD:301,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,ABRD:311,...,WRIT:100,WRIT:140,WRIT:160,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,,,,,,,,,,,...,,,,,,,,,,
01MhxeQl5FhRsf3f,,,,,,,,,,,...,,,,,,,,,,
01W7KB8TDNWNx4YW,,,,,,,,,,,...,,,,,,,,,,
042Rmpv5B2kXdfBR,,,,,,,,,,,...,,,,,,,,,,
04DuzbneGqk0o0jY,,,,,,,,,,,...,,,,,,,,,,


In [60]:
# unique values in the above dataframe
pd.unique(df1_train_level[df1_train_level.columns].values.ravel('K'))

array([nan, 'Senior', 'Freshman', 'Junior', 'Sophomore'], dtype=object)

In [61]:
#eval set levels with course headers
df3_eval_level.head()

course,006:100,ABRD:302,ABRD:304,ABRD:306,ABRD:308,ABRD:309,ABRD:314,ABRD:321,ABRD:322,ABRD:324,...,ULIB:301,URES:399,URP:201,URP:300,URP:620,WRIT:140,WRIT:160,WRIT:310,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BVNwmXUXsoVHLd,,,,,,,,,,,...,,,,,,,,,,
03TWkCsakXIVrOtA,,,,,,,,,,,...,,,,,,,Sophomore,,,
067YeH4Acv00Bdvf,,,,,,,,,,,...,,,,,,,,,,
06TWQIIh4SaEAnOu,,,,,,,,,,,...,,,,,,,,,,
08dMDGiuTXbojMV0,,,,,,,,,,,...,,,,,,,,,,


In [62]:
# unique values in the above dataframe
pd.unique(df3_eval_level[df3_eval_level.columns].values.ravel('K'))

array([nan, 'Sophomore', 'Junior', 'Senior', 'Freshman'], dtype=object)

In [63]:
# filling NaN with zeros in training set
df1_train_level_nan_0 = df1_train_level.fillna(0)

# check if filling has worked
df1_train_level_nan_0.head()

course,006:100,ABRD:301,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,ABRD:311,...,WRIT:100,WRIT:140,WRIT:160,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01MhxeQl5FhRsf3f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
01W7KB8TDNWNx4YW,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
042Rmpv5B2kXdfBR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04DuzbneGqk0o0jY,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# filling NaN with zeros in training set
df3_eval_level_nan_0 = df3_eval_level.fillna(0)

# check if filling has worked
df3_eval_level_nan_0.head()

course,006:100,ABRD:302,ABRD:304,ABRD:306,ABRD:308,ABRD:309,ABRD:314,ABRD:321,ABRD:322,ABRD:324,...,ULIB:301,URES:399,URP:201,URP:300,URP:620,WRIT:140,WRIT:160,WRIT:310,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01BVNwmXUXsoVHLd,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03TWkCsakXIVrOtA,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,Sophomore,0,0,0
067YeH4Acv00Bdvf,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
06TWQIIh4SaEAnOu,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
08dMDGiuTXbojMV0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Integer encoding of levels

In [67]:
# encoding levels in training set
cleanup_levels = {'Sophomore':2, 'Junior':3, 'Senior':4, 'Freshman':1}
df1_train_level_int_encoded = df1_train_level_nan_0.replace(cleanup_levels)

# encoding levels in eval set
cleanup_levels = {'Sophomore':2, 'Junior':3, 'Senior':4, 'Freshman':1}
df3_eval_level_int_encoded = df3_eval_level_nan_0.replace(cleanup_levels)

In [70]:
# check if integer encoding has worked
pd.unique(df1_train_level_int_encoded[df1_train_level_int_encoded.columns].values.ravel('K'))

array([0, 4, 3, 2, 1])

### Aligning columns in training and eval sets for levels

In [71]:
# Aligining courses columns of training and eval sets on levels
df3_eval_level_int_encoded_aligned, df1_train_level_int_encoded_aligned = df3_eval_level_int_encoded.align(df1_train_level_int_encoded, axis=1, fill_value=0)
df1_train_level_int_encoded_aligned, df3_eval_level_int_encoded_aligned = df1_train_level_int_encoded.align(df3_eval_level_int_encoded, axis=1, fill_value=0)

In [72]:
# check if aligning has worked
compare = df1_train_level_int_encoded_aligned.columns == df3_eval_level_int_encoded_aligned.columns
pd.unique(compare)

array([ True])

### Normalization of levels

In [73]:
# Normalizing training set levels
df1_train_level_norm_transposed = (df1_train_level_int_encoded.T - df1_train_level_int_encoded.T.mean()) / (df1_train_level_int_encoded.T.max() - df1_train_level_int_encoded.T.min())
df1_train_level_norm = df1_train_level_norm_transposed.T

In [74]:
# check if normalization has worked
df1_train_level_norm.head()

course,006:100,ABRD:301,ABRD:302,ABRD:303,ABRD:304,ABRD:306,ABRD:307,ABRD:308,ABRD:309,ABRD:311,...,WRIT:100,WRIT:140,WRIT:160,WRIT:310,WRIT:326,WRIT:374,WRIT:390,WRIT:400,WRIT:474,WRIT:476
student_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
01DiJuoJAB395ucJ,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,...,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368
01MhxeQl5FhRsf3f,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,...,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368
01W7KB8TDNWNx4YW,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,...,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767,-0.003767
042Rmpv5B2kXdfBR,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,...,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368
04DuzbneGqk0o0jY,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,...,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368,-0.000368


### Concatenating Normalized grades and levels

In [75]:
# concatenating normalized grades and levels
df1_train_norm_grades_levels = pd.concat([df1_train_grade_norm.T, df1_train_level_norm.T])

In [76]:
# check if concatenation has worked
df1_train_norm_grades_levels.shape

(5537, 10000)

### Model training

In [77]:
#create model
model4 = Sequential()

#get number of columns in training data
n_cols = df1_train_norm_grades_levels.T.values.shape[1]

#add model layers
model4.add(Dropout(0.5, input_shape=(n_cols,)))
model4.add(Dense(250, activation='relu'))
model4.add(Dense(df2_train_majors_with_st_index.values.shape[1], activation='softmax'))

#compile model using mse as a measure of model performance
model4.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

#set early stopping monitor so the model stops training when it won't improve anymore
early_stopping_monitor = EarlyStopping(patience=3)
#train model
model4.fit(df1_train_norm_grades_levels.T, df2_train_majors_with_st_index, epochs=20, batch_size=32, validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a585c4390>

# Prediction

## Output dataframe preparation

In [78]:
# predicting using model3
p_test = model3.predict(df3_eval_grade_norm)

In [79]:
p_test_with_columns = pd.DataFrame(p_test, columns=df2_train_major_headers.columns[1:df2_train_major_headers.columns.size])

In [80]:
p_test_with_columns_st_column = p_test_with_columns

In [81]:
p_test_with_columns_st_column['student_id'] = df3_eval_grade.index

In [82]:
n = 3

# Extract top 3 majors to a list
new_d = (zip(repeat(row["student_id"]), map(it(0),(row[0:p_test_with_columns_st_column.columns.size-1].sort_values(ascending=False)[:n].iteritems())))
                 for _, row in p_test_with_columns_st_column.iterrows())


# Populate a dataframe with student_id and top 3 majors
rows = []
columns = ['student_id','major1','major2','major3']

for row in new_d:
    row_list = list(row)
    student_id = row_list[0][0]
    major_1 = row_list[0][1]
    major_2 = row_list[1][1]
    major_3 = row_list[2][1]
    row = [student_id, major_1, major_2, major_3]
    rows.append(row)

df_majors = pd.DataFrame(rows, columns=columns)
df_majors_st_index = df_majors.set_index('student_id')

In [83]:
df_eval_sorted = df_eval.sort_values(by=['student_id'])
df_eval_sorted_index = df_eval_sorted.set_index('student_id')

In [84]:
df_eval_final_merged = pd.merge(df_eval_sorted_index, df_majors_st_index, on='student_id')

In [85]:
df_eval_final_merged_dropped = df_eval_final_merged.drop(columns=['major1_x', 'major2_x',
       'major3_x'])

In [86]:
df_eval_final_merged_dropped.columns = ['level','course','grade','major1','major2','major3']

### Writing to pred.psv.file

In [87]:
# writing to pred.psv file
df_eval_final_merged_dropped.to_csv('pred.psv', sep='|')

# Thank you!!!