***Imports***

In [316]:
import pandas  as pd
import numpy   as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

***Load Data***

In [317]:
# Read the three competition CSVs in one pass; the tuple order below matches the
# order of the names on the left-hand side.
test_set, train_set, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        'predict-student-performance-from-game-play/test.csv',
        'predict-student-performance-from-game-play/train.csv',
        'predict-student-performance-from-game-play/train_labels.csv',
    )
)

In [318]:
# Discard the columns considered unimportant: the row counter `index` in both
# frames, plus `session_level`, which is dropped from the test file only.
train_set = train_set.drop(columns=['index'])
test_set = test_set.drop(columns=['index', 'session_level'])

***Split Label Data into Session Id and Question***

In [319]:
# The label ids are expected to have the form '<session_id>_q<question>'.
# Split them into their two parts in one vectorised pass rather than appending
# to module-level lists from inside `apply`.
label_parts = train_labels['session_id'].str.split('_q', n=1, expand=True)

# Fail loudly if any id does not follow the expected format, instead of
# silently producing missing values.
assert label_parts.shape[1] == 2 and label_parts.notna().all().all(), \
    "unexpected session_id format in train_labels"

# Replace the combined id with separate `session_id` and `question` columns.
# Both are still strings at this point. `assign` aligns on the index, so the
# parts always land on the row they were split from.
train_labels = train_labels.drop(columns=['session_id']).assign(
    session_id=label_parts[0],
    question=label_parts[1],
)

***Creating One Row per Question and Session Id for Train Data***

In [320]:
# One-hot encode event_name. pop() hands back the column and removes it from
# train_set in the same step, so the raw text column is gone before the
# indicator columns are put in front of the remaining ones.
dummies = pd.get_dummies(train_set.pop('event_name'), dtype=int)
train_set = pd.concat((dummies, train_set), axis=1)

In [321]:
# Columns kept for modelling: the two grouping keys, one indicator column per
# event type, and the elapsed time.
column_names = (
    ['session_id', 'level_group']
    + [
        'checkpoint',
        'cutscene_click',
        'map_click',
        'map_hover',
        'navigate_click',
        'notebook_click',
        'notification_click',
        'object_click',
        'object_hover',
        'observation_click',
        'person_click',
    ]
    + ['elapsed_time']
)

In [322]:
# Collapse the event log to a single row per (session_id, level_group) by
# averaging every selected column within the group.
train = (
    train_set.loc[:, column_names]
    .groupby(by=['session_id', 'level_group'], as_index=False)
    .mean()
)

In [323]:
# Convert the id parts from strings to integers so they can be compared and
# merged with the numeric session_id of the aggregated data.
# The width is pinned to int64 on purpose: the builtin `int` maps to a
# platform-dependent NumPy integer (32-bit on some platforms), and session ids
# such as 20090312431273200 do not fit in 32 bits.
train_labels = train_labels.astype({'session_id': 'int64', 'question': 'int64'})

In [324]:
# Slice the aggregated rows into one frame per level group.
level_1 = train.loc[train['level_group'].eq('0-4')]
level_2 = train.loc[train['level_group'].eq('5-12')]
level_3 = train.loc[train['level_group'].eq('13-22')]

In [326]:
# Partition the labels by question number using the same open intervals as
# before: (0, 4), (3, 14) and (13, 19), one per level group.
label_level_1 = train_labels.query('0 < question < 4')
label_level_2 = train_labels.query('3 < question < 14')
label_level_3 = train_labels.query('13 < question < 19')

In [327]:
# Attach each level group's labels to its aggregated features. This is an inner
# join on session_id, so every feature row is repeated once per matching label.
level_1 = level_1.merge(label_level_1, how='inner', on='session_id')
level_2 = level_2.merge(label_level_2, how='inner', on='session_id')
level_3 = level_3.merge(label_level_3, how='inner', on='session_id')

In [328]:
# Stack the three level-group frames into one training table.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# frame keeps its own 0-based index, so the combined table carries repeated
# index labels and is built differently from the test table.
train_data = pd.concat([level_1, level_2, level_3], axis=0, ignore_index=True)

In [329]:
# Preview the first five rows of the assembled training table.
train_data.head(n=5)

Unnamed: 0,session_id,level_group,checkpoint,cutscene_click,map_click,map_hover,navigate_click,notebook_click,notification_click,object_click,object_hover,observation_click,person_click,elapsed_time,correct,question
0,20090312431273200,0-4,0.006061,0.169697,0.012121,0.024242,0.490909,0.0,0.048485,0.066667,0.024242,0.024242,0.133333,85793.557576,1,1
1,20090312431273200,0-4,0.006061,0.169697,0.012121,0.024242,0.490909,0.0,0.048485,0.066667,0.024242,0.024242,0.133333,85793.557576,1,2
2,20090312431273200,0-4,0.006061,0.169697,0.012121,0.024242,0.490909,0.0,0.048485,0.066667,0.024242,0.024242,0.133333,85793.557576,1,3
3,20090312433251036,0-4,0.007194,0.258993,0.021583,0.021583,0.352518,0.014388,0.035971,0.107914,0.035971,0.014388,0.129496,97633.417266,0,1
4,20090312433251036,0-4,0.007194,0.258993,0.021583,0.021583,0.352518,0.014388,0.035971,0.107914,0.035971,0.014388,0.129496,97633.417266,1,2


***Creating One Row per Question and Session Id for Test Data***

In [330]:
# create dummies for event name column
dummies = pd.get_dummies(test_set['event_name'], dtype=int)

# get_dummies only creates a column for event names that actually occur in this
# file. If an event type never appears in the test log, its column would be
# missing and the later selection of model columns would raise a KeyError.
# Add every missing event column as all zeros, meaning the event never happened.
expected_events = ['checkpoint', 'cutscene_click', 'map_click', 'map_hover',
                   'navigate_click', 'notebook_click', 'notification_click',
                   'object_click', 'object_hover', 'observation_click',
                   'person_click']
absent_events = [event for event in expected_events if event not in dummies.columns]
dummies = dummies.assign(**{event: 0 for event in absent_events})

test_set.drop(['event_name'], axis=1, inplace=True)
test_set = pd.concat([dummies, test_set], axis=1)

In [331]:
# Columns kept for modelling: the two grouping keys, one indicator column per
# event type, and the elapsed time.
column_names = (
    ['session_id', 'level_group']
    + [
        'checkpoint',
        'cutscene_click',
        'map_click',
        'map_hover',
        'navigate_click',
        'notebook_click',
        'notification_click',
        'object_click',
        'object_hover',
        'observation_click',
        'person_click',
    ]
    + ['elapsed_time']
)

In [332]:
# Collapse the event log to a single row per (session_id, level_group) by
# averaging every selected column within the group.
test = (
    test_set.loc[:, column_names]
    .groupby(by=['session_id', 'level_group'], as_index=False)
    .mean()
)

In [333]:
# Build the (session_id, question) pairs to predict: every question for every
# *distinct* session.
#
# `test` holds one row per (session_id, level_group), so a session appears in
# it once per level group. Generating questions from the raw session_id column
# would therefore emit each pair several times, and after the merges the
# submission would contain duplicated rows. Using the unique ids avoids that.
#
# Questions run from 1 to 18. Question 0 is not generated because the
# level-group filters applied to test_labels only keep questions above 0.
test_labels = pd.MultiIndex.from_product(
    [test['session_id'].unique(), range(1, 19)],
    names=['session_id', 'question'],
).to_frame(index=False)

# Each pair must now occur exactly once.
assert not test_labels.duplicated().any()

In [334]:
# Slice the aggregated rows into one frame per level group.
level_1 = test.loc[test['level_group'].eq('0-4')]
level_2 = test.loc[test['level_group'].eq('5-12')]
level_3 = test.loc[test['level_group'].eq('13-22')]

In [335]:
# Partition the question rows by question number using the same open intervals
# as before: (0, 4), (3, 14) and (13, 19), one per level group.
label_level_1 = test_labels.query('0 < question < 4')
label_level_2 = test_labels.query('3 < question < 14')
label_level_3 = test_labels.query('13 < question < 19')

In [336]:
# Attach each level group's question rows to its aggregated features. This is
# an inner join on session_id, so every feature row is repeated once per
# matching question.
level_1 = level_1.merge(label_level_1, how='inner', on='session_id')
level_2 = level_2.merge(label_level_2, how='inner', on='session_id')
level_3 = level_3.merge(label_level_3, how='inner', on='session_id')

In [337]:
# Stack the three level-group frames row-wise into one test table with a fresh
# 0..n-1 index.
test_data = pd.concat((level_1, level_2, level_3), ignore_index=True)

***Split Data into X and Y***

In [338]:
# session_id is only an identifier and level_group is a text label, so neither
# is passed to the model. `correct` is the target and is removed from the
# training features as well.
non_feature_columns = ['session_id', 'level_group']
y_train = train_data['correct']
x_train = train_data.drop(columns=non_feature_columns + ['correct'])
x_test = test_data.drop(columns=non_feature_columns)

***Train Model***

**Note:** A grid search would take too long to run on my computer, so we use a standard XGBoost classifier with its default hyperparameters instead.

In [339]:
# XGBoost classifier with library-default hyperparameters (no tuning).
model = xgb.XGBClassifier()

# Fit on the training features and the `correct` target. The call is left as
# the cell's last expression so the fitted estimator is displayed.
model.fit(X=x_train, y=y_train)

***Run on Test Set***

In [340]:
# Predict the `correct` target for every (session, question) row of x_test.
# The result is in the same row order as test_data, which x_test was built from.
predictions = model.predict(x_test)

***Creating Submission for Kaggle***

In [341]:
def combine_session_question(row):
    """Return the submission id '<session_id>_q<question>' for one row.

    `row` is any mapping-like object exposing 'session_id' and 'question'.
    """
    return f"{row['session_id']}_q{row['question']}"

# Overwrite session_id with the combined id, computed row by row.
test_data['session_id'] = test_data.apply(combine_session_question, axis=1)

In [342]:
# Assemble the two submission columns, then write them without the index.
submission_columns = {
    "session_id": test_data['session_id'],
    "correct": predictions,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)