In [1]:
import pandas as pd

In [2]:
lecture_df = pd.read_csv("data/lectures.csv")
question_df = pd.read_csv("data/questions.csv")
response_df = pd.read_csv("data/responses.csv")
train_df = pd.read_csv("data/short_train.csv")

### Split Train Data

Split the train data on `content_type_id` (whether it was an exam or a lecture)

In [3]:
train_exam_filter = train_df['content_type_id'] == 0
train_lecture_filter = train_df['content_type_id'] == 1

train_exams_df = train_df[train_exam_filter]
train_lectures_df = train_df[train_lecture_filter]

Print a report

In [4]:
print('Exams', end='\n=====\n')
print(f'row count: {len(train_exams_df)}')
print(f'user count: {len(train_exams_df.groupby(["user_id"]))}')

print()

print('Lectures', end='\n========\n')
print(f'row count: {len(train_lectures_df)}')
print(f'user count: {len(train_lectures_df.groupby(["user_id"]))}')

Exams
=====
row count: 343298
user count: 1215

Lectures
row count: 6701
user count: 505


In [6]:
class User:
    def __init__(self, user_id, content_id, task_container_id, answered_correctly, prior_question_elapsed_time, prior_question_had_explanation):
        self.user_id = user_id
        self.content_id = content_id
        self.task_container_id = task_container_id
        self.answered_correctly = answered_correctly
        self.prior_question_elapsed_time = prior_question_elapsed_time
        self.prior_question_had_explanation = prior_question_had_explanation
        

In [10]:
import json

file_name = './data/slim_train.csv'
users = []
with open(file_name, 'r', encoding="utf-8") as in_file:
    for i, line in enumerate(in_file):

        # Skip headers
        if i == 0:
            continue

        line = line.split(',')

        user_id = int(line[0])
        content_id = int(line[1])
        task_container_id = int(line[2])
        answered_correctly = int(line[3]) == 1
        prior_question_elapsed_time = int(line[4])
        prior_question_had_explanation = bool(line[5])

        users.append(User(user_id, content_id, task_container_id, answered_correctly, prior_question_elapsed_time, prior_question_had_explanation))

In [11]:
from sklearn.model_selection import train_test_split

training_data, test_data = train_test_split(users, test_size=0.33, random_state=42)

print(len(training_data))
print(len(test_data))

230009
113289


In [8]:
user_id_train = [user.user_id for user in training_data]
timestamp_train = [user.timestamp for user in training_data]
answered_correctly_train = [user.answered_correctly for user in training_data]

user_id_test = [user.user_id for user in test_data]
timestamp_test = [user.timestamp for user in test_data]
answered_correctly_test = [user.answered_correctly for user in test_data]

In [10]:
print(f'answered correctly: {answered_correctly_train.count(True)}')
print(f'answered incorrectly: {answered_correctly_train.count(False)}')

answered correctly: 153737
answered incorrectly: 76272


### Training the algorithm!

Define helper classes

In [1]:
class Entry:
    def __init__(self, **kwargs):
        self.__dict__ = kwargs

Read from CSV

In [2]:
path = 'data/slim_train.csv'
entries = []
with open(path, 'r') as file:
    for i, line in enumerate(file):
        # Skip header
        if i == 0:
            continue

        u_id, c_id, tc_id, answered_correctly, pqet, pqhe = line.split(',')

        entries.append(Entry(
            user_id=u_id,
            content_id=c_id,
            task_container_id=tc_id,
            answered_correctly=True if answered_correctly == '1' else False,
            prior_question_elapsed_time=float(pqet),
            prior_question_had_explanation=bool(pqhe)
        ))

In [3]:
print(f'Entry count: {len(entries)}')

Entry count: 343298


Prepare data

In [4]:
from sklearn.model_selection import train_test_split

training_data, test_data = train_test_split(entries, test_size=0.33, random_state=42)

print(f'training_data count: {len(training_data)}')
print(f'test_data count: {len(test_data)}')

training_data count: 230009
test_data count: 113289


Split into X and Y

In [5]:
# Train
train_x = [dict(
    user_id=entry.user_id,
    content_id=entry.content_id,
    task_container_id=entry.task_container_id,
    prior_question_elapsed_time=entry.prior_question_elapsed_time,
    prior_question_had_explanation=entry.prior_question_had_explanation
) for entry in training_data]

train_y = [entry.answered_correctly for entry in training_data]

# Test
test_x = [dict(
    user_id=entry.user_id,
    content_id=entry.content_id,
    task_container_id=entry.task_container_id,
    prior_question_elapsed_time=entry.prior_question_elapsed_time,
    prior_question_had_explanation=entry.prior_question_had_explanation
) for entry in test_data]

test_y = [entry.answered_correctly for entry in test_data]

In [7]:
print(f'correct: {train_y.count(True)}')
print(f'incorrect: {train_y.count(False)}')

correct: 153737
incorrect: 76272


Determine the best algorithm

In [10]:
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

Decision Tree Classifier

In [12]:
from sklearn.tree import DecisionTreeClassifier

dec_cls = DecisionTreeClassifier()
dec_cls.fit(train_x_vectors, train_y)

print(f'DecisionTreeClassifier accuracy: {dec_cls.score(test_x_vectors, test_y)}')

DecisionTreeClassifier accuracy: 0.6550680118987722


Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression

log_cls = LogisticRegression(max_iter=2000)
log_cls.fit(train_x_vectors, train_y)

print(f'LogisticRegression accuracy: {log_cls.score(test_x_vectors, test_y)}')

LogisticRegression accuracy: 0.7117372383903116


### Test the algorithm!

In [20]:
u_id = input('user id: ')
c_id = input('content id: ')
tc_id = input('task container id: ')
pqet = float(input('prior question elapsed time: '))
pqhe = bool(input('prior question had explanation: '))

input_dict = dict(
    user_id=u_id,
    content_id=c_id,
    task_container_id=tc_id,
    prior_question_elapsed_time=pqet,
    prior_question_had_explanation=pqhe
)
input_vector = vectorizer.transform(input_dict)

prediction = log_cls.predict(input_vector)
print(f'The user is predicted to answer: {"correctly" if prediction[0] == 1 else "incorrectly"}')

The user is predicted to answer: correctly
