In [1]:
import sklearn
import pandas as pd
import math

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

In [2]:
class Entry:
    """Lightweight record whose attributes are exactly the keyword arguments given."""

    def __init__(self, **kwargs):
        # Copy every keyword argument onto the instance as an attribute.
        vars(self).update(kwargs)

In [3]:
# Read at most the first 1,000,000 rows of the training CSV.
df = pd.read_csv(
    'data/shifted_train.csv',
    nrows=1_000_000,
)

In [4]:
# Pull each column out once as a plain Python list so the row-wise pass
# below works on native values instead of going through pandas per row.
u_ids = df['user_id'].values.tolist()
c_ids = df['content_id'].values.tolist()
tc_ids = df['task_container_id'].values.tolist()
acs = df['answered_correctly'].values.tolist()
ets = df['elapsed_time'].values.tolist()
hes = df['had_explanation'].values.tolist()

# One Entry per row. Missing values are normalised on the way in:
#   * elapsed_time NaN   -> 0.0
#   * had_explanation NaN -> False (otherwise True only when the value equals 1.0)
#   * answered_correctly  -> True only when the raw value equals 1
entries = [
    Entry(
        user_id=uid,
        content_id=cid,
        task_container_id=tcid,
        answered_correctly=(ac == 1),
        elapsed_time=et_raw if not math.isnan(et_raw) else 0.0,
        had_explanation=(not math.isnan(he_raw)) and he_raw == 1.0,
    )
    for uid, cid, tcid, ac, et_raw, he_raw in zip(u_ids, c_ids, tc_ids, acs, ets, hes)
]

In [5]:
# Hold out 30% of the entries for evaluation; the fixed seed makes the
# shuffle (and therefore the split) repeatable.
training_data, test_data = train_test_split(
    entries,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Report how many entries landed on each side of the split.
print(
    f'training_data count: {len(training_data)}',
    f'test_data count: {len(test_data)}',
    sep='\n',
)

training_data count: 700000
test_data count: 300000


In [7]:
# Separate model inputs (X) from the target label (Y) for both splits.
def _features_of(entry):
    """Return the input features of ``entry`` as a dict, leaving out the label."""
    return {
        'user_id': entry.user_id,
        'content_id': entry.content_id,
        'task_container_id': entry.task_container_id,
        'elapsed_time': entry.elapsed_time,
        'had_explanation': entry.had_explanation,
    }


train_x = [_features_of(entry) for entry in training_data]
train_y = [entry.answered_correctly for entry in training_data]

test_x = [_features_of(entry) for entry in test_data]
test_y = [entry.answered_correctly for entry in test_data]

In [8]:
# Class balance of the training labels.
n_correct = train_y.count(True)
n_incorrect = train_y.count(False)
print(f'correct: {n_correct}')
print(f'incorrect: {n_incorrect}')

correct: 456034
incorrect: 243966


In [9]:
# Turn the feature dicts into matrices the classifiers can consume.
# The vectorizer is fitted on the training dicts only ...
vectorizer = DictVectorizer()
train_x_vectors = vectorizer.fit_transform(train_x)
# ... and the already-fitted mapping is reused (not refitted) for the test dicts.
test_x_vectors = vectorizer.transform(test_x)

In [10]:
# Logistic-regression baseline, with the iteration cap set to 1000.
# NOTE(review): no scaling step is visible between the vectorizer and this
# fit, so ID columns appear to enter the model as raw numeric magnitudes —
# confirm that is intended.
log_clf = LogisticRegression(max_iter=1000)
log_clf.fit(train_x_vectors, train_y)

LogisticRegression(max_iter=1000)

In [11]:
# Decision-tree baseline trained on the same vectors as the logistic model.
# random_state is pinned (same seed as the train/test split) because the tree
# builder randomly permutes features at each split; without a fixed seed the
# fitted tree, and the accuracy reported below, can change from run to run.
dec_clf = DecisionTreeClassifier(random_state=42)
dec_clf.fit(train_x_vectors, train_y)

DecisionTreeClassifier()

In [12]:
# Mean accuracy of each fitted model on the held-out split.
log_accuracy = log_clf.score(test_x_vectors, test_y)
tree_accuracy = dec_clf.score(test_x_vectors, test_y)
print(f'LogisticRegression accuracy: {log_accuracy}')
print(f'DecisionTree accuracy: {tree_accuracy}')

LogisticRegression accuracy: 0.6511566666666667
DecisionTree accuracy: 0.6052533333333333


In [16]:
def _parse_bool(text):
    """Convert a typed answer such as 'True' / 'false' / '1' / 'no' to a bool.

    ``bool(text)`` cannot be used for this: every non-empty string is truthy,
    so typing "False" would be read as True.

    An empty answer is treated as False. Anything unrecognised raises
    ValueError, in line with how int()/float() reject bad input for the
    other prompts.
    """
    normalized = text.strip().lower()
    if normalized in ('true', 't', '1', 'yes', 'y'):
        return True
    if normalized in ('false', 'f', '0', 'no', 'n', ''):
        return False
    raise ValueError(f'expected True or False, got {text!r}')


# Prompt for one feature row and show the logistic model's prediction for it.
u_id = int(input('Enter user id: '))
c_id = int(input('Enter content id: '))
tc_id = int(input('Enter task container id: '))
et = float(input('Enter elapsed time: '))
he = _parse_bool(input('Enter whether the question had explanation or not (True|False): '))

input_dict = dict(
    user_id=u_id,
    content_id=c_id,
    task_container_id=tc_id,
    elapsed_time=et,
    had_explanation=he
)
# Encode with the vectorizer fitted on the training data so the columns
# line up with what the model was trained on.
input_vector = vectorizer.transform(input_dict)
print(log_clf.predict(input_vector))        # predicted label
print(log_clf.predict_proba(input_vector))  # per-class probabilities

[ True]
[[0.4154677 0.5845323]]


In [21]:
# User ids to probe; the loop below passes each one as `user_id` while
# holding every other feature fixed.
ids = [115, 124, 2746, 5382, 8623, 8701, 12741, 13134, 24418, 24600, 32421, 40828, 44331, 45001, 46886, 50132, 51285, 53842, 81002, 81429, 91216, 99521, 107002, 108310, 128919, 137455, 138650, 140969, 141455, 142896, 146023, 146403, 157207, 163243, 165081, 166728, 174754, 176102, 176303, 178445, 206168, 215672, 220268, 238966, 239323, 246496, 247749, 251201, 260489, 275762, 286187, 287029, 290191, 297533, 298022, 301590, 318683, 327181, 332303, 341420, 355138, 357056, 357865, 359283, 364452, 364932, 366174, 368624, 371296, 375732, 377398, 381754, 382571, 384661, 384745, 385548, 385630, 400241, 403483, 408119, 408250, 410315, 422628, 427355, 438357, 443506, 444790, 449925, 457531, 459017, 469162, 475020, 478631, 480368, 496840, 508795, 510560, 513434, 531584, 537732, 538701, 576369, 579346, 581706, 583550, 592864, 595960, 613296, 616513, 637773, 642997, 650076, 650467, 653826, 660672, 671022, 678363, 686708, 700316, 702591, 712805, 713424, 715253, 736891, 741296, 745460, 750330, 752718, 758895, 766758, 786638, 786789, 800824, 804652, 807172, 808053, 814879, 823804, 837033, 839109, 839139, 839808, 841970, 859071, 859265, 872693, 900423, 901198, 904515, 904619, 906666, 921758, 934487, 935388, 939887, 952772, 975595, 983864, 986231, 993627, 994085, 999270, 999548, 999788, 1001651, 1017117, 1024273, 1028074, 1036195, 1047929, 1050559, 1055635, 1056138, 1060266, 1067196, 1072296, 1084314]

# Predict for each probe user against one fixed question context
# (content 6908, task container 14, 16.0 elapsed, no explanation).
# The loop variable is deliberately not called `id`: at notebook top level
# that would rebind the builtin id() for every cell that runs afterwards.
# NOTE(review): user_id appears to reach the model as a plain number, so the
# printed probabilities drift smoothly with the id's magnitude rather than
# reflecting per-user behaviour — confirm this is intended.
for probe_user_id in ids:
    input_dict = dict(
        user_id=probe_user_id,
        content_id=6908,
        task_container_id=14,
        elapsed_time=16.0,
        had_explanation=False
    )
    input_vector = vectorizer.transform(input_dict)
    print(log_clf.predict(input_vector), log_clf.predict_proba(input_vector))

[ True] [[0.42585332 0.57414668]]
[ True] [[0.42585326 0.57414674]]
[ True] [[0.4258354 0.5741646]]
[ True] [[0.42581745 0.57418255]]
[ True] [[0.42579538 0.57420462]]
[ True] [[0.42579485 0.57420515]]
[ True] [[0.42576734 0.57423266]]
[ True] [[0.42576467 0.57423533]]
[ True] [[0.42568783 0.57431217]]
[ True] [[0.4256866 0.5743134]]
[ True] [[0.42563334 0.57436666]]
[ True] [[0.42557611 0.57442389]]
[ True] [[0.42555226 0.57444774]]
[ True] [[0.42554769 0.57445231]]
[ True] [[0.42553486 0.57446514]]
[ True] [[0.42551276 0.57448724]]
[ True] [[0.42550491 0.57449509]]
[ True] [[0.4254875 0.5745125]]
[ True] [[0.42530261 0.57469739]]
[ True] [[0.4252997 0.5747003]]
[ True] [[0.42523308 0.57476692]]
[ True] [[0.42517655 0.57482345]]
[ True] [[0.42512563 0.57487437]]
[ True] [[0.42511673 0.57488327]]
[ True] [[0.42497646 0.57502354]]
[ True] [[0.42491837 0.57508163]]
[ True] [[0.42491024 0.57508976]]
[ True] [[0.42489445 0.57510555]]
[ True] [[0.42489115 0.57510885]]
[ True] [[0.42488134 0