In [1]:
import pandas as pd

In [2]:
lecture_df = pd.read_csv("data/lectures.csv")
question_df = pd.read_csv("data/questions.csv")
response_df = pd.read_csv("data/responses.csv")
train_df = pd.read_csv("data/short_train.csv")

### Split Train Data

Split the train data on `content_type_id` (whether it was an exam or a lecture)

In [3]:
train_exam_filter = train_df['content_type_id'] == 0
train_lecture_filter = train_df['content_type_id'] == 1

train_exams_df = train_df[train_exam_filter]
train_lectures_df = train_df[train_lecture_filter]

Print a report

In [4]:
print('Exams', end='\n=====\n')
print(f'row count: {len(train_exams_df)}')
print(f'user count: {len(train_exams_df.groupby(["user_id"]))}')

print()

print('Lectures', end='\n========\n')
print(f'row count: {len(train_lectures_df)}')
print(f'user count: {len(train_lectures_df.groupby(["user_id"]))}')

Exams
=====
row count: 343298
user count: 1215

Lectures
row count: 6701
user count: 505


### Training the algorithm!

Define helper classes

In [1]:
class Entry:
    def __init__(self, **kwargs):
        self.__dict__ = kwargs

Read from CSV

In [2]:
path = 'data/slim_train.csv'
entries = []
with open(path, 'r') as file:
    for i, line in enumerate(file):
        # Skip header
        if i == 0:
            continue

        u_id, c_id, tc_id, answered_correctly, pqet, pqhe = line.split(',')

        entries.append(Entry(
            user_id=u_id,
            content_id=c_id,
            task_container_id=tc_id,
            answered_correctly=True if answered_correctly == '1' else False,
            prior_question_elapsed_time=float(pqet),
            prior_question_had_explanation=bool(pqhe)
        ))

In [3]:
print(f'Entry count: {len(entries)}')

Entry count: 343298


Prepare data

In [4]:
from sklearn.model_selection import train_test_split

training_data, test_data = train_test_split(entries, test_size=0.33, random_state=42)

print(f'training_data count: {len(training_data)}')
print(f'test_data count: {len(test_data)}')

training_data count: 230009
test_data count: 113289


Split into X and Y

In [5]:
# Train
train_x = [dict(
    user_id=entry.user_id,
    content_id=entry.content_id,
    task_container_id=entry.task_container_id,
    prior_question_elapsed_time=entry.prior_question_elapsed_time,
    prior_question_had_explanation=entry.prior_question_had_explanation
) for entry in training_data]

train_y = [entry.answered_correctly for entry in training_data]

# Test
test_x = [dict(
    user_id=entry.user_id,
    content_id=entry.content_id,
    task_container_id=entry.task_container_id,
    prior_question_elapsed_time=entry.prior_question_elapsed_time,
    prior_question_had_explanation=entry.prior_question_had_explanation
) for entry in test_data]

test_y = [entry.answered_correctly for entry in test_data]

In [6]:
print(f'correct: {train_y.count(True)}')
print(f'incorrect: {train_y.count(False)}')

correct: 153737
incorrect: 76272


Determine the best algorithm

In [7]:
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()

train_x_vectors = vectorizer.fit_transform(train_x)
test_x_vectors = vectorizer.transform(test_x)

Decision Tree Classifier

In [8]:
from sklearn.tree import DecisionTreeClassifier

dec_cls = DecisionTreeClassifier()
dec_cls.fit(train_x_vectors, train_y)

print(f'DecisionTreeClassifier accuracy: {dec_cls.score(test_x_vectors, test_y)}')

DecisionTreeClassifier accuracy: 0.6572571035140217


Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

log_cls = LogisticRegression(max_iter=2000)
log_cls.fit(train_x_vectors, train_y)

print(f'LogisticRegression accuracy: {log_cls.score(test_x_vectors, test_y)}')

LogisticRegression accuracy: 0.7117372383903116


### Test the algorithm!

In [15]:
u_id = input('user id: ')
c_id = input('content id: ')
tc_id = input('task container id: ')
pqet = float(input('prior question elapsed time: '))
pqhe = bool(input('prior question had explanation: '))

input_dict = dict(
    user_id=u_id,
    content_id=c_id,
    task_container_id=tc_id,
    prior_question_elapsed_time=pqet,
    prior_question_had_explanation=pqhe
)
input_vector = vectorizer.transform(input_dict)

prediction = log_cls.predict(input_vector)
print(f'The user is predicted to answer: {"correctly" if prediction[0] == 1 else "incorrectly"}')

The user is predicted to answer: correctly


Test every user!

In [10]:
users = [115,124,2746,5382,8623,8701,12741,13134,24418,24600,32421,40828,44331,45001,46886,50132,51285,53842,81002,81429,91216,99521,107002,108310,128919,137455,138650,140969,141455,142896,146023,146403,157207,163243,165081,166728,174754,176102,176303,178445,206168,215672,220268,238966,239323,246496,247749,251201,260489,275762,286187,287029,290191,297533,298022,301590,318683,327181,332303,341420,355138,357056,357865,359283,364452,364932,366174,368624,371296,375732,377398,381754,382571,384661,384745,385548,385630,400241,403483,408119,408250,410315,422628,427355,438357,443506,444790,449925,457531,459017,469162,475020,478631,480368,496840,508795,510560,513434,531584,537732,538701,576369,579346,581706,583550,592864,595960,613296,616513,637773,642997,650076,650467,653826,660672,671022,678363,686708,700316,702591,712805,713424,715253,736891,741296,745460,750330,752718,758895,766758,786638,786789,800824,804652,807172,808053,814879,823804,837033,839109,839139,839808,841970,859071,859265,872693,900423,901198,904515,904619,906666,921758,934487,935388,939887,952772,975595,983864,986231,993627,994085,999270,999548,999788,1001651,1017117,1024273,1028074,1036195,1047929,1050559,1055635,1056138,1060266,1067196,1072296,1084314,1095490,1095916,1097555,1102335,1108148,1108398,1110924,1111114,1121464,1123921,1124549,1135162,1161058,1163195,1167105,1173490,1181558,1186307,1189852,1192857,1195093,1214143,1214923,1219129,1222551,1228203,1232090,1243813,1244479,1250518,1250993,1254732,1255931,1260634,1263038,1272775,1273434,1273629,1282581,1282716,1283277,1283420,1290970,1298591,1308136,1321608,1335448,1345698,1359985,1360462,1369719,1370238,1375288,1376415,1379421,1380215,1384495,1388172,1400354,1409904,1411218,1413699,1416488,1440282,1455954,1459301,1470345,1474048,1483599,1502650,1504352,1510277,1512827,1515810,1520417,1524501,1524635,1526071,1527051,1533993,1534072,1534292,1544271,1546298,1547989,1551204,1567171,1567938,1581647,1586606,1602506,1603280,1606522,1609213,1609709,1615012,1618701,1638301,1647850,1648234,1657235,1666114,1668190,1674014,1681618,1707189,1710599,1713365,1716032,1722494,1724578,1726369,1733698,1744476,1746406,1749842,1760454,1765772,1772157,1779810,1807727,1820155,1822782,1823042,1824394,1829646,1830849,1838802,1857371,1864305,1864702,1880240,1890634,1891098,1891537,1904297,1906069,1917926,1919994,1923215,1927975,1946295,1950750,1959138,1960671,1962155,1963321,1980358,1984659,1985581,1989232,1996246,1998041,2000859,2013856,2019012,2019581,2026545,2026867,2033195,2034673,2035363,2035385,2038377,2041025,2048552,2051486,2058478,2066347,2067931,2073472,2078569,2090096,2090341,2094250,2098314,2101969,2108174,2109735,2116773,2121898,2122895,2124676,2127031,2134565,2136430,2138673,2140113,2146814,2148001,2160279,2181507,2184424,2187115,2190876,2198581,2199283,2201142,2201910,2206263,2210934,2211492,2212092,2220665,2223671,2226945,2246532,2250258,2252190,2252495,2257029,2263186,2265761,2273870,2275514,2277698,2280091,2285289,2301161,2305556,2309014,2315569,2318167,2320817,2327458,2332767,2345604,2349960,2354314,2355089,2359213,2360716,2362760,2364058,2370369,2377212,2377465,2378743,2381110,2393480,2393889,2394320,2402340,2417607,2418486,2433088,2435242,2445536,2446896,2472417,2473833,2475583,2478069,2489029,2490618,2491722,2497679,2499940,2499957,2500046,2504064,2516963,2524804,2533288,2546612,2547741,2552333,2559982,2562543,2568556,2586255,2601277,2606903,2612798,2618102,2626975,2627685,2631645,2634951,2641711,2655110,2659874,2674477,2677424,2678068,2685790,2686542,2689755,2694314,2695529,2706752,2712966,2718040,2720640,2721338,2722402,2724232,2726001,2727628,2731864,2738513,2741118,2774003,2777604,2786910,2792018,2799534,2799989,2800641,2808857,2844993,2852336,2852406,2859180,2861500,2874845,2879539,2891201,2891629,2892181,2897213,2898560,2909984,2916765,2942506,2943353,2944069,2953506,2955768,2957606,2963722,2966225,2966556,2967284,2976463,2982074,2990844,2992067,3000895,3002493,3004508,3014605,3016994,3035195,3036393,3036976,3039645,3040590,3044483,3051799,3058657,3058874,3058936,3065819,3071227,3075668,3077264,3083660,3084990,3091549,3093025,3095072,3098296,3112157,3117784,3117954,3126556,3127066,3129599,3132568,3148190,3157163,3168913,3171221,3175558,3180340,3182327,3189816,3192663,3194563,3194974,3200883,3202065,3202837,3203826,3216089,3218907,3222754,3224135,3228511,3230528,3232998,3233182,3234484,3237407,3240297,3242395,3254436,3258925,3259641,3259772,3270696,3276462,3279438,3286414,3297385,3311379,3312620,3320698,3334852,3337145,3338062,3355640,3357157,3375969,3381435,3382901,3392024,3392192,3393446,3400938,3447140,3456507,3456709,3457883,3459591,3460877,3470966,3488047,3489479,3499996,3502397,3504886,3504999,3505219,3515794,3523153,3532660,3536255,3538306,3539143,3539494,3549886,3550735,3551211,3551713,3553144,3554489,3556553,3559456,3559994,3569270,3573869,3574310,3580801,3609410,3621581,3622588,3623869,3624867,3633096,3637398,3637566,3645220,3646751,3654869,3667421,3673530,3673674,3676531,3680487,3685556,3686194,3695434,3700234,3706105,3716263,3717734,3725210,3728845,3731308,3734133,3734286,3737252,3737635,3742917,3749425,3754707,3754835,3755072,3758080,3763060,3770381,3777688,3782245,3783498,3783593,3794853,3797536,3809443,3811459,3820022,3824269,3832313,3833713,3838215,3839235,3841248,3848411,3852135,3852517,3855228,3883025,3890514,3891860,3892258,3892809,3893882,3897551,3901202,3910741,3914038,3915694,3916557,3916843,3918822,3920956,3924575,3929529,3929947,3943073,3945748,3950247,3952154,3955273,3958554,3963692,3972985,3975305,3981203,4010487,4022163,4028031,4037741,4041502,4058809,4061710,4068433,4069033,4071515,4075805,4086296,4088746,4089118,4108706,4109418,4111967,4112497,4118986,4125853,4135123,4142819,4146738,4153180,4161343,4170283,4172929,4191330,4191666,4194123,4201341,4202849,4203387,4205142,4205487,4222014,4222121,4226501,4231506,4241441,4247518,4247900,4254617,4259742,4261902,4267431,4272579,4275585,4276700,4280793,4288302,4298473,4306379,4312206,4318126,4319902,4324252,4325436,4335204,4344059,4348980,4355729,4357812,4361868,4365442,4367120,4367482,4368485,4368578,4386544,4387340,4387973,4392914,4396904,4401867,4408207,4417578,4420052,4420772,4421282,4424604,4433051,4437138,4441448,4465529,4477758,4479305,4482106,4486679,4488748,4495323,4496682,4496897,4498487,4500056,4500062,4500555,4500733,4505650,4508124,4508865,4525555,4529557,4530784,4533159,4539824,4550430,4553545,4577710,4580228,4589608,4593213,4601606,4605758,4607918,4608866,4618285,4624404,4625823,4631744,4638444,4650630,4663218,4663746,4669973,4672258,4678177,4679108,4688030,4699603,4700718,4702585,4703676,4711487,4719038,4719044,4727295,4732621,4734982,4740322,4742725,4746807,4751957,4754823,4757448,4760534,4761854,4769798,4774691,4777226,4780783,4822992,4827119,4828676,4828831,4848373,4857591,4861087,4869642,4872589,4876851,4877689,4879655,4883569,4887915,4892105,4902433,4905842,4911742,4927430,4964431,4968367,4973404,4977944,4980312,4990101,4992232,4994712,5000764,5002570,5007669,5009982,5013438,5017696,5050044,5062568,5087251,5091534,5092689,5103696,5105962,5107242,5109209,5115680,5115708,5116652,5126768,5129280,5133856,5140808,5159307,5161303,5161865,5166745,5167527,5182123,5194169,5199624,5211764,5212343,5219438,5223772,5229096,5239268,5240484,5241824,5243414,5243685,5246130,5253841,5256034,5257606,5268779,5271908,5274351,5275458,5282648,5289290,5294077,5294538,5295862,5298602,5298945,5321356,5336804,5340888,5347693,5350678,5355138,5365886,5367469,5367887,5369363,5373028,5381429,5386918,5393013,5394783,5397974,5401739,5405873,5430309,5432884,5440876,5443276,5452162,5452424,5461352,5465869,5468528,5471456,5481709,5482443,5488538,5491525,5493032,5494272,5496538,5496582,5504898,5508867,5521635,5524479,5529131,5541659,5545206,5545246,5554599,5555086,5566848,5568049,5571864,5577726,5587504,5588426,5596216,5603990,5612057,5613226,5615405,5625005,5631064,5643863,5648961,5653550,5657069,5657619,5659730,5661883,5664530,5665691,5666234,5697045,5698774,5699922,5705191,5712442,5715671,5715978,5719795,5720138,5720527,5729623,5745276,5749481,5766705,5768348,5768818,5768827,5783953,5794423,5794974,5796619,5803884,5819047,5822095,5823737,5831104,5832290,5840174,5849513,5852242,5855388,5857445,5861239,5894860,5902066,5903091,5904043,5909033,5911390,5923866,5923971,5941252,5946615,5947535,5953103,5954309,5972782,5975334,5984062,5984145,5985473,5986081,5987022,5987155,5987358,5989507,5989871,5996381,6002516,6004395,6008840,6028332,6035664,6039294,6041189,6047100,6052466,6059050,6070571,6078930,6079360,6087021,6092321,6093633,6100175,6101121,6102209,6104827,6105237,6105755,6109093,6139623,6141989,6144210,6146089,6151236,6159512,6163436,6164893,6177254,6184167,6187160,6201249,6204411,6205167,6208940,6211562,6215754,6254437,6266505,6266926,6269940,6270786,6279660,6288794,6290796,6294495,6299064,6304093,6306969,6307977,6316868,6319973,6323515,6323940,6324620,6333188,6333347,6339750,6340470,6344179,6347696,6356553,6382927,6398304,6398411,6399693,6402376,6413607,6422505,6426333,6426638,6430826,6431206,6439675,6460668,6461994,6468176,6475477,6491507,6515663,6533997,6543065,6550576,6556061,6561930,6566272,6567267,6572875,6574103,6581221,6585649,6588478,6602914,6606022,6606704,6610089,6612081,6613379,6618842,6619628,6637249,6639020,6639643,6653505,6655619,6659026,6665521,6667656,6674708,6679467,6680475,6685611,6685983,6699502,6713711,6716582,6723050,6726434,6727022,6727176,6743765,6747827,6750717,6753706,6775391,6776431,6783624,6786424,6793968,6798601,6799950,6802969,6804274,6811009,6812883]

for user in users:
    input_dict = dict(
    user_id=user,
    content_id='1212',
    task_container_id='19',
    prior_question_elapsed_time=1347,
    prior_question_had_explanation=True)

    input_vector = vectorizer.transform(input_dict)
    prediction = log_cls.predict(input_vector)

    print(f'The user {user} is predicted to answer: {"correctly" if prediction[0] == 1 else "incorrectly"}')

The user 115 is predicted to answer: incorrectly
The user 124 is predicted to answer: incorrectly
The user 2746 is predicted to answer: incorrectly
The user 5382 is predicted to answer: incorrectly
The user 8623 is predicted to answer: incorrectly
The user 8701 is predicted to answer: incorrectly
The user 12741 is predicted to answer: incorrectly
The user 13134 is predicted to answer: incorrectly
The user 24418 is predicted to answer: incorrectly
The user 24600 is predicted to answer: incorrectly
The user 32421 is predicted to answer: incorrectly
The user 40828 is predicted to answer: incorrectly
The user 44331 is predicted to answer: incorrectly
The user 45001 is predicted to answer: incorrectly
The user 46886 is predicted to answer: incorrectly
The user 50132 is predicted to answer: incorrectly
The user 51285 is predicted to answer: incorrectly
The user 53842 is predicted to answer: incorrectly
The user 81002 is predicted to answer: incorrectly
The user 81429 is predicted to answer: 