In [6]:
import gc
import pickle

# Modeling
import pandas as pd
import numpy as np
import lightgbm as lgb
from bitarray import bitarray
import pandas as pd
import datatable as dt
import elo

from tqdm import tqdm

# Columns

In [None]:
 analysis_columns = [
    # Ordered names of the feature-matrix columns. The order is significant:
    # create_train_data allocates len(analysis_columns) columns and fills them
    # by position, so entry k below names column k of that matrix.

    # Base variables (columns 0-5)
    'answered_correctly',
    'timestamp',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'task_container_id',
    'bundle_id',

    # Derived variables (columns 6-48)
    'has_seen_question',
    'user_correct_ratio',
    'content_correct_ratio',
    'correct_streak',
    'incorrect_streak',
    'user_total_question_count',
    'part_correct_ratio',
    'time_since_last_timestamp',
    'time_since_last_lecture', 
    'tag_correct_ratio',
    'previous_question_same_bundle',
    'previous_correct_and_same_bundle',
    'previous_lecture_type',
    'previous_lecture_id',
    'user_part_correct_ratio',
    'user_part_avg_elapsed_time',
    'user_part_timestamp_diff',
    'user_tag_correct_ratio',
    'user_tag_avg_elapsed_time',
    'user_tag_timestamp_diff',
    'question_history',
    'lecture_count',
    'questions_since_last_lec',
    'hardest_question_correct_ratio',
    'easiest_question_correct_ratio',
    'smallest_pqet',
    'largest_pqet',
    'user_rating',
    'content_rating',
    'highest_elo',
    'lowest_elo',
    'elo_user_content',
    'elo_content_user',
    'tag',
    'irt_difficulty',
    'irt_discrimination',
    'timestamp_n1',
    'timestamp_n2', 
    'timestamp_n3',
    'tag_difficulty',
    'tag_discrimination',
    'pqhe_true_correct_ratio',
    'pqhe_false_correct_ratio',
]

# Raw input columns selected from the interaction frame, in the order in which
# create_train_data reads them positionally from each row (row[0] .. row[7]).
base_cols = [
    'answered_correctly',              # row[0]
    'timestamp',                       # row[1]
    'user_id',                         # row[2]
    'content_id',                      # row[3]
    'prior_question_elapsed_time',     # row[4], may be NaN
    'prior_question_had_explanation',  # row[5], may be NaN
    'content_type_id',                 # row[6], 0 is treated as a question
    'task_container_id',               # row[7]
]

In [None]:
def rightshift(ba, count):
    """Return a copy of ``ba`` shifted ``count`` positions toward higher indices.

    The first ``count`` positions of the result are 0 and the last ``count``
    items of ``ba`` are dropped, so the result always has the same length as
    ``ba``. ``ba`` itself is not modified.

    Parameters
    ----------
    ba : bitarray
        Source bits. Only slicing, ``len``, ``+`` and ``setall`` are used.
    count : int
        Number of positions to shift. ``0`` returns an unchanged copy; values
        larger than ``len(ba)`` yield an all-zero array of the same length.

    Raises
    ------
    ValueError
        If ``count`` is negative.
    """
    if count < 0:
        raise ValueError('negative shift count')

    size = len(ba)
    # Clamp so the result never grows beyond the input length.
    count = min(count, size)

    # Build the zero padding from a slice of ``ba`` itself, so the padding has
    # the same type as the input instead of being a freshly built default
    # bitarray.
    padding = ba[:count]
    padding.setall(0)

    # ``ba[:size - count]`` (rather than ``ba[:-count]``) is also correct when
    # count == 0, where ``ba[:-0]`` would be empty.
    return padding + ba[:size - count]

class StatsTracker():
    """Accumulates running answer statistics per user, per content item and globally.

    State is kept in plain dictionaries:

    * ``self.users``   -- user_id -> user record (see ``_create_new_user``)
    * ``self.content`` -- content_id -> content record (see ``_create_new_content``)
    * ``self.meta``    -- global totals plus ``[correct, total]`` pairs per
      part index and per tag index

    Records are mutated in place by ``update_user`` and ``update_content``.
    """

    def __init__(self):
        self.users = {}
        self.content = {}
        self.meta = {
            'total_question_count'  : 0,
            'total_correct_count'   : 0,
            'parts'                 : [[0,0] for _ in range(7)],   # [correct, total] per part index
            'tags'                  : [[0,0] for _ in range(20)],  # [correct, total] per tag index
        }

    def _create_new_user(self, user_id):
        """Build the initial record for a user that has not been seen yet.

        ``-1`` is used throughout as the "not set yet" marker.
        """
        # Clear the history explicitly: older bitarray releases leave the bits
        # of ``bitarray(n)`` uninitialized, which would seed a new user's
        # answer history with garbage.
        history = bitarray(64, endian='little')
        history.setall(0)

        user = {
            'id'                     : user_id,
            'seen_questions'         : set(),
            'total_question_count'   : 0,
            'total_correct_count'    : 0,
            'correct_streak'         : 0,
            'incorrect_streak'       : 0,
            'last_lecture_timestamp' : -1,
            'last_timestamp'         : 0,
            'total_pqet'             : 0,

            'previous_question_bundle' : -1,
            'previous_lecture_type'    : -1,
            'previous_lecture_id'      : -1,
            # per part / per tag: [last timestamp, correct count, question count, summed pqet]
            'parts'                    : [[-1,0,0,0] for _ in range(7)],
            'tags'                     : [[-1,0,0,0] for _ in range(20)],
            # index 0 holds the most recent answer (see update_user)
            'question_history'         : history,
            'lecture_count'            : 0,
            'questions_since_last_lec' : 0,
            'hardest_question'         : -1,
            'easiest_question'         : -1,
            'smallest_pqet'            : -1,
            'largest_pqet'             : -1,
            'rating'                   : 1500,
            'highest_elo'              : 1500,
            'lowest_elo'               : 1500,
            # timestamps of the three most recent questions, newest first
            'previous_timestamps'      : [-1,-1,-1],
        }
        return user


    def _create_new_content(self, content_id):
        """Build the initial record for a content item; ``get_content`` fills in the metadata."""
        content = {
            'id'            : content_id,
            'total_count'   : 0,
            'correct_count' : 0,
            'tag'           : -1,
            'part'          : -1,
            'bundle_id'     : -1,

            'rating'        : 1500,

            'irt_diff'      : -1,
            'irt_disc'      : -1,
            'tag_diff'      : -1,
            'tag_disc'      : -1,

            # answer counts split by the prior_question_had_explanation flag
            'pqhe_true_correct_count'  : 0,
            'pqhe_true_total_count'    : 0,
            'pqhe_false_correct_count' : 0,
            'pqhe_false_total_count'   : 0,
        }
        return content

    def safe_divide(self, num, denom):
        """Return ``num / denom``, or 0 when ``denom`` is falsy (e.g. 0)."""
        return num/denom if denom else 0

    def get_user(self, user_id):
        """Return the record for ``user_id``, creating and storing it on first use."""
        user = self.users.get(user_id, None)
        if user is None:
            user = self._create_new_user(user_id)
            self.users[user_id] = user
        return user

    def get_content(self, content_id):
        """Return the record for ``content_id``, creating and storing it on first use.

        A new record is populated from the module-level lookup tables
        (``question_tags``, ``question_parts``, ``question_bundles``,
        ``item_difficulty``, ``item_discrimination``, ``kmeans_difficulty``,
        ``kmeans_discrimination``) and their ``avg_*`` fallbacks, so those
        must be defined before the first call for an unseen id.
        """
        content = self.content.get(content_id, None)
        if content is None:
            content = self._create_new_content(content_id)
            content['tag']       = question_tags.get(content_id, -1)
            content['part']      = question_parts.get(content_id, -1)
            content['bundle_id'] = question_bundles.get(content_id, -1)
            content['irt_diff']  = item_difficulty.get(content_id, avg_item_difficulty)
            content['irt_disc']  = item_discrimination.get(content_id, avg_item_discrimination)
            content['tag_diff']  = kmeans_difficulty.get(content['tag'], avg_kmeans_difficulty)
            content['tag_disc']  = kmeans_discrimination.get(content['tag'], avg_kmeans_discrimination)

            self.content[content_id] = content
        return content

    def has_seen_content(self, user, content):
        """Return 1 if the user already saw this content, else 0.

        Side effect: on a first sighting the content id is recorded, so the
        next call for the same pair returns 1.
        """
        if content['id'] not in user['seen_questions']:
            user['seen_questions'].add(content['id'])
            return 0
        return 1

    def update_user(self, user, content, answer, timestamp, pqet, ct_id, tcid, bundle_id):
        """Fold one interaction into the user's (and, for ELO, the content's) record.

        Parameters
        ----------
        user, content : dict
            Records obtained from ``get_user`` / ``get_content``.
        answer : 0 or 1
            Whether the question was answered correctly (ignored for lectures).
        timestamp : number
            Time of the interaction.
        pqet : number
            Prior question elapsed time; ``-1`` means "missing".
        ct_id : number
            Content type; any value other than 0 is handled as a lecture.
        tcid
            Task container id (accepted for interface symmetry, not used here).
        bundle_id
            Bundle of the current question.
        """
        if ct_id != 0:
            # Lecture: only the lecture-related fields change. The lecture
            # type comes from the module-level ``lectures`` lookup.
            user['last_lecture_timestamp'] = timestamp
            user['last_timestamp']         = timestamp
            user['previous_lecture_type']  = lectures[content['id']]['type_of']
            user['previous_lecture_id']    = content['id']

            user['lecture_count'] += 1
            user['questions_since_last_lec'] = 0
            return

        has_pqet = pqet != -1

        # Answer history: shift by one and store the newest answer at index 0.
        # int() keeps the assignment valid when ``answer`` arrives as a float
        # (rows taken from a float array), which newer bitarray versions reject.
        user['question_history'] = rightshift(user['question_history'], 1)
        user['question_history'][0] = int(answer)

        user['total_correct_count'] += answer
        user['total_question_count'] += 1
        if has_pqet:
            # Skip the -1 "missing" marker so it does not reduce the total.
            user['total_pqet'] += pqet

        if answer:
            user['correct_streak'] += 1
            user['incorrect_streak'] = 0
        else:
            user['correct_streak'] = 0
            user['incorrect_streak'] += 1

        user['previous_question_bundle'] = bundle_id

        # Per-part statistics.
        # NOTE(review): a part of -1 (content missing from question_parts)
        # indexes the last slot here -- confirm unknown content cannot occur.
        part_stats = user['parts'][content['part']]
        part_stats[0] = timestamp
        part_stats[1] += answer
        part_stats[2] += 1
        if has_pqet:
            part_stats[3] += pqet

        # Per-tag statistics (skipped when the content has no tag).
        tag = content['tag']
        if tag != -1:
            tag_stats = user['tags'][tag]
            tag_stats[0] = timestamp
            tag_stats[1] += answer
            tag_stats[2] += 1
            if has_pqet:
                tag_stats[3] += pqet

        user['questions_since_last_lec'] += 1
        user['last_timestamp'] = timestamp

        # Extremes of the content correct ratio among questions this user has met.
        # Note: 'easiest_question' tracks the MINIMUM ratio seen and
        # 'hardest_question' the MAXIMUM; the stored values keep that meaning.
        ccr = self.safe_divide(content['correct_count'],content['total_count'])
        if ccr < user['easiest_question'] or user['easiest_question'] == -1:
            user['easiest_question'] = ccr
        if ccr > user['hardest_question'] or user['hardest_question'] == -1:
            user['hardest_question'] = ccr

        # Extremes of the elapsed time. A missing value (-1) is ignored so it
        # cannot overwrite an already recorded minimum.
        if has_pqet:
            if pqet > user['largest_pqet'] or user['largest_pqet'] == -1:
                user['largest_pqet'] = pqet
            if pqet < user['smallest_pqet'] or user['smallest_pqet'] == -1:
                user['smallest_pqet'] = pqet

        # ELO: the user "wins" against the content on a correct answer.
        user_old_rating = user['rating']
        if answer:
            user['rating'], content['rating'] = elo.rate_1vs1(user_old_rating, content['rating'])
        else:
            content['rating'], user['rating'] = elo.rate_1vs1(content['rating'], user_old_rating)

        if user['rating'] > user['highest_elo']:
            user['highest_elo'] = user['rating']
        if user['rating'] < user['lowest_elo']:
            user['lowest_elo'] = user['rating']

        # Keep the three most recent question timestamps, newest first.
        user['previous_timestamps'] = [timestamp] + user['previous_timestamps'][:-1]


    def update_content(self, content, answer, pqet, ct_id, pqhe):
        """Fold one answered question into the content record and the global totals.

        Lectures (``ct_id != 0``) cause no update. ``pqet`` is accepted for
        interface symmetry and is not used.
        """
        if ct_id != 0:
            # no content updates are performed for a lecture
            return

        content['correct_count'] += answer
        content['total_count'] += 1

        # Counts split by whether the prior question had an explanation.
        if pqhe:
            content['pqhe_true_correct_count'] += answer
            content['pqhe_true_total_count'] += 1
        else:
            content['pqhe_false_correct_count'] += answer
            content['pqhe_false_total_count'] += 1

        self.meta['total_correct_count'] += answer
        self.meta['total_question_count'] += 1

        part_totals = self.meta['parts'][content['part']]
        part_totals[0] += answer
        part_totals[1] += 1

        tag = content['tag']
        if tag != -1:
            tag_totals = self.meta['tags'][tag]
            tag_totals[0] += answer
            tag_totals[1] += 1


    def group_update(self, update_list, answers):
        """Apply ``update_user`` then ``update_content`` for each (interaction, answer) pair.

        ``update_list`` holds tuples
        ``(user, content, timestamp, pqet, ct_id, tcid, bundle_id, pqhe)``
        that are paired positionally with ``answers``.
        """
        for (user,content,timestamp,pqet,ct_id,tcid,bundle_id,pqhe), answer in zip(update_list, answers):
            self.update_user(user, content, answer, timestamp, pqet, ct_id, tcid, bundle_id)
            self.update_content(content, answer, pqet, ct_id, pqhe)

# Utilities

In [1]:
def time_series_split(df, train_size=0.8, sample=0.25):
    """Take the leading fraction of ``df`` and cut it into train / test by row order.

    ``df`` must expose ``nrows`` and two-dimensional ``[rows, cols]`` slicing.
    The first ``sample`` fraction of the rows is kept; of those, the first
    ``train_size`` fraction becomes the train part and the remainder the test
    part. No shuffling takes place.

    Returns
    -------
    (train, test)
    """
    kept_rows = int(df.nrows * sample)
    subset = df[:kept_rows, :]

    boundary = int(subset.nrows * train_size)
    return subset[:boundary, :], subset[boundary:, :]

def create_train_data(df, stats_tracker):
    """Build the feature matrix for ``df`` while replaying it through ``stats_tracker``.

    Rows are processed in order. For every question row the features are read
    from the tracker state *before* that row is folded into the tracker, so a
    row's features only depend on earlier rows. Lecture rows
    (``content_type_id != 0``) only get column 0 filled; the rest stays 0.

    Parameters
    ----------
    df : DataFrame
        Must contain the columns listed in the module-level ``base_cols``.
    stats_tracker : StatsTracker
        Mutated in place: every row updates the user and content records.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(df), len(analysis_columns))``; column k
        corresponds to ``analysis_columns[k]``.
    """
    X = np.zeros((len(df),len(analysis_columns)), dtype=np.float32)

    for i, row in enumerate(tqdm(df[base_cols].values)):
        answer     = row[0]
        timestamp  = int(row[1])*2.77777778e-7  # raw timestamp scaled to hours
        user_id    = int(row[2])
        content_id = int(row[3])
        pqet       = int(row[4])/1000 if not np.isnan(row[4]) else -1  # seconds; -1 = missing
        pqhe       = int(row[5]) if not np.isnan(row[5]) else 0
        ct_id      = row[6]
        tcid       = row[7]

        # get user & content
        user    = stats_tracker.get_user(user_id)
        content = stats_tracker.get_content(content_id)

        X[i, 0] = answer # answered_correctly

        # Derived variables (questions only)
        if ct_id == 0:
            part      = content['part']
            tag       = content['tag']
            user_part = user['parts'][part]
            meta_part = stats_tracker.meta['parts'][part]

            X[i, 1] = timestamp # timestamp => hours
            X[i, 2] = pqet      # prior_question_elapsed_time => seconds
            X[i, 3] = pqhe      # prior_question_had_explanation
            X[i, 4] = tcid      # task_container_id
            X[i, 5] = content['bundle_id'] # bundle_id
            X[i, 6] = stats_tracker.has_seen_content(user, content) # has_seen_question
            X[i, 7] = stats_tracker.safe_divide(user['total_correct_count'], user['total_question_count']) # user_correct_ratio
            X[i, 8] = stats_tracker.safe_divide(content['correct_count'],content['total_count']) # content_correct_ratio
            X[i, 9] = user['correct_streak'] # correct_streak
            X[i,10] = user['incorrect_streak'] # incorrect_streak
            X[i,11] = user['total_question_count'] # total_question_count
            X[i,12] = stats_tracker.safe_divide(meta_part[0], meta_part[1]) # part_correct_ratio
            X[i,13] = timestamp-user['last_timestamp'] # time_since_last_timestamp

            # time_since_last_lecture: -1 marks "no lecture seen yet", the same
            # convention as the other *_diff features below (subtracting the
            # -1 sentinel would yield timestamp + 1).
            last_lecture = user['last_lecture_timestamp']
            X[i,14] = timestamp-last_lecture if last_lecture != -1 else -1

            # tag_correct_ratio; left at 0 when the question has no tag
            if tag != -1:
                meta_tag = stats_tracker.meta['tags'][tag]
                X[i,15] = stats_tracker.safe_divide(meta_tag[0], meta_tag[1])

            X[i,16] = 1 if user['previous_question_bundle'] == content['bundle_id'] else 0 # previous_question_same_bundle
            X[i,17] = (1 if user['correct_streak'] else 0)*(X[i,16]) # previous_correct_and_same_bundle
            X[i,18] = user['previous_lecture_type'] # previous_lecture_type
            X[i,19] = user['previous_lecture_id'] # previous_lecture_id

            # user_part = [last timestamp, correct count, question count, summed pqet]
            if user_part[0] != -1:
                X[i,20] = stats_tracker.safe_divide(user_part[1],user_part[2]) # user_part_correct_ratio
                # average = summed elapsed time / number of questions
                X[i,21] = stats_tracker.safe_divide(user_part[3],user_part[2]) # user_part_avg_elapsed_time
                X[i,22] = timestamp-user_part[0] # user_part_timestamp_diff
            else:
                # first question of this part for the user: fall back to the global part ratio
                X[i,20] = X[i,12]
                X[i,21] = -1
                X[i,22] = -1

            if tag != -1:
                user_tag = user['tags'][tag]
                if user_tag[0] != -1:
                    X[i,23] = stats_tracker.safe_divide(user_tag[1],user_tag[2]) # user_tag_correct_ratio
                    # average = summed elapsed time / number of questions
                    X[i,24] = stats_tracker.safe_divide(user_tag[3],user_tag[2]) # user_tag_avg_elapsed_time
                    X[i,25] = timestamp-user_tag[0] # user_tag_timestamp_diff
                else:
                    # first question of this tag for the user: fall back to the global tag ratio
                    X[i,23] = X[i,15]
                    X[i,24] = -1
                    X[i,25] = -1

            # question_history: the answer bits read as one binary number,
            # divided by the number of answers recorded (capped at 64)
            answered = min(user['total_question_count'], 64)
            X[i,26] = stats_tracker.safe_divide(int(user['question_history'].to01(),2), answered)
            X[i,27] = user['lecture_count'] # lecture_count
            X[i,28] = user['questions_since_last_lec'] # questions_since_last_lec
            X[i,29] = user['hardest_question'] # hardest_question
            X[i,30] = user['easiest_question'] # easiest_question
            X[i,31] = user['smallest_pqet'] # smallest_pqet
            X[i,32] = user['largest_pqet'] # largest_pqet
            X[i,33] = user['rating'] # user_rating
            X[i,34] = content['rating'] # content_rating
            X[i,35] = user['highest_elo'] # highest_elo
            X[i,36] = user['lowest_elo'] # lowest_elo
            X[i,37] = elo.expect(user['rating'], content['rating']) # elo_uc
            X[i,38] = elo.expect(content['rating'], user['rating']) # elo_cu
            X[i,39] = tag # tag
            X[i,40] = content['irt_diff'] # irt_difficulty
            X[i,41] = content['irt_disc'] # irt_discrimination

            # time since each of the user's three most recent questions
            pt1,pt2,pt3 = user['previous_timestamps']
            X[i,42] = -1 if pt1==-1 else timestamp-pt1 # timestamp_n1
            X[i,43] = -1 if pt2==-1 else timestamp-pt2 # timestamp_n2
            X[i,44] = -1 if pt3==-1 else timestamp-pt3 # timestamp_n3

            X[i,45] = content['tag_diff'] # tag_difficulty (irt)
            X[i,46] = content['tag_disc'] # tag_discrimination (irt)

            X[i,47] = stats_tracker.safe_divide(content['pqhe_true_correct_count'], content['pqhe_true_total_count']) # pqhe_true_correct_ratio
            X[i,48] = stats_tracker.safe_divide(content['pqhe_false_correct_count'], content['pqhe_false_total_count']) # pqhe_false_correct_ratio

        # Wrap-up: fold this row into the tracker only after its features were read.
        # update_user must run first, it reads the content counters before they change.
        stats_tracker.update_user(user, content, answer, timestamp, pqet, ct_id, tcid, content['bundle_id'])
        stats_tracker.update_content(content, answer, pqet, ct_id, pqhe)

    return X

# Various Meta Data

In [31]:
# Questions: lookup tables keyed by question_id
questions = pd.read_csv('../riiid_data/questions.csv')[['question_id', 'part', 'tags', 'bundle_id']]

# part, shifted to be zero-based
question_parts = {qid: part_no - 1 for qid, part_no in questions[['question_id', 'part']].values}

# Parse the whitespace-separated tag string of every question into a list of
# ints; a missing tag string (which stringifies to 'nan') becomes [-1].
ptags = [
    [-1 if token == 'nan' else int(token) for token in str(raw_tags).split()]
    for raw_tags in questions.tags
]
questions.tags = ptags

# Cluster label per question, loaded from a local pickle.
# NOTE: pickle.load executes arbitrary code for untrusted files; only load
# files produced by this project.
with open('../kmeans_labels.pkl', 'rb') as labels_file:
    kmeans = pickle.load(labels_file)

def f(x, y):
    # label of question x, or -1 when its first tag y is the "missing" marker
    return -1 if y == -1 else kmeans[x]

question_tags = {qid: f(qid, tag_list[0]) for qid, tag_list in questions[['question_id', 'tags']].values}
question_bundles = {qid: bundle for qid, bundle in questions[['question_id', 'bundle_id']].values}


# Lectures: lecture id -> {'tag', 'part', 'type_of'}, with type_of stored as its categorical code
lectures = pd.read_csv('../riiid_data/lectures.csv')
lectures = lectures.assign(type_of=pd.Categorical(lectures['type_of']).codes)
lectures = {
    lecture_id: {'tag': lecture_tag, 'part': lecture_part, 'type_of': type_code}
    for lecture_id, lecture_tag, lecture_part, type_code in lectures.values
}


# IRT results
# Each file holds rows of (id, difficulty, discrimination). Load every file
# once and reuse the rows for both lookups instead of deserializing it twice.
# NOTE: allow_pickle=True unpickles arbitrary objects; only use it on files
# produced by this project.
_item_rows   = np.load('../item_array.pkl', allow_pickle=True)
_kmeans_rows = np.load('../irt_kmeans_array.pkl', allow_pickle=True)

item_difficulty       = {int(i):dif for i,dif,dis in _item_rows}
item_discrimination   = {int(i):dis for i,dif,dis in _item_rows}
kmeans_difficulty     = {int(i):dif for i,dif,dis in _kmeans_rows}
kmeans_discrimination = {int(i):dis for i,dif,dis in _kmeans_rows}

# Averages, used as the fallback for ids missing from the lookups
avg_item_difficulty       = np.mean(list(item_difficulty.values()))
avg_item_discrimination   = np.mean(list(item_discrimination.values()))
avg_kmeans_difficulty     = np.mean(list(kmeans_difficulty.values()))
avg_kmeans_discrimination = np.mean(list(kmeans_discrimination.values()))