In [1]:
import csv, json
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix

# Input files: the student/question interaction log and the question -> question-set table.
# NOTE(review): both are absolute paths on the author's machine; adjust before re-running elsewhere.
INTERACTIONS_CSV = '/Users/jilljenn/code/qna/data/factor-analysis-data.csv'
QMATRIX_CSV = '/Users/jilljenn/code/qna/data/q-info.csv'

df = pd.read_csv(INTERACTIONS_CSV)
qmatrix = pd.read_csv(QMATRIX_CSV)

In [2]:
# Preview the first rows of the raw interaction log.
df.head()

Unnamed: 0,stu_id,q_txt_id,q_seen_timestamp,answer_timestamp,is_correct
0,1011,1076,1472651060,1472651117,1
1,1011,1077,1472651117,1472651126,0
2,1011,1077,1472651126,1472651138,1
3,1011,1078,1472651138,1472651144,1
4,1011,1079,1472651144,1472651149,1


In [3]:
# Distinct raw identifiers, in order of first appearance.
# Questions are collected from both the interaction log and the Q-matrix file,
# so every question known to either source receives an index.
students = df['stu_id'].unique()
questions = pd.concat([df['q_txt_id'], qmatrix['q_txt_id']]).unique()

USER_NUM = len(students)
ITEM_NUM = len(questions)

# Raw id -> contiguous 0-based index.
encode_stu = {stu: idx for idx, stu in enumerate(students)}
encode_q = {question: idx for idx, question in enumerate(questions)}

In [4]:
# Add the contiguous user/item indices next to the raw identifiers.
for encoded_col, raw_col, encoder in (
    ('user_id', 'stu_id', encode_stu),
    ('item_id', 'q_txt_id', encode_q),
):
    df[encoded_col] = df[raw_col].map(encoder)

In [5]:
# Check that the encoded user_id / item_id columns were appended.
df.head()

Unnamed: 0,stu_id,q_txt_id,q_seen_timestamp,answer_timestamp,is_correct,user_id,item_id
0,1011,1076,1472651060,1472651117,1,0,0
1,1011,1077,1472651117,1472651126,0,0,1
2,1011,1077,1472651126,1472651138,1,0,1
3,1011,1078,1472651138,1472651144,1,0,2
4,1011,1079,1472651144,1472651149,1,0,3


In [6]:
# Number of distinct question ids across the interaction log and the Q-matrix file.
len(questions)

246

# Q-matrix (question → skill mapping)

In [7]:
# Preview the raw question-set table (one row per qset_id / q_txt_id pair).
qmatrix.head()

Unnamed: 0,qset_id,q_txt_id
0,3,1070
1,3,1071
2,3,1072
3,3,1073
4,3,1074


In [8]:
# Each distinct question set (qset_id) is treated as one skill.
skills = qmatrix['qset_id'].unique()
SKILL_NUM = len(skills)
encode_skill = {skill: idx for idx, skill in enumerate(skills)}

# Translate both columns of the table to contiguous indices.
qmatrix['item_id'] = qmatrix['q_txt_id'].map(encode_q)
qmatrix['skill_id'] = qmatrix['qset_id'].map(encode_skill)

In [9]:
# Check that the encoded item_id / skill_id columns were appended.
qmatrix.head()

Unnamed: 0,qset_id,q_txt_id,item_id,skill_id
0,3,1070,14,0
1,3,1071,15,0
2,3,1072,16,0
3,3,1073,17,0
4,3,1074,18,0


In [10]:
# Build the sparse item x skill incidence matrix (the Q-matrix).
# The data vector is sized from the number of (item_id, skill_id) pairs rather
# than from ITEM_NUM, so construction still works when an item belongs to
# several skills or when some items seen in `df` have no row in `qmatrix`.
pairs = qmatrix[['item_id', 'skill_id']].drop_duplicates()
rows = pairs['item_id']
cols = pairs['skill_id']
# Duplicate pairs are dropped above because the COO -> CSR conversion sums
# repeated coordinates, which would otherwise yield entries greater than 1.
sp_qmatrix = coo_matrix(
    (np.ones(len(pairs), dtype=int), (rows, cols)),
    shape=(ITEM_NUM, SKILL_NUM),
).tocsr()

In [11]:
from scipy.sparse import save_npz
import os.path

# Output directory for every artefact exported by this notebook.
# NOTE(review): absolute path on the author's machine; adjust before re-running elsewhere.
DATA_DIR = '/Users/jilljenn/code/TF-recomm/data/berkeley0/'

qmatrix_path = os.path.join(DATA_DIR, 'qmatrix.npz')
save_npz(qmatrix_path, sp_qmatrix)

# Number of attempts

In [12]:
# Sanity check: column index (skill) of the first stored entry in row 14 of the Q-matrix.
sp_qmatrix[14].indices[0]

0

In [17]:
from collections import Counter

# For every attempt, record how many correct / incorrect answers the same
# student had already given on the same skill *before* that attempt.
# Rows are processed in the order they appear in `df`.
acc_wins, acc_fails = Counter(), Counter()
nb_wins, nb_fails = [], []

attempts = df[['user_id', 'item_id', 'is_correct']].values
for user_id, work_id, outcome in attempts:
    # Only the first skill stored for the item is used.
    skill_id = sp_qmatrix[work_id].indices[0]
    key = (user_id, skill_id)

    # Snapshot the running totals first, then update them with this outcome.
    nb_wins.append(acc_wins[key])
    nb_fails.append(acc_fails[key])

    tally = acc_wins if outcome == 1 else acc_fails
    tally[key] += 1

In [18]:
# Attach the per-skill prior win / fail counts as new columns.
df['nb_wins'], df['nb_fails'] = nb_wins, nb_fails

In [19]:
# Preview the frame with the attempt-count columns.
# NOTE(review): the saved output below lists `wins`/`fails` rather than `nb_wins`/`nb_fails`,
# which suggests it came from a different execution order — confirm with Restart & Run All.
df.head()

Unnamed: 0,stu_id,q_txt_id,q_seen_timestamp,answer_timestamp,is_correct,user_id,item_id,wins,fails
0,1011,1076,1472651060,1472651117,1,0,0,0.0,0.0
1,1011,1077,1472651117,1472651126,0,0,1,0.0,0.0
2,1011,1077,1472651126,1472651138,1,0,1,0.0,1.0
3,1011,1078,1472651138,1472651144,1,0,2,0.0,0.0
4,1011,1079,1472651144,1472651149,1,0,3,0.0,0.0


In [21]:
# Number of rows in the interaction log.
len(df)

562201

In [14]:
import numpy as np

# Problem dimensions, taken from the encoders built earlier.
nb_users, nb_items, nb_skills = len(encode_stu), len(encode_q), len(encode_skill)

# Dense running totals of past outcomes, per (user, item) and per (user, skill).
count_item_wins = np.zeros((nb_users, nb_items))
count_item_fails = np.zeros((nb_users, nb_items))
count_skill_wins = np.zeros((nb_users, nb_skills))
count_skill_fails = np.zeros((nb_users, nb_skills))

# One entry per attempt: the totals as they stood *before* that attempt.
all_skill_wins, all_skill_fails = [], []
all_item_wins, all_item_fails = [], []

for user_id, item_id, outcome in df[['user_id', 'item_id', 'is_correct']].values:
    # Q-matrix row of the item: a 1 x nb_skills sparse indicator of its skills.
    skill_ids = sp_qmatrix[item_id]

    all_item_wins.append(count_item_wins[user_id, item_id])
    all_item_fails.append(count_item_fails[user_id, item_id])

    # Masking the user's skill totals with the indicator row keeps only the
    # counts for the skills this item involves.
    all_skill_wins.append(skill_ids.multiply(count_skill_wins[user_id]))
    all_skill_fails.append(skill_ids.multiply(count_skill_fails[user_id]))

    # Update the running totals only after the snapshots above were taken.
    if outcome == 1:
        item_totals, skill_totals = count_item_wins, count_skill_wins
    else:
        item_totals, skill_totals = count_item_fails, count_skill_fails
    item_totals[user_id, item_id] += 1
    skill_totals[user_id, skill_ids.indices] += 1

In [15]:
# Attach the per-item prior win / fail counts as new columns.
df['wins'], df['fails'] = all_item_wins, all_item_fails

In [16]:
# Export every attempt with its per-item counts as a bare CSV (no header, no index).
export_columns = ['user_id', 'item_id', 'is_correct', 'wins', 'fails']
df[export_columns].to_csv(
    '/Users/jilljenn/code/TF-recomm/data/berkeley0/all.csv',
    index=False,
    header=False,
)

In [17]:
from scipy.sparse import vstack

# Stack the per-attempt sparse rows into one (n_attempts x n_skills) CSR matrix and save it.
stacked_wins = vstack(all_skill_wins)
skill_wins = stacked_wins.tocsr()
save_npz('/Users/jilljenn/code/TF-recomm/data/berkeley0/skill_wins.npz', skill_wins)
skill_wins.shape

(562201, 29)

In [18]:
# Same as for the wins: stack the per-attempt sparse rows into a CSR matrix and save it.
stacked_fails = vstack(all_skill_fails)
skill_fails = stacked_fails.tocsr()
save_npz('/Users/jilljenn/code/TF-recomm/data/berkeley0/skill_fails.npz', skill_fails)
skill_fails.shape

(562201, 29)

# Cross-validation (single random 80/20 train/test split)

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the attempts at random. The split is row-wise: no grouping
# or stratification is applied, so a student's attempts can land on both sides.
# A fixed seed makes the partition, and anything exported from it,
# reproducible across kernel restarts.
SPLIT_SEED = 42
train, test = train_test_split(
    df[['user_id', 'item_id', 'is_correct', 'nb_wins', 'nb_fails']],
    test_size=0.2,
    random_state=SPLIT_SEED,
)

In [21]:
# Sizes of the training and held-out partitions.
len(train), len(test)

(449760, 112441)

In [22]:
# Write the partitions as bare CSVs (no header, no index).
train.to_csv(os.path.join(DATA_DIR, 'train.csv'), header=False, index=False)

# NOTE(review): val.csv and test.csv receive the *same* held-out rows, so any
# model selection done on val.csv is evaluated on data it has already seen.
for held_out_name in ('val.csv', 'test.csv'):
    test.to_csv(os.path.join(DATA_DIR, held_out_name), header=False, index=False)

In [23]:
import yaml

# Dataset dimensions and settings, written next to the exported data as YAML.
config = {
    'USER_NUM': USER_NUM,
    'ITEM_NUM': ITEM_NUM,
    'NB_CLASSES': 2,
    'BATCH_SIZE': 0,
}
config_path = os.path.join(DATA_DIR, 'config.yml')
with open(config_path, 'w') as f:
    # Block style (one key per line) rather than inline flow style.
    f.write(yaml.dump(config, default_flow_style=False))