In [1]:
import pandas as pd
import numpy as np
import os
import json
import torch
from torch import nn
from ast import literal_eval

In [104]:
# Convert a string holding a Python literal (e.g. "[1, 2]") into the object it describes.
def str_to_list(x):
    """Parse ``x`` with ``ast.literal_eval`` and return the resulting object.

    Args:
        x: Usually a string containing a Python literal such as a list.

    Returns:
        The parsed object, or ``None`` when ``x`` is a null value or cannot
        be parsed as a literal.
    """
    try:
        return literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # These are the errors literal_eval raises for malformed or non-string
        # input (None and NaN included). Anything else, such as
        # KeyboardInterrupt, is deliberately allowed to propagate.
        return None
    
# Return only the "key" value of every dictionary in a list.
def dic_to_list(x):
    """Collect ``entry["key"]`` for each entry of ``x``.

    Args:
        x: An iterable of dictionaries that each have a ``"key"`` item.

    Returns:
        A list of the ``"key"`` values in order, or ``None`` when ``x`` is
        empty, is not iterable (e.g. ``None`` or NaN), or holds an entry
        without a usable ``"key"`` item.
    """
    try:
        keys = [entry["key"] for entry in x]
    except (TypeError, KeyError):
        # TypeError: x is not iterable or an entry is not subscriptable by str.
        # KeyError: an entry is a dict without "key".
        return None
    # An empty result is reported as None, like every other failure case.
    return keys or None
    
def numerize(tp, profile2id, show2id):
    """Translate handles and problem ids into their integer indices.

    Args:
        tp: DataFrame with 'handle' and 'solved_problem' columns.
        profile2id: Mapping from a handle to its integer user index.
        show2id: Mapping from a problem id to its integer item index.

    Returns:
        DataFrame with columns 'uid' and 'sid', one row per row of ``tp``.
        A value missing from either mapping raises ``KeyError``.
    """
    encoded = {
        'uid': tp['handle'].apply(lambda handle: profile2id[handle]),
        'sid': tp['solved_problem'].apply(lambda problem: show2id[problem]),
    }
    return pd.DataFrame(data=encoded, columns=['uid', 'sid'])

def split_train_test_proportion(data, test_prop=0.2):
    '''
    Split each user's rows into a training part and a held-out part.

    Rows are grouped by 'handle'. For every user with at least 5 rows,
    ``int(test_prop * n_rows)`` rows are picked at random (fixed seed) and
    placed in the held-out part; the rest go to the training part. Users
    with fewer than 5 rows contribute all of their rows to training.

    Args:
        data: DataFrame with a 'handle' column.
        test_prop: Fraction of each eligible user's rows to hold out
            (default 0.2, i.e. an 8:2 split).

    Returns:
        (data_tr, data_te): two DataFrames with the same columns as ``data``.
        Either one is an empty frame when no rows fall into it.
    '''
    # A private generator with the fixed seed gives a reproducible split
    # without reseeding NumPy's global random state for the rest of the notebook.
    rng = np.random.RandomState(98765)
    tr_list, te_list = [], []

    for _, group in data.groupby('handle'):
        n_items_u = len(group)

        if n_items_u < 5:
            # Too few rows to hold anything out.
            tr_list.append(group)
            continue

        # Boolean mask over this user's rows; True marks a held-out row.
        held_out = np.zeros(n_items_u, dtype='bool')
        chosen = rng.choice(n_items_u, size=int(test_prop * n_items_u), replace=False)
        held_out[chosen] = True

        tr_list.append(group[~held_out])
        te_list.append(group[held_out])

    # pd.concat raises on an empty list, so fall back to an empty frame that
    # keeps the input's columns.
    data_tr = pd.concat(tr_list) if tr_list else data.iloc[0:0].copy()
    data_te = pd.concat(te_list) if te_list else data.iloc[0:0].copy()

    return data_tr, data_te

In [105]:
# Read the three raw CSV files.
df_problems = pd.read_csv('../rawdata/problems.csv', encoding='utf-8')
df_users = pd.read_csv('../rawdata/users.csv', encoding='utf-8')
df_problems_solved = pd.read_csv('../rawdata/solved_problems.csv')

# Solved-problem lists are stored as strings; parse them back into lists.
df_problems_solved['solved_problem'] = df_problems_solved['solved_problem'].apply(str_to_list)

# Tags are stored as strings too: parse them, then keep only each entry's "key" value.
df_problems['tags'] = df_problems['tags'].apply(str_to_list).apply(dic_to_list)

In [106]:
# users_drop = ['bio', 'badgeId', 'backgroundId', 'profileImageUrl', 'voteCount', 'classDecoration', 'ratingByVoteCount', 
#              'coins', 'stardusts', 'joinedAt', 'bannedUntil', 'proUntil', 'isRival', 'isReverseRival']
# ban_users = pd.concat([df_users[(df_users.solvedCount == 0) & (df_users.rating !=0)],df_users[(df_users.solvedCount != 0) & (df_users.rating ==0)]]).index
# df_users = df_users.drop(columns=users_drop, axis=1)
# df_users = df_users.drop(index=ban_users).reset_index(drop=True)
# df_users

In [107]:
# Columns removed from the problem table (each listed once).
problems_drop = ['titles', 'isPartial', 'votedUserCount', 'sprout',
                 'givesNoRating', 'metadata', 'isLevelLocked']

# Hand-picked problem ids to exclude: five contiguous runs of ids.
gudegi = [
    *range(24900, 24912),
    *range(21292, 21300),
    *range(18821, 18837),
    *range(17106, 17121),
    *range(15629, 15644),
]

df_problems = df_problems.drop(columns=problems_drop)

# Keep a problem only if it is not flagged unsolvable or unofficial, is not in
# the exclusion list, and has tags. `.ne(False)` mirrors the original
# "drop rows where the flag == False" semantics (missing flags are kept).
keep = (
    df_problems['isSolvable'].ne(False)
    & df_problems['official'].ne(False)
    & ~df_problems['problemId'].isin(gudegi)
    & df_problems['tags'].notnull()
)
df_problems = df_problems[keep].reset_index(drop=True)
df_problems

Unnamed: 0,problemId,titleKo,isSolvable,acceptedUserCount,level,averageTries,official,tags
0,1000,A+B,True,245074,1,2.4848,True,"[implementation, arithmetic, math]"
1,1001,A-B,True,208618,1,1.4197,True,"[implementation, arithmetic, math]"
2,1002,터렛,True,32966,8,4.4892,True,"[case_work, geometry, math]"
3,1003,피보나치 함수,True,44843,8,3.0853,True,[dp]
4,1004,어린 왕자,True,13464,8,2.1931,True,"[geometry, math]"
...,...,...,...,...,...,...,...,...
19112,28228,Parking Party,True,2,10,1.0000,True,"[greedy, implementation]"
19113,28229,Ammunition Storage,True,1,19,5.0000,True,"[binary_search, data_structures, multi_segtree..."
19114,28233,Magic with Cards,True,4,10,1.5000,True,"[bfs, graphs, graph_traversal]"
19115,28246,돌 가져가기 게임,True,7,23,1.7143,True,"[dp, game_theory, knapsack]"


In [108]:
# One row per (handle, solved problem); rows with missing values are dropped.
temp = (
    df_problems_solved
    .explode('solved_problem')
    .dropna()
    .reset_index(drop=True)
    .astype({'handle': 'str', 'solved_problem': 'int'})
)

# Keep only problems that survived the problem-table filtering.
is_known_problem = temp['solved_problem'].isin(df_problems.problemId.values)
temp = temp[is_known_problem].reset_index(drop=True)

# Problem ids are handled as strings from here on.
temp['solved_problem'] = temp['solved_problem'].astype(str)

# Collapse back to one row per handle holding the list of solved problem ids.
# (An earlier variant joined the ids into one comma-separated string instead.)
temp = temp.groupby('handle')['solved_problem'].apply(list).to_frame().reset_index()
temp

Unnamed: 0,handle,solved_problem
0,000000,"[2557, 15552, 10430, 8393, 1330, 2739, 1008, 2..."
1,0000000000,"[13324, 1517, 1546, 2693, 25591, 27110, 1644, ..."
2,000000hj,"[10172, 1330, 9653, 4344, 18108, 1110, 15552, ..."
3,00000133,[2557]
4,0000064,"[10886, 8958, 10718, 11653, 1629, 10156, 2304,..."
...,...,...
115665,zzzzz9887,"[10171, 9498, 2750, 1000, 2959, 2437, 10825, 1..."
115666,zzzzzz,"[1212, 2775, 2178, 10996, 11654, 23803, 3052, ..."
115667,zzzzzzzbob,"[11720, 2557, 1152, 10951, 10430, 1330, 1008, ..."
115668,zzzzzzzz,"[2751, 1463, 2609, 1065, 1934, 11047, 10250, 3..."


In [109]:
# Distinct handles, in order of first appearance.
unique_uid = temp['handle'].unique()
print('len(unique_uid): ', len(unique_uid))
print("(BEFORE) unique_uid: ", unique_uid)

# Shuffle the handles with a fixed seed so the user split is reproducible.
np.random.seed(2023)
idx_perm = np.random.permutation(len(unique_uid))
unique_uid = unique_uid[idx_perm]
print("(AFTER) unique_uid: ", unique_uid)

len(unique_uid):  115670
(BEFORE) unique_uid:  ['000000' '0000000000' '000000hj' ... 'zzzzzzzbob' 'zzzzzzzz' 'zzzzzzzzu2']
(AFTER) unique_uid:  ['minieyes85' 'ljw93072' 'design20458' ... 'jun991207' 'jhysym' 'dl45664']


In [110]:
# 10% of the shuffled users go to validation and another 10% to test.
n_users = unique_uid.size
n_heldout_users = int(n_users * 0.1)

# Positions in the shuffled array where the validation and test users begin.
val_start = n_users - 2 * n_heldout_users
te_start = n_users - n_heldout_users

tr_users = unique_uid[:val_start]
val_users = unique_uid[val_start:te_start]
te_users = unique_uid[te_start:]

print("train데이터에 사용할 사용자 수: ", len(tr_users))
print("valid데이터에 사용할 사용자 수: ", len(val_users))
print("test데이터에 사용할 사용자 수: ", len(te_users))

train데이터에 사용할 사용자 수:  92536
valid데이터에 사용할 사용자 수:  11567
test데이터에 사용할 사용자 수:  11567


In [111]:
# Rows of the training users, expanded to one row per (handle, solved problem).
tr_plays = temp[temp['handle'].isin(tr_users)].explode('solved_problem')
tr_plays

Unnamed: 0,handle,solved_problem
0,000000,2557
0,000000,15552
0,000000,10430
0,000000,8393
0,000000,1330
...,...,...
115668,zzzzzzzz,1969
115668,zzzzzzzz,2441
115668,zzzzzzzz,10951
115668,zzzzzzzz,10869


In [4]:
# Every distinct problem id that appears in any user's solved list.
unique_sid = pd.unique(temp['solved_problem'].explode())

# Output directory for the generated dataset files. exist_ok avoids the
# check-then-create race and makes the cell safe to re-run.
pro_dir = os.path.join('./', 'dataset')
os.makedirs(pro_dir, exist_ok=True)

# Map each problem id / handle to its position in unique_sid / unique_uid.
item2id = {sid: i for i, sid in enumerate(unique_sid)}
user2id = {pid: i for i, pid in enumerate(unique_uid)}

NameError: name 'temp' is not defined

In [113]:
# Save the id mappings as JSON.
with open(os.path.join(pro_dir, 'item2id.json'), 'w', encoding="utf-8") as f:
    json.dump(item2id, f, ensure_ascii=False, indent="\t")

with open(os.path.join(pro_dir, 'user2id.json'), 'w', encoding="utf-8") as f:
    json.dump(user2id, f, ensure_ascii=False, indent="\t")

# Save the raw ids one per line. The encoding is set explicitly so these files
# do not depend on the platform default, matching the JSON files above.
with open(os.path.join(pro_dir, 'unique_sid.txt'), 'w', encoding="utf-8") as f:
    f.writelines('%s\n' % sid for sid in unique_sid)

with open(os.path.join(pro_dir, 'unique_uid.txt'), 'w', encoding="utf-8") as f:
    f.writelines('%s\n' % uid for uid in unique_uid)

# Validation and test users are each split into a "tr" part used as model
# input and a "te" part used to check the answers.
print('Data Split Start!')
val_plays = temp.loc[temp['handle'].isin(val_users)].explode('solved_problem')
val_plays_tr, val_plays_te = split_train_test_proportion(val_plays)

te_plays = temp.loc[temp['handle'].isin(te_users)].explode('solved_problem')
te_plays_tr, te_plays_te = split_train_test_proportion(te_plays)

# Replace handles and problem ids with their integer indices.
train_data = numerize(tr_plays, user2id, item2id)
vad_data_tr = numerize(val_plays_tr, user2id, item2id)
vad_data_te = numerize(val_plays_te, user2id, item2id)
test_data_tr = numerize(te_plays_tr, user2id, item2id)
test_data_te = numerize(te_plays_te, user2id, item2id)

# Write every split to its CSV file.
for filename, frame in (
    ('train.csv', train_data),
    ('validation_tr.csv', vad_data_tr),
    ('validation_te.csv', vad_data_te),
    ('test_tr.csv', test_data_tr),
    ('test_te.csv', test_data_te),
):
    frame.to_csv(os.path.join(pro_dir, filename), index=False)
print("Data Split Done!")


Data Split Start!
Data Split Done!


In [197]:
# item_tag_emb
print("Item tag emb start!")

# Every distinct tag that appears on any problem.
set_tags = set()
for tags in df_problems['tags'].dropna().values:
    set_tags.update(tags)

# One row per (problem, tag), restricted to problems present in unique_sid.
df_tags = df_problems[['problemId', 'tags']]
df_tags = df_tags.explode('tags').dropna().reset_index(drop=True)
df_tags = df_tags[df_tags['problemId'].astype(str).isin(unique_sid)].reset_index(drop=True)

# NOTE(review): this embedding table is freshly initialised and never trained
# here, and no torch seed is set in this cell, so the vectors (and the CSV
# written below) differ between runs unless torch is seeded elsewhere.
emb = nn.Embedding(len(set_tags), 300)

# Tags ordered by frequency; a tag's position is its row in the embedding table.
tag_order = df_tags['tags'].value_counts().index.values
tag_emb = pd.DataFrame(tag_order, columns=['tags'])
dict_tag_idx = {tag: idx for idx, tag in enumerate(tag_order)}

# Look each tag's vector up once and record it in both containers.
list_emb = []
dict_tag_emb = dict()
for tag in tag_order:
    vector = emb(torch.tensor(dict_tag_idx[tag])).detach().numpy()
    list_emb.append(vector)
    dict_tag_emb[tag] = vector

# Attach each row's tag vector to df_tags as an array in the 'emb' column.
df_tag_emb = pd.concat([tag_emb, pd.DataFrame(list_emb)], axis=1)
df_tags2 = pd.merge(df_tags, df_tag_emb, on='tags', how='left')
tag2emb = df_tags2.iloc[:, 2:].values
df_tags['emb'] = list(tag2emb)

# Average the tag vectors of each problem. Grouping once replaces scanning
# the whole frame for every problem id.
problem_ids, total = [], []
for problem_id, tag_vectors in df_tags.groupby('problemId', sort=False)['emb']:
    problem_ids.append(problem_id)
    total.append(np.mean(tag_vectors))

item_genre_emb = pd.DataFrame(total)
item_genre_emb.index = problem_ids

# Re-index the rows by item index (via item2id) and sort them by that index.
item_genre_emb = item_genre_emb.reset_index()
item_genre_emb['index'] = item_genre_emb['index'].astype(str).apply(lambda x: item2id[x])
item_genre_emb = item_genre_emb.set_index('index')
item_genre_emb = item_genre_emb.sort_index()

# Transpose so that rows are embedding dimensions and columns are items.
item_genre_emb = item_genre_emb.T
print(item_genre_emb.shape)

item_genre_emb.to_csv(os.path.join(pro_dir, 'item_tag_emb.csv'), index=False)
print('Item tag emb Done!')

Item tag emb start!
(300, 19093)
Item tag emb Done!


In [4]:
# Start every model's recorded score at 0 and save the table as JSON.
model_score = {
    'recvae': 0,
    'multivae': 0,
    'multidae': 0,
}
score_path = os.path.join(pro_dir, 'model_score.json')
with open(score_path, 'w', encoding="utf-8") as f:
    json.dump(model_score, f, ensure_ascii=False, indent="\t")