In [None]:
import pandas as pd
import re
import numpy as np
from sklearn.utils import shuffle
from tqdm import tqdm
import random
from itertools import product
import pickle

# 1. 데이터 생성 및 전처리 과정

## 1-1 기존 데이터 (user_course에 학과, 과목 삭제 조건 적용)

In [None]:
# Containers filled by the data-creation loop and the major/category join below.
user_major = {}       # user -> list of "<col1>_<col2>" major keys
category_course = {}  # category -> list of course codes
major_category = pd.read_csv('./data/major_category.csv', encoding='CP949', header=None)
major_course = {}     # major -> list of course codes
userlist = []
courselist = []
elelist = []
ele_course = []

# Course-number pattern: three capital letters followed by digits.
# Written as a raw string so `\d` is a regex token, not an invalid string escape.
coursere = re.compile(r'^[A-Z]{3}\d*')

# Category lists: one entry per line, only the first tab-separated field is used.
# The newline is stripped first so a line without a tab does not keep its "\n".
with open('./data/catlist.txt', encoding='UTF-8') as f:
    valid_category = [line.rstrip('\n').split("\t")[0] for line in f]
with open('./data/elelist.txt', encoding='UTF-8') as f:
    elelist = [line.rstrip('\n').split("\t")[0] for line in f]
with open('./data/deluniv.txt', encoding='UTF-8') as f:
    # Strip only the newline: slicing off the last character unconditionally
    # truncates the final entry when the file does not end with a newline.
    deluniv = [line.rstrip('\n') for line in f]
    
# data creation
user_course = pd.read_csv('./data/course_userdata.csv', low_memory=False, header=None)

# Set views of the filter lists so the per-row membership tests are O(1).
_deluniv_set = set(deluniv)
_valid_category_set = set(valid_category)
_elelist_set = set(elelist)
# Set mirrors of the output lists, seeded from their current content so that
# re-running this cell de-duplicates against what is already collected.
_seen_courses = set(courselist)
_seen_ele_courses = set(ele_course)
_seen_by_category = {cat: set(courses) for cat, courses in category_course.items()}

for row in user_course.values:
    # Skip excluded universities (col 1) and categories not in the valid list (col 4).
    if row[1] in _deluniv_set or str(row[4]) not in _valid_category_set:
        continue
    # Skip rows whose course cell (col 5) is missing or does not match the pattern.
    if not isinstance(row[5], str):
        continue
    course_match = coursere.match(row[5])
    if course_match is None:
        continue

    user = row[0]
    major = row[1] + '_' + row[2]
    category = row[4]
    course = course_match.group()  # matched prefix of the course cell

    # user -> majors (first sighting of a user also registers it in userlist)
    if user not in user_major:
        user_major[user] = [major]
        userlist.append(user)
    elif major not in user_major[user]:
        user_major[user].append(major)

    # category -> distinct courses, in first-seen order
    seen_in_category = _seen_by_category.setdefault(category, set())
    if course not in seen_in_category:
        seen_in_category.add(course)
        category_course.setdefault(category, []).append(course)

    # distinct courses whose category is listed in elelist
    if category in _elelist_set and course not in _seen_ele_courses:
        _seen_ele_courses.add(course)
        ele_course.append(course)

    # distinct courses overall
    if course not in _seen_courses:
        _seen_courses.add(course)
        courselist.append(course)

# Build major -> course list by concatenating the course lists of every
# category paired with that major in major_category (col 0 = major, col 1 = category).
for row in major_category.values:
    courses_in_category = category_course[row[1]]
    # Extend a list owned by major_course. Storing the category's own list and
    # later applying `+=` would extend that shared list in place and leak the
    # courses of one category into category_course (and into other majors).
    major_course.setdefault(row[0], []).extend(courses_in_category)

In [None]:
# Row count of the freshly loaded interaction table, then a preview of its first rows.
print(len(user_course))
user_course.head()

## 1-2 학습을 위한 추가 전처리

In [None]:
# Name the columns so they can be referenced in the filters below.
user_course.columns = ['a','b','c','d','e','f','g','h','i']

# a: drop rows of excluded universities, b: keep rows whose course number matches the pattern,
# c: keep users collected in the first pass, d: keep years 2015, 2016 and 2017
a = ~user_course.b.isin(deluniv)
# na=False: a missing course cell counts as "no match" instead of putting NaN in the mask.
b = user_course.f.str.contains(coursere, na=False)
c = user_course.a.isin(userlist)
d = user_course.g.isin([2015,2016,2017])
# .copy() so the column assignment below writes to an independent frame, not a slice.
user_course = user_course[a & b & c & d].copy()

# Column j: course number without the section suffix, e.g. CEE4402-01 > CEE4402.
# (A previous `str.findall` assignment to j was removed: it was overwritten immediately.)
user_course['j'] = user_course.f.str.split('-').str[0]
user_course = user_course[user_course['j'].isin(courselist)]

# user_20172: students with a record in 2017 semester 2.
# user_left:  students with a record in any other remaining semester (2015, 2016, 2017-1).
# user_fit:   students of user_20172 who also appear in user_left.
_is_20172 = (user_course['g'] == 2017) & (user_course['h'] == 2)
user_20172 = list(set(user_course[_is_20172]['a'].tolist()))
user_left = list(set(user_course[~_is_20172]['a'].tolist()))
# Set lookup keeps the order of user_20172 while avoiding a list scan per student.
_user_left_set = set(user_left)
user_fit = [student for student in user_20172 if student in _user_left_set]

# Keep only the rows of students in user_fit.
user_course = user_course[user_course.a.isin(user_fit)]

# Courses taken in 2017 semester 2 by the remaining students (course_20172_list).
# The mask is recomputed because user_course was just filtered.
course_20172_list = list(set(user_course[(user_course['g'] == 2017) & (user_course['h'] == 2)]['j'].tolist()))

print("17-2에 수업 들었던 학생 : {}".format(len(user_20172)))
print("15,16,17-1에 수업 들었던 학생 : {}".format(len(user_left)))
print("17년 2월에 수업 들었던 학생 중 나머지 학기에도 들었던 학생 : {}".format(len(user_fit)))

In [None]:
# Row count of user_course after filtering, then a preview.
# (The original note recorded 1920266 > 285482 rows, presumably before/after filtering on the author's data.)
print(len(user_course))
user_course.head()

# 2. NCF 학습

## 2-1 학습 데이터 생성

# Integer codes for students (column a) and course numbers (column j),
# taken from the pandas categorical encoding of each column.
for _code_col, _source_col in (('user_id', 'a'), ('item_id', 'j')):
    user_course[_code_col] = user_course[_source_col].astype("category").cat.codes

# Columns exported for the node2vec step, mapped to their descriptive names.
_node2vec_names = {
    'a': 'user',
    'b': 'major1',
    'c': 'major2',
    'd': 'item_name',
    'j': 'item_num',
    'i': 'item_major',
    'g': 'year',
    'h': 'semester',
    'user_id': 'user_id',
    'item_id': 'item_id',
}
df_to_node2vec = (
    user_course[list(_node2vec_names)]
    .rename(columns=_node2vec_names)
    .reset_index(drop=True)
)

##### 데이터 전달
# Write the frame to disk as a CP949-encoded CSV (the index is written as the first column).
df_to_node2vec.to_csv('./data/df_to_node2vec.csv',encoding="CP949")

In [None]:
# Interaction table: student, course number, year and semester of every remaining row.
user_course_inter_df = (
    user_course[['a','j','g','h']]
    .rename(columns={'a': 'user', 'j': 'item', 'g': 'year', 'h': 'sem'})
    .reset_index(drop=True)
)

# 2017 semester 2 is held out as the test set; every other row is training data.
_is_held_out = (user_course_inter_df['year'] == 2017) & (user_course_inter_df['sem'] == 2)
user_course_train = user_course_inter_df[~_is_held_out].copy()
user_course_test = user_course_inter_df[_is_held_out].copy()

# Add integer user_id / item_id columns (categorical codes computed on the training rows only).
for _id_col, _name_col in (('user_id', 'user'), ('item_id', 'item')):
    user_course_train[_id_col] = user_course_train[_name_col].astype("category").cat.codes

# Sorted distinct ids present in the training set.
users = list(np.sort(user_course_train.user_id.unique()))
items = list(np.sort(user_course_train.item_id.unique()))

print("# of train users : {}".format(len(users)))
print("# of train items : {}".format(len(items)))
print("# of interactions : {}".format(len(user_course_inter_df)))

print("train inter : {}".format(len(user_course_train)))
print("test inter : {}".format(len(user_course_test)))

In [None]:
# Lookup tables between the integer ids and the original names of the training rows.
_train_user_ids = user_course_train['user_id'].tolist()
_train_user_names = user_course_train['user'].tolist()
_train_item_ids = user_course_train['item_id'].tolist()
_train_item_names = user_course_train['item'].tolist()

user_dict = {uid: name for uid, name in zip(_train_user_ids, _train_user_names)}    # user_id -> user
item_dict = {iid: name for iid, name in zip(_train_item_ids, _train_item_names)}    # item_id -> item
item_dict_T = {name: iid for name, iid in zip(_train_item_names, _train_item_ids)}  # item -> item_id

## 2-2 negative case 생성

In [None]:
import random

In [None]:
# Negative sampling: for every training user draw `neg_num` unseen items per
# training row of that user (rows are counted, so repeated courses count twice).
neg_num=4
neg_full_u = []
neg_full_i = []

# Loop-invariant: the set of all item ids 0..len(items)-1.
full = set(range(len(items)))
# One pass over the training frame instead of two full boolean scans per user.
_train_items_by_user = {
    uid: group.tolist()
    for uid, group in user_course_train.groupby('user_id')['item_id']
}

for u in tqdm(users):
    taken_items = _train_items_by_user[u]
    train_num = len(taken_items)
    asis = set(taken_items)

    candidates = list(full-asis)
    # random.sample raises ValueError when asked for more items than exist,
    # so the request is capped at the number of unseen items.
    sample_size = min(train_num*neg_num, len(candidates))

    temp_neg_u = [u] * sample_size
    temp_neg_i = random.sample(candidates, sample_size)

    neg_full_u.extend(temp_neg_u)
    neg_full_i.extend(temp_neg_i)

## 2-3 train_df 만들기

In [None]:
# Training frame: observed (user, item) pairs labelled 1, followed by the sampled negatives labelled 0.
_positive_count = len(user_course_train)
_negative_count = len(neg_full_u)

u_df = user_course_train['user_id'].tolist() + neg_full_u
i_df = user_course_train['item_id'].tolist() + neg_full_i
p_df = [1] * _positive_count + [0] * _negative_count

print(len(u_df),len(i_df),len(p_df))

train_df = pd.DataFrame({'user_id':u_df, 'item_id':i_df, 'plays':p_df})

## 모델 인풋 생성 + shuffle

In [None]:
# Build the user / item / label lists from the training frame.
rows = train_df['user_id'].astype(int)
cols = train_df['item_id'].astype(int)
values = list(train_df.plays)

uids = np.array(rows.tolist())
iids = np.array(cols.tolist())

user_input, item_input, labels = uids.tolist(), iids.tolist(), values

# Shuffle the three lists with one shared permutation, then turn each into an (n, 1) column array.
user_data_shuff, item_data_shuff, label_data_shuff = (
    np.array(shuffled_column).reshape(-1,1)
    for shuffled_column in shuffle(user_input, item_input, labels)
)

for _column_array in (user_data_shuff, item_data_shuff, label_data_shuff):
    print(len(_column_array))

In [None]:
# Display the training interaction table for inspection.
user_course_train

# Node2Vec embedding 설정

In [None]:
# Attributes of the loaded embedding objects that are useful to inspect interactively:
# node2vec.wv.vectors.shape
# node2vec.wv.vectors
# node2vec.wv.index2entity

# Embedding size: selects the pickle files loaded below
# ('student_emb<emb_size>.pickle', 'course_emb<emb_size>.pickle') and is passed to NeuMF.
emb_size = 64

In [None]:
# Load the student embeddings and reorder them so that row i belongs to user_id i.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted files.
with open('./data/student_emb'+str(emb_size)+'.pickle', 'rb') as f:
    node2vec_u = pickle.load(f)
print(node2vec_u.vectors.shape)

# entity -> first position in index2entity, built once. Calling
# index2entity.index(...) inside the loop rescans the whole list for every user.
# A user without an embedding now raises KeyError (previously ValueError).
_student_position = {}
for _pos, _entity in enumerate(node2vec_u.index2entity):
    _student_position.setdefault(_entity, _pos)

node2vec_u_sort = []

for i in tqdm(range(len(users))):
    # Entities are looked up by the string form of the original user value.
    idx = _student_position[str(user_dict[i])]
    node2vec_u_sort.append(list(node2vec_u.vectors[idx]))

node2vec_u_sort = np.array(node2vec_u_sort,dtype=np.float32)
node2vec_u_sort.shape

In [None]:
# Load the course embeddings and reorder them so that row i belongs to item_id i.
# NOTE(review): pickle.load can execute code embedded in the file; only load trusted files.
with open('./data/course_emb'+str(emb_size)+'.pickle', 'rb') as f:
    node2vec = pickle.load(f)
print(node2vec.wv.vectors.shape)

# entity -> first position in index2entity, built once. Calling
# index2entity.index(...) inside the loop rescans the whole list for every course.
# A course without an embedding now raises KeyError (previously ValueError).
_course_position = {}
for _pos, _entity in enumerate(node2vec.wv.index2entity):
    _course_position.setdefault(_entity, _pos)

node2vec_sort = []

for i in tqdm(range(len(items))):
    idx = _course_position[item_dict[i]]
    node2vec_sort.append(list(node2vec.wv.vectors[idx]))

node2vec_sort = np.array(node2vec_sort,dtype=np.float32)
node2vec_sort.shape

# 모델

In [None]:
from model.NeuMF_node import NeuMF

In [None]:
# Build the NeuMF model from the user/item counts, the embedding size and the
# two reordered node2vec matrices (student rows by user_id, course rows by item_id).
nmf = NeuMF(len(users), len(items), emb_size,node2vec_u_sort ,node2vec_sort)
model=nmf.get_model()

In [None]:
# Show the model summary.
model.summary()

In [None]:
# Train on the shuffled (user, item) -> label columns for 20 epochs with batch size 256.
model.fit([user_data_shuff, item_data_shuff], label_data_shuff, epochs=20,
               batch_size=256, verbose=1)

In [None]:
# model.save('./pretrain/ncf'+str(emb_size)+'-node2.h5')

# Predict

In [None]:
def predict_return(u, K):
    """Return the top-K recommended courses for one training user.

    Every item id the user has no training interaction with is scored by
    `model`. Only courses that are both in the course list of the user's first
    major and present in the test set are kept, ordered by descending score.

    Parameters
    ----------
    u : int
        user_id of a user present in `user_course_train`.
    K : int
        Maximum number of course names to return.

    Returns
    -------
    tuple
        (list of at most K course names, original user value).

    Raises
    ------
    IndexError
        If `u` has no row in `user_course_train`.
    KeyError
        If the user or the user's first major is missing from
        `user_major` / `major_course`.
    """
    # Filter the training frame once and reuse it for the name and the item ids.
    user_rows = user_course_train[user_course_train['user_id']==u]
    user = user_rows['user'].iloc[0]

    # Candidates: all item ids minus the ones already seen in training.
    full = set(range(len(items)))
    asis = set(user_rows['item_id'].tolist())
    pred_cand_list = list(full-asis)

    pred_user = np.array([u] * len(pred_cand_list)).reshape(-1,1)
    pred_cand = np.array(pred_cand_list).reshape(-1,1)

    # Score every candidate.
    predictions = model.predict([pred_user, pred_cand])
    predictions = predictions.flatten().tolist()

    # Candidate ids with their course names and scores.
    pred_cand_list_name = [item_dict[x] for x in pred_cand_list]
    pred_df = pd.DataFrame({'item_id':pred_cand_list,'item':pred_cand_list_name, 'score':predictions})

    # Allowed courses: in the user's first major AND present in the test data.
    # Sets replace the two list scans per candidate.
    allowed_courses = set(major_course[user_major[user][0]]) & set(user_course_test['item'])

    # Final output: allowed candidates ordered by descending score.
    req_df = pred_df[pred_df['item'].isin(allowed_courses)]
    req_list = req_df.sort_values(by='score',ascending=False)['item'].tolist()

    return(req_list[0:K], user)

In [None]:
# Example: top-10 recommendations for user_id 1.
predict_return(1,10)

# test case load

In [None]:
def asis_return(u):
    """Return the ground-truth test courses of one training user.

    The user's rows in `user_course_test` are restricted to courses found in
    the course list of the user's first major.

    Parameters
    ----------
    u : int
        user_id of a user present in `user_course_train`.

    Returns
    -------
    tuple
        (list of course names, original user value).
    """
    names_for_id = user_course_train.loc[user_course_train['user_id'] == u, 'user']
    user = names_for_id.iloc[0]

    # Course list of the user's first major.
    first_major_courses = major_course[user_major[user][0]]

    # Test rows of this user, kept only when the course belongs to that major.
    held_out_rows = user_course_test[user_course_test['user'] == user]
    in_major = held_out_rows['item'].isin(first_major_courses)

    return (held_out_rows.loc[in_major, 'item'].tolist(), user)

In [None]:
# Takes only a user_id; example for user_id 1.
asis_return(1)

# Evaluate

In [None]:
import numpy as np

def recall_(p,r):
    """Recall: number of hits divided by the number of ground-truth items.

    Parameters
    ----------
    p : sequence of 0/1
        Hit flags of the predicted items.
    r : sized collection
        Ground-truth items.

    Returns
    -------
    float or int
        sum(p) / len(r), or 0 when `r` is empty.
    """
    relevant_count = len(r)
    # Explicit empty check instead of a bare `except:`, which also hid
    # unrelated errors and KeyboardInterrupt.
    if relevant_count == 0:
        return 0
    return int(sum(p)) / relevant_count

def precision_at_k(r, k):
    """Fraction of non-zero relevance scores among the first k entries of r.

    Parameters
    ----------
    r : array-like
        Relevance scores in rank order; any non-zero value counts as relevant.
    k : int
        Cut-off, must be at least 1.

    Raises
    ------
    AssertionError
        If k < 1.
    ValueError
        If r has fewer than k entries.
    """
    assert k >= 1
    top_k = np.asarray(r)[:k]
    is_relevant = top_k != 0
    if is_relevant.size == k:
        return np.mean(is_relevant)
    raise ValueError('Relevance score length < k')


def average_precision(r):
    """Mean of precision@k taken at every rank k that holds a relevant item.

    Parameters
    ----------
    r : array-like
        Relevance scores in rank order; any non-zero value counts as relevant.

    Returns
    -------
    float
        The average precision, or 0. when no entry is relevant.
    """
    is_relevant = np.asarray(r) != 0
    precisions = []
    for position in range(is_relevant.size):
        if is_relevant[position]:
            precisions.append(precision_at_k(is_relevant, position + 1))
    if len(precisions) == 0:
        return 0.
    return np.mean(precisions)


def mean_average_precision(rs):
    """Mean of `average_precision` over an iterable of relevance lists."""
    per_query = []
    for relevance in rs:
        per_query.append(average_precision(relevance))
    return np.mean(per_query)

def dcg_at_k(r, k, method=0):
    """Discounted cumulative gain of the first k relevance scores.

    Parameters
    ----------
    r : array-like
        Relevance scores in rank order.
    k : int
        Number of leading scores to use.
    method : int
        0: the first score is undiscounted, the score at rank i >= 2 is divided by log2(i).
        1: the score at rank i is divided by log2(i + 1).

    Returns
    -------
    float
        The DCG value, or 0. when there is no score to use.

    Raises
    ------
    ValueError
        If `method` is neither 0 nor 1 (only checked when scores are present).
    """
    # np.asarray(..., dtype=float) replaces np.asfarray, which NumPy 2.0 removed.
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.

def ndcg_at_k(r, k=20, method=1):
    """Normalized DCG: DCG of r divided by the DCG of r sorted in descending order.

    Returns 0. when the ideal (sorted) DCG is zero.
    """
    ideal_ranking = sorted(r, reverse=True)
    ideal_dcg = dcg_at_k(ideal_ranking, k, method)
    if ideal_dcg:
        return dcg_at_k(r, k, method) / ideal_dcg
    return 0.

In [None]:
# Per-user metric values; only users with at least one ground-truth course are scored.
p=[] #precision
r=[] #recall
m=[] #map
n=[] #ndcg
correct_list=[]  # hit flags of every user's predictions, scored or not
K=10

def _mean_or_nan(scores):
    """Arithmetic mean of a list, or NaN for an empty list (instead of ZeroDivisionError)."""
    return sum(scores)/len(scores) if scores else float('nan')

# Iterate over the training user ids themselves instead of a hard-coded user count.
for i in tqdm([int(uid) for uid in users]):
    eval_gt = asis_return(i)
    eval_pred = predict_return(i,K) # at most K predictions

    # 1 when the predicted course is in the ground truth, else 0 (rank order kept).
    temp_req=[1 if c in eval_gt[0] else 0 for c in eval_pred[0]]

    correct_list.append(temp_req)

    if eval_gt[0]:
        # precision_at_k asserts k >= 1, so an empty prediction list is scored as 0 directly.
        p.append(precision_at_k(temp_req,len(temp_req)) if temp_req else 0.0)
        r.append(recall_(temp_req, eval_gt[0]))
        m.append(mean_average_precision([temp_req]))
        n.append(ndcg_at_k(temp_req,len(temp_req)))

print("{:.4f} : Precision".format(_mean_or_nan(p)))
print("{:.4f} : Recall".format(_mean_or_nan(r)))
print("{:.4f} : MAP".format(_mean_or_nan(m)))
print("{:.4f} : NDCG".format(_mean_or_nan(n)))