In [11]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

In [12]:
def str_to_list(string):
    if string:
        return string.split(',')
    else:
        return []

In [13]:
def remove_self(x):
    if x[0] in x[1]:
        return np.delete(x[1],np.where(x[0]==x[1])[0])
    else:
        return x[1][:6]

In [14]:
def load_data():
    problem = pd.read_csv('data/problems.csv', encoding='utf-8')  # 백준 문제 데이터
    users = pd.read_csv('data/users.csv', encoding='utf-8')  # solved.ac이랑 연동된 유저 데이터
    solved_problem = pd.read_csv('data/solved_problems.csv')  # solved.ac 유저별 푼 문제 데이터

    # csv파일 str list제거 후 list화
    # problem에서는 태그와 이름 
    # solved_problem에서는 푼 문제들
    problem['tags'] = problem['tags'].apply(lambda x: str_to_list(x))
    problem['titles'] = problem['titles'].apply(lambda x: str_to_list(x))
    solved_problem['problems'] = solved_problem['problems'].apply(lambda x: str_to_list(x) if isinstance(x, str) else [])
    problem['tags'] = problem['tags'].apply(lambda x: list(x))

    # 전처리
    problem = problem[problem.isSolvable == True]
    problem = problem[problem.official == True]
    problem['tags'].loc[problem.tags.isnull()] = ''
    problem = problem[problem['level'] != 0]
    
    solved_problem.drop(solved_problem[solved_problem.problems == ''].index, axis=0, inplace=True)
    users = users[users.handle.isin(solved_problem.handle)]
    users.drop(users[users.handle.isin(list(set(users[users.solvedCount == 0].handle) - set(solved_problem.handle)))].index, axis=0, inplace=True)
    users = users[users.handle.isin(solved_problem.handle)]
    problem.reset_index(inplace=True, drop=True)
    solved_problem.reset_index(inplace=True, drop=True)
    users.reset_index(inplace=True, drop=True)

    return problem, solved_problem, users

In [15]:
problem, solved_problem, users = load_data()

In [18]:
problem

Unnamed: 0,problemId,titleKo,titles,isSolvable,isPartial,acceptedUserCount,level,votedUserCount,sprout,givesNoRating,isLevelLocked,averageTries,official,tags,metadata
0,1000,A+B,"[[{'language': 'en', 'languageDisplayName': '...",True,False,245074,1,17,True,False,True,2.4848,True,"[[{'key': 'implementation', 'isMeta': False, ...",{}
1,1001,A-B,"[[{'language': 'ko', 'languageDisplayName': '...",True,False,208618,1,8,True,False,True,1.4197,True,"[[{'key': 'implementation', 'isMeta': False, ...",{}
2,1002,터렛,"[[{'language': 'ko', 'languageDisplayName': '...",True,False,32966,8,211,False,False,False,4.4892,True,"[[{'key': 'case_work', 'isMeta': False, 'boj...",{}
3,1003,피보나치 함수,"[[{'language': 'ko', 'languageDisplayName': '...",True,False,44843,8,181,False,False,False,3.0853,True,"[[{'key': 'dp', 'isMeta': False, 'bojTagId':...",{}
4,1004,어린 왕자,"[[{'language': 'ko', 'languageDisplayName': '...",True,False,13464,8,121,False,False,False,2.1931,True,"[[{'key': 'geometry', 'isMeta': False, 'bojT...",{}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19948,28228,Parking Party,"[[{'language': 'en', 'languageDisplayName': '...",True,False,2,10,2,False,False,False,1.0000,True,"[[{'key': 'greedy', 'isMeta': False, 'bojTag...",{}
19949,28229,Ammunition Storage,"[[{'language': 'en', 'languageDisplayName': '...",True,False,1,19,1,False,False,False,5.0000,True,"[[{'key': 'binary_search', 'isMeta': False, ...",{}
19950,28233,Magic with Cards,"[[{'language': 'en', 'languageDisplayName': '...",True,False,4,10,2,False,False,False,1.5000,True,"[[{'key': 'bfs', 'isMeta': False, 'bojTagId'...",{}
19951,28246,돌 가져가기 게임,"[[{'language': 'ko', 'languageDisplayName': '...",True,False,7,23,6,False,False,False,1.7143,True,"[[{'key': 'dp', 'isMeta': False, 'bojTagId':...",{}


In [19]:
solved_problem

Unnamed: 0,handle,problems
0,koosaga,"[['2502', '12106', '18483', '2795', '18947..."
1,cki86201,"[['2502', '11254', '12106', '16928', '1848..."
2,mitnegativeinfinity,"[['18855', '18483', '11622', '1372', '2466..."
3,ainta,"[['11865', '1372', '24270', '2795', '10826..."
4,yclock,"[['5565', '16928', '20090', '10759', '2753..."
...,...,...
117495,youhk43,[[]]
117496,youjin1952,[[]]
117497,youjong12,[[]]
117498,youngin39,[[]]


In [22]:
def preprocess_rival(problem, solved_problem, users):
    
    # 레벨별 문제풀이수
    user_problems = solved_problem[['handle', 'problems']]
    user_problems['problems'] = user_problems['problems'].astype(str)
    user_problems['problems'] = user_problems['problems'].str.replace('[', '').str.replace(']', '')
    user_problems['problems'] = user_problems['problems'].str.split(',')
    user_problems = user_problems.explode('problems')
    user_problems['problems'] = user_problems['problems'].str.strip()
    user_problems['problems'] = user_problems['problems'].str.strip("'\"")
    user_problems['problems'] = user_problems['problems'].str.replace("'", "").str.strip()
    user_problems['problems'] = user_problems['problems'].fillna(0)
    user_problems['problems'] = user_problems['problems'].replace('', 0)
    user_problems['problems'] = user_problems['problems'].astype(int)
    user_problems = user_problems.dropna()

    user_problems = pd.merge(user_problems, problem[['problemId', 'level']], left_on='problems', right_on='problemId', how='left')
    user_problems = user_problems.groupby(['handle', 'level']).size().reset_index(name='count')
    user_problems = pd.DataFrame(user_problems)
    user_problems.reset_index(inplace=True)
    user_problems['problems'] = 1
    user_problems = user_problems.pivot_table(index="handle", columns=["level"], aggfunc=np.sum, values='problems', fill_value=0)
    user_problems = pd.DataFrame(user_problems)
    users_info = user_problems.reset_index()

    # 정규화  
    scaler = MinMaxScaler()
    num_vars = list(users_info.columns[1:])
    if not users_info.empty:
        users_info[num_vars] = scaler.fit_transform(users_info[num_vars])
    else:
        print("Error: Empty DataFrame")

    real_users = users[['handle', 'solvedCount', 'class', 'tier', 'ratingByClass', 'ratingBySolvedCount', 'ratingByProblemsSum']]
    scaler = MinMaxScaler()
    num_vars = ['solvedCount', 'class', 'ratingByClass', 'ratingBySolvedCount', 'ratingByProblemsSum']
    real_users[num_vars] = scaler.fit_transform(real_users[num_vars])

    real_users = pd.merge(users_info, real_users, on='handle')
    real_users = real_users.sort_values('handle')
    real_users.reset_index(drop=True, inplace=True)
    
    return real_users

In [20]:
users

Unnamed: 0,handle,bio,badgeId,backgroundId,profileImageUrl,solvedCount,voteCount,class,classDecoration,rivalCount,...,ratingByVoteCount,maxStreak,coins,stardusts,joinedAt,bannedUntil,proUntil,rank,isRival,isReverseRival
0,koosaga,,orangecup,s2020-ruby1,https://static.solved.ac/uploads/profile/koosa...,12668,3411,10,gold,5,...,25,24,0,81200,2021-06-19T00:00:00.000Z,1970-01-01T00:00:00.000Z,2025-12-31T23:59:59.000Z,1,False,False
1,cki86201,,,abstract_001_light,,6245,263,10,gold,0,...,25,5,0,16600,2021-06-19T00:00:00.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,2,False,False
2,mitnegativeinfinity,,beta_contributor,balloon_001,,2292,462,10,none,0,...,25,8,0,19375,2021-06-19T00:00:00.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,3,False,False
3,ainta,,,s2020-ruby1,,4309,133,10,none,0,...,25,2,0,18755,2021-06-19T00:00:00.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,4,False,False
4,yclock,Let yclock Win ICPC WF! // Twitter: @youngyoju...,beta_contributor,ucpc2022,https://static.solved.ac/uploads/profile/ycloc...,2967,643,10,none,7,...,25,6,0,166994,2021-06-19T00:00:00.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,5,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117495,youhk43,,,balloon_003,,0,0,0,none,0,...,0,0,0,0,2022-06-27T14:06:15.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,115105,False,False
117496,youjin1952,,,balloon_003,,0,0,0,none,0,...,0,0,0,0,2022-12-29T09:34:49.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,115105,False,False
117497,youjong12,,,balloon_001,,0,0,0,none,0,...,0,0,0,0,2021-06-19T00:00:00.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,115105,False,False
117498,youngin39,,,abstract_002_light,,0,0,0,none,0,...,0,0,0,0,2022-12-20T03:51:01.000Z,1970-01-01T00:00:00.000Z,1970-01-01T00:00:00.000Z,115105,False,False


In [23]:
data = preprocess_rival(problem, solved_problem, users)

In [24]:
data

Unnamed: 0,handle,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0,9.0,...,27.0,28.0,29.0,30.0,solvedCount,class,tier,ratingByClass,ratingBySolvedCount,ratingByProblemsSum
0,000000,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002976,0.1,3,0.10,0.171429,0.013732
1,0000000000,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.103078,0.7,21,0.88,1.000000,0.654577
2,000000hj,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.005718,0.1,6,0.10,0.308571,0.063028
3,00000133,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0,0.00,0.000000,0.000000
4,0000064,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.010026,0.1,9,0.10,0.474286,0.188028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115686,zzzzz9887,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002037,0.1,2,0.10,0.120000,0.011620
115687,zzzzzz,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.002898,0.0,8,0.00,0.171429,0.146479
115688,zzzzzzzbob,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.002820,0.0,4,0.00,0.165714,0.036268
115689,zzzzzzzz,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.007363,0.1,10,0.10,0.377143,0.238732


In [25]:
knn = NearestNeighbors(n_neighbors=7, p=1)
df_data = np.array(data.iloc[:, 1:])
knn.fit(df_data)
rival_idx = knn.kneighbors(df_data, return_distance=False)

print('knn 학습 완료')

knn 학습 완료


In [26]:
result = ([[k, v] for k, v in zip(list(range(len(rival_idx))), rival_idx)])
    

In [52]:
data['handle'][24641]

'dongglee8908'

In [27]:
result

[[0, array([    0, 53262, 22561, 30944, 56823, 32127, 81930])],
 [1, array([     1,  56188,  44469,  93725, 111528, 112034,  75762])],
 [2, array([76297,     2, 20454, 56225, 31958, 70995, 37776])],
 [3, array([190, 304, 205, 192,  94,   3,  13])],
 [4, array([     4,  58690, 107973,  54783,  65592,  35402,  14167])],
 [5, array([    5, 61852, 37944, 62901, 15447, 43938, 22573])],
 [6, array([    6, 31854, 11908, 99457, 94900, 18605, 97095])],
 [7, array([    7, 44339, 38467, 90172, 90874, 50761, 55125])],
 [8, array([     8,  93224,  21772,  83762, 100701,   5463,  66699])],
 [9, array([     9,  46846,   7431,  93712,  53374, 104686,  19707])],
 [10, array([ 21974,     10,  11091, 112416,  49025,  10223,  73388])],
 [11, array([    11, 112556, 102966,  87977,  82075,  77776,  79775])],
 [12, array([   12, 12277,  5859, 10419,  5493,  6205,  1678])],
 [13, array([190, 304, 205, 192,  94,   3,  13])],
 [14, array([   14, 40750,  4685, 65878, 53576, 85041, 41642])],
 [15, array([    15, 

In [67]:
df_result = pd.DataFrame(result, columns=['handle', 'rec_rival'])

In [39]:
df_result

Unnamed: 0,handle,rec_rival
0,0,"[0, 53262, 22561, 30944, 56823, 32127, 81930]"
1,1,"[1, 56188, 44469, 93725, 111528, 112034, 75762]"
2,2,"[76297, 2, 20454, 56225, 31958, 70995, 37776]"
3,3,"[190, 304, 205, 192, 94, 3, 13]"
4,4,"[4, 58690, 107973, 54783, 65592, 35402, 14167]"
...,...,...
115686,115686,"[115686, 23419, 91441, 49680, 96529, 108929, 4..."
115687,115687,"[115687, 96735, 1064, 23077, 63453, 64782, 35643]"
115688,115688,"[115688, 36376, 81455, 49018, 50767, 64966, 10..."
115689,115689,"[115689, 107485, 10233, 52700, 22452, 72483, 1..."


In [68]:
df_result[1] = df_result.apply(remove_self, axis=1)

In [69]:
df_result

Unnamed: 0,handle,rec_rival,1
0,0,"[0, 53262, 22561, 30944, 56823, 32127, 81930]","[53262, 22561, 30944, 56823, 32127, 81930]"
1,1,"[1, 56188, 44469, 93725, 111528, 112034, 75762]","[56188, 44469, 93725, 111528, 112034, 75762]"
2,2,"[76297, 2, 20454, 56225, 31958, 70995, 37776]","[76297, 20454, 56225, 31958, 70995, 37776]"
3,3,"[190, 304, 205, 192, 94, 3, 13]","[190, 304, 205, 192, 94, 13]"
4,4,"[4, 58690, 107973, 54783, 65592, 35402, 14167]","[58690, 107973, 54783, 65592, 35402, 14167]"
...,...,...,...
115686,115686,"[115686, 23419, 91441, 49680, 96529, 108929, 4...","[23419, 91441, 49680, 96529, 108929, 46802]"
115687,115687,"[115687, 96735, 1064, 23077, 63453, 64782, 35643]","[96735, 1064, 23077, 63453, 64782, 35643]"
115688,115688,"[115688, 36376, 81455, 49018, 50767, 64966, 10...","[36376, 81455, 49018, 50767, 64966, 10080]"
115689,115689,"[115689, 107485, 10233, 52700, 22452, 72483, 1...","[107485, 10233, 52700, 22452, 72483, 100722]"


In [70]:
df_result["rec_users_list"] = "hh"


In [71]:
df_result["rec_users_list"][0]=1

In [72]:
df_result

Unnamed: 0,handle,rec_rival,1,rec_users_list
0,0,"[0, 53262, 22561, 30944, 56823, 32127, 81930]","[53262, 22561, 30944, 56823, 32127, 81930]",1
1,1,"[1, 56188, 44469, 93725, 111528, 112034, 75762]","[56188, 44469, 93725, 111528, 112034, 75762]",hh
2,2,"[76297, 2, 20454, 56225, 31958, 70995, 37776]","[76297, 20454, 56225, 31958, 70995, 37776]",hh
3,3,"[190, 304, 205, 192, 94, 3, 13]","[190, 304, 205, 192, 94, 13]",hh
4,4,"[4, 58690, 107973, 54783, 65592, 35402, 14167]","[58690, 107973, 54783, 65592, 35402, 14167]",hh
...,...,...,...,...
115686,115686,"[115686, 23419, 91441, 49680, 96529, 108929, 4...","[23419, 91441, 49680, 96529, 108929, 46802]",hh
115687,115687,"[115687, 96735, 1064, 23077, 63453, 64782, 35643]","[96735, 1064, 23077, 63453, 64782, 35643]",hh
115688,115688,"[115688, 36376, 81455, 49018, 50767, 64966, 10...","[36376, 81455, 49018, 50767, 64966, 10080]",hh
115689,115689,"[115689, 107485, 10233, 52700, 22452, 72483, 1...","[107485, 10233, 52700, 22452, 72483, 100722]",hh


In [73]:
def index_to_users(index_list):
    users_list = [data['handle'][i] for i in index_list]
    return users_list

In [74]:
for i,rows in df_result.iterrows():
    df_result["handle"][i] = data['handle'][df_result["handle"][i]]
    df_result["rec_users_list"][i] = index_to_users(rows[1])

In [76]:
df_result.drop(['rec_rival',1],axis=1,inplace=True)

In [79]:
df_result.to_csv("./rec_rival.csv")

In [36]:
def rival_knn_main():
    problem, solved_problem, users= load_data()
    print('데이터 로드 완료')
    
    data = preprocess_rival(problem, solved_problem, users)
    print('데이터 전처리 완료')
    
    knn = NearestNeighbors(n_neighbors=7, p=1)
    df_data = np.array(data.iloc[:, 1:])
    knn.fit(df_data)
    rival_idx = knn.kneighbors(df_data, return_distance=False)
    print('knn 학습 완료')
    
    result = ([[k, v] for k, v in zip(list(range(len(rival_idx))), rival_idx)])
    df_result = pd.DataFrame(result, columns=['handle', 'rec_rival'])
    df_result[1] = df_result.apply(remove_self, axis=1)
    df_result = df_result[1]
    df_result['rec_rival'] = df_result['rec_rival'].str.strip(",")
    lst_rivals = [','.join(list(data.loc[x, 'handle'])) for x in df_result.index]

    target_users = list(df_data.handle)
    output = pd.DataFrame(target_users, columns=['handle'])
    output['rec_rivals'] = lst_rivals
    output.index += 1
    output.index.name = 'id'
    output.to_csv('rec_rival_knn.csv')

    print('라이벌 추천 완료!')
    return output

In [37]:
# KNN 모델 학습 및 라이벌 추천 수행
output = rival_knn_main()
print(output)

데이터 로드 완료
데이터 전처리 완료
knn 학습 완료


KeyError: 'rec_rival'