In [17]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

In [18]:
def str_to_list(string):
    if string:
        return string.split(',')
    else:
        return []

In [19]:
def remove_self(x):
    if x[0] in x[1]:
        return np.delete(x[1],np.where(x[0]==x[1])[0])
    else:
        return x[1][:6]

In [24]:
def load_data():
    problem = pd.read_csv('problems.csv', encoding='utf-8')  # 백준 문제 데이터
    users = pd.read_csv('users.csv', encoding='utf-8')  # solved.ac이랑 연동된 유저 데이터
    solved_problem = pd.read_csv('solved_problems.csv')  # solved.ac 유저별 푼 문제 데이터

    problem['tags'] = problem['tags'].apply(lambda x: str_to_list(x))
    problem['titles'] = problem['titles'].apply(lambda x: str_to_list(x))
    solved_problem['problems'] = solved_problem['problems'].apply(lambda x: str_to_list(x) if isinstance(x, str) else [])
    problem['tags'] = problem['tags'].apply(lambda x: list(x))

    # 전처리
    problem = problem[problem.isSolvable == True]
    problem = problem[problem.official == True]
    problem['tags'].loc[problem.tags.isnull()] = ''
    problem = problem[problem['level'] != 0]
    
    solved_problem.drop(solved_problem[solved_problem.problems == ''].index, axis=0, inplace=True)
    users = users[users.handle.isin(solved_problem.handle)]
    users.drop(users[users.handle.isin(list(set(users[users.solvedCount == 0].handle) - set(solved_problem.handle)))].index, axis=0, inplace=True)
    users = users[users.handle.isin(solved_problem.handle)]
    problem.reset_index(inplace=True, drop=True)
    solved_problem.reset_index(inplace=True, drop=True)
    users.reset_index(inplace=True, drop=True)

    return problem, solved_problem, users

In [25]:
def preprocess_rival(problem, solved_problem, users):
    
    # 레벨별 문제풀이수
    user_problems = solved_problem[['handle', 'problems']]
    user_problems['problems'] = user_problems['problems'].astype(str)
    user_problems['problems'] = user_problems['problems'].str.replace('[', '').str.replace(']', '')
    user_problems['problems'] = user_problems['problems'].str.split(',')
    user_problems = user_problems.explode('problems')
    user_problems['problems'] = user_problems['problems'].str.strip()
    user_problems['problems'] = user_problems['problems'].str.strip("'\"")
    user_problems['problems'] = user_problems['problems'].str.replace("'", "").str.strip()
    user_problems['problems'] = user_problems['problems'].fillna(0)
    user_problems['problems'] = user_problems['problems'].replace('', 0)
    user_problems['problems'] = user_problems['problems'].astype(int)
    user_problems = user_problems.dropna()

    user_problems = pd.merge(user_problems, problem[['problemId', 'level']], left_on='problems', right_on='problemId', how='left')
    user_problems = user_problems.groupby(['handle', 'level']).size().reset_index(name='count')
    user_problems = pd.DataFrame(user_problems)
    user_problems.reset_index(inplace=True)
    user_problems['problems'] = 1
    user_problems = user_problems.pivot_table(index="handle", columns=["level"], aggfunc=np.sum, values='problems', fill_value=0)
    user_problems = pd.DataFrame(user_problems)
    users_info = user_problems.reset_index()

    # 정규화  
    scaler = MinMaxScaler()
    num_vars = list(users_info.columns[1:])
    if not users_info.empty:
        users_info[num_vars] = scaler.fit_transform(users_info[num_vars])
    else:
        print("Error: Empty DataFrame")

    real_users = users[['handle', 'solvedCount', 'class', 'tier', 'ratingByClass', 'ratingBySolvedCount', 'ratingByProblemsSum']]
    scaler = MinMaxScaler()
    num_vars = ['solvedCount', 'class', 'ratingByClass', 'ratingBySolvedCount', 'ratingByProblemsSum']
    real_users[num_vars] = scaler.fit_transform(real_users[num_vars])

    real_users = pd.merge(users_info, real_users, on='handle')
    real_users = real_users.sort_values('handle')
    real_users.reset_index(drop=True, inplace=True)
    
    return real_users

In [36]:
def rival_knn_main():
    problem, solved_problem, users= load_data()
    print('데이터 로드 완료')
    
    data = preprocess_rival(problem, solved_problem, users)
    print('데이터 전처리 완료')
    
    knn = NearestNeighbors(n_neighbors=7, p=2)
    df_data = np.array(data.iloc[:, 1:])
    knn.fit(df_data)
    rival_idx = knn.kneighbors(df_data, return_distance=False)
    print('knn 학습 완료')
    
    result = ([[k, v] for k, v in zip(list(range(len(rival_idx))), rival_idx)])
    df_result = pd.DataFrame(result, columns=['handle', 'rec_rival'])
    df_result[1] = df_result.apply(remove_self, axis=1)
    df_result = df_result[1]
    df_result['rec_rival'] = df_result['rec_rival'].str.strip(",")
    lst_rivals = [','.join(list(data.loc[x, 'handle'])) for x in df_result.index]

    target_users = list(df_data.handle)
    output = pd.DataFrame(target_users, columns=['handle'])
    output['rec_rivals'] = lst_rivals
    output.index += 1
    output.index.name = 'id'
    output.to_csv('rec_rival_knn.csv')

    print('라이벌 추천 완료!')
    return output

In [37]:
# KNN 모델 학습 및 라이벌 추천 수행
output = rival_knn_main()
print(output)

데이터 로드 완료
데이터 전처리 완료
knn 학습 완료


KeyError: 'rec_rival'