In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

# Build the Spark session for this notebook (getOrCreate reuses a running one)
spark = (
    SparkSession.builder
    .appName("spark")
    .getOrCreate()
)

# Keep a handle to the SparkContext that belongs to the session
sc = spark.sparkContext



In [54]:
# ----------------------------------------------------------------------
# Column layout of one book profile (one row of KorBookMatrix):
#   [0]       book unique id  -> 1 column
#   [1:2856]  genre vector    -> 2855 columns
#   [2856]    rating          -> 1 column
#   [2857]    page            -> 1 column
#   [2858]    author          -> 1 column
#   [2859]    publisher       -> 1 column
# Size of a single book profile = 2860
# ----------------------------------------------------------------------
import pickle
import time

import numpy as np

# NOTE(review): pickle.load can execute arbitrary code from the file;
# only load pickles that come from a trusted source.
with open("../data/KorBookMatrix.pkl", "rb") as f:
    KorBookMatrix = pickle.load(f)
print(KorBookMatrix.shape)

# Dictionary of all Korean genres (its length is printed below)
with open("../data/korean-all-genres-dic.pkl", "rb") as f:
    genres_dic = pickle.load(f)
print(len(genres_dic))

(43954, 2860)
2855


# genre만 받았을 경우

In [19]:
genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]

# Column 0 is left unused, so the profile is one wider than the genre dictionary
userProfile = np.zeros((1, len(genres_dic) + 1))
for genre in genres:
    # genres_dic gives the column index of each selected genre
    userProfile[0, genres_dic[genre]] = 1.

# Show which columns were switched on
for i, flag in enumerate(userProfile[0]):
    if flag == 1.0:
        print(i)

5
27
39
287
1010


### 일반적인 content based search

In [20]:
from numpy.linalg import norm

def cos_similarity(x: np.ndarray, y: np.ndarray) -> float:
    """Return the cosine similarity between two vectors.

    Computed as dot(x, y) / (norm(x) * norm(y)).

    Args:
        x: First vector.
        y: Second vector, same length as ``x``.

    Returns:
        The cosine similarity. When either vector has zero norm the cosine
        is undefined; 0.0 is returned in that case instead of NaN, so the
        result can still be compared and sorted.
    """
    denominator = norm(x) * norm(y)
    # A zero-norm vector would give 0/0 = NaN (plus a RuntimeWarning);
    # NaN does not order against other floats and breaks ranking by score.
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator

In [50]:
def linear_search(KorBookMatrix, userProfile, topNum):
    """Rank every book by cosine similarity to the user's genre profile.

    Args:
        KorBookMatrix: Iterable of book profile rows. Column 0 is read as the
            book id and the columns from 1 onward as the genre vector.
        userProfile: 2-D array of shape (1, number_of_genres + 1); column 0 is
            skipped and the remaining columns are the user's genre vector.
        topNum: Number of best matches to return.

    Returns:
        A list of at most ``topNum`` tuples ``(book_id, similarity)``, ordered
        from most to least similar.
    """
    # Take the end of the genre block from the profile width instead of a
    # hard-coded 2856, so both slices always have the same length.
    genreEnd = userProfile.shape[1]
    userGenres = userProfile[0, 1:]

    similarity = [
        (int(bookProfile[0]), cos_similarity(bookProfile[1:genreEnd], userGenres))
        for bookProfile in KorBookMatrix
    ]

    def rank(entry):
        score = entry[1]
        # NaN compares False against everything and would scramble the sort;
        # send such entries to the end of the ranking.
        if score != score:
            return float("inf")
        return -score

    return sorted(similarity, key=rank)[:topNum]


In [60]:
# Time one full search (including printing the first five hits)
start = time.time()
top_hits = linear_search(KorBookMatrix, userProfile, 100)
print(top_hits[:5])
print("=" * 100)
elapsed = time.time() - start
print(elapsed, "(second)")

[(0, 0.9999999999999998), (42, 0.9999999999999998), (129, 0.9999999999999998), (184, 0.9999999999999998), (198, 0.9999999999999998)]
0.7161579132080078 (second)


In [66]:
# Benchmark: the average search time consistently comes out around 0.86 s
iterNum = 10

# Seeded generator so the random profiles (and printed matches) are reproducible
rng = np.random.RandomState(42)
sizes = rng.randint(low=1, high=10, size=iterNum)

addTime = 0
for i, size in enumerate(sizes):
    # randint's `high` is exclusive: len(genres_dic) + 1 makes the last genre
    # column reachable (index 0 stays unused, matching the profile layout).
    r = rng.randint(low=1, high=len(genres_dic) + 1, size=size)
    userProfile = np.zeros((1, len(genres_dic) + 1))
    userProfile[0, r] = 1.0

    # perf_counter is a monotonic, high-resolution clock meant for timing
    start = time.perf_counter()
    best = linear_search(KorBookMatrix, userProfile, 100)
    addTime += time.perf_counter() - start
    print(f"iter{i+1}|", best[:2])

print("=" * 100)
print(addTime / iterNum)

iter1| [(24416, 0.2886751345948129), (24424, 0.2886751345948129)]
iter2| [(13909, 0.35355339059327373), (13925, 0.35355339059327373)]
iter3| [(57152, 0.5), (57159, 0.5)]
iter4| [(52734, 0.2581988897471611), (52744, 0.2581988897471611)]
iter5| [(15370, 0.19245008972987526), (15371, 0.19245008972987526)]
iter6| [(29704, 0.35355339059327373), (29705, 0.35355339059327373)]
iter7| [(49520, 0.3086066999241838), (997, 0.23570226039551587)]
iter8| [(14374, 0.40824829046386296), (14380, 0.40824829046386296)]
iter9| [(27673, 0.2581988897471611), (27693, 0.2581988897471611)]
iter10| [(17408, 0.2581988897471611), (17436, 0.2581988897471611)]
0.8629757642745972


In [69]:
# Cosine similarity ignores vector length: scaling y must not change the result
x = np.array([1, 2, 1])
for y in (np.array([2, 2, 2]), np.array([1, 1, 1])):
    print(cos_similarity(x, y))

0.9428090415820635
0.9428090415820635


### LSH를 통한 content based search

In [None]:
### 충분히 빠르기 때문에 할 필요가 없음

# book과 genre를 같이 받았을 경우

### book과 genre 모두를 각각 vector로 하여 user profile을 matrix로 만들어 search

### book과 genre에 대한 정보를 하나로 합쳐 하나의 vector를 가지고 search

### 위 하나의 vector를 가지고 LSH search

In [None]:

def linear_search(A, query_index, num_neighbors):
    """Return the indices of the rows of A closest to row ``query_index``.

    Every row except the query row is scored with ``l1`` against the query
    row, and the ``num_neighbors`` rows with the smallest score are returned
    in ascending order of that score.

    NOTE(review): this redefines ``linear_search`` with a different signature
    than the content-based version defined earlier in this notebook, and
    ``l1`` is not defined in the visible cells (presumably an L1 distance -
    confirm before running).
    """
    scored = [
        (row, l1(A[row], A[query_index]))
        for row in range(len(A))
        if row != query_index
    ]
    scored.sort(key=lambda pair: pair[1])
    return [row for row, _ in scored[:num_neighbors]]