In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

# Obtain the notebook's SparkSession (application name "spark") through
# the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("spark")
    .getOrCreate()
)

# Keep a handle on the SparkContext exposed by that session.
sc = spark.sparkContext



In [3]:
####################################
# Column layout of one book profile row (987 values in total):
# [0] = book unique id -> 1
# [1:983] = genre vector -> 982
# [983] = rating -> 1
# [984] = page -> 1
# [985] = author -> 1
# [986] = publisher -> 1
####################################
import pickle
import time

import numpy as np

# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only point these paths at files from a trusted source.
BOOK_MATRIX_PATH = "../data/ForBookMatrix.pkl"
GENRES_DIC_PATH = "../data/foreign-all-genres-dic.pkl"

# Book profile matrix; its shape is printed as a sanity check.
with open(BOOK_MATRIX_PATH, "rb") as f:
    ForBookMatrix = pickle.load(f)
print(ForBookMatrix.shape)

# genres_dic is looked up by genre name when user profiles are built.
with open(GENRES_DIC_PATH, "rb") as f:
    genres_dic = pickle.load(f)
print(len(genres_dic))

(47835, 987)
982


# genre만 받았을 경우

In [13]:
# Genres selected by the user; every name is looked up in genres_dic.
genres = ["Young Adult", "Fiction", "Dystopia", "Fantasy", "Science Fiction", "Romance", "Adventure", "Teen"]

# One row with len(genres_dic) + 1 columns, so column 0 is left unused.
userProfile = np.zeros((1, len(genres_dic) + 1))
# Switch on the column of every selected genre in one indexed assignment.
userProfile[0, [genres_dic[name] for name in genres]] = 1.

# Print the column indices that are set, as a quick visual check.
for column, flag in enumerate(userProfile[0]):
    if flag == 1.0:
        print(column)

1
2
3
4
13
17
35
57


### 일반적인 content based search

In [14]:
from numpy.linalg import norm

def cos_similarity(x: np.ndarray, y: np.ndarray):
    """Return the cosine similarity of two vectors.

    Parameters
    ----------
    x, y : np.ndarray
        Vectors of equal length (used with 1-D genre vectors in this notebook).

    Returns
    -------
    float
        ``dot(x, y) / (norm(x) * norm(y))``, or ``0.0`` when either vector
        has zero norm.
    """
    denominator = norm(x) * norm(y)
    # A zero vector has no direction: the plain division would be 0/0 = NaN,
    # and NaN scores make any later sort by score unreliable.
    if denominator == 0:
        return 0.0
    return np.dot(x, y) / denominator

In [15]:
def linear_search(ForBookMatrix, userProfile, topNum):
    """Score every book against the user profile and return the best matches.

    For each row of ForBookMatrix, element 0 is taken as the book id and
    elements 1:983 are compared with userProfile[0, 1:] via cos_similarity.

    Returns a list of at most topNum (book_id, similarity) tuples, ordered
    from highest to lowest similarity.
    """
    user_genres = userProfile[0, 1:]
    scored = [
        (int(row[0]), cos_similarity(row[1:983], user_genres))
        for row in ForBookMatrix
    ]
    # Negating the score sorts high-to-low while the stable sort keeps
    # equally scored books in their original row order.
    scored.sort(key=lambda pair: -pair[1])
    return scored[:topNum]


In [17]:
# Time a single top-100 query. perf_counter() is a monotonic,
# high-resolution clock intended for measuring short durations, and the
# clock is stopped right after the search so printing is not measured.
start = time.perf_counter()
top_matches = linear_search(ForBookMatrix, userProfile, 100)
elapsed = time.perf_counter() - start
print(top_matches[:5])
print("="*100)
print(elapsed,"(second)")

[(0, 0.8944271909999159), (21, 0.8944271909999159), (151, 0.8944271909999159), (184, 0.8944271909999159), (221, 0.8944271909999159)]
0.6767556667327881 (second)


In [19]:
# Author's note: the average time kept coming out at about 0.86 s.
# Benchmark: average linear_search latency over random user profiles.
BENCH_SEED = 42  # fixed seed so the random profiles are reproducible
rng = np.random.default_rng(BENCH_SEED)

iterNum = 10
# Number of genres drawn per profile: 1..9 (the upper bound is exclusive).
sizes = rng.integers(low=1, high=10, size=iterNum)
addTime = 0
for i, size in enumerate(sizes):
    # Genre columns run from 1 to len(genres_dic); the upper bound is
    # exclusive, so len(genres_dic) + 1 keeps the last genre column reachable.
    r = rng.integers(low=1, high=len(genres_dic) + 1, size=size)
    userProfile = np.zeros((1,len(genres_dic)+1))
    userProfile[0,r] = 1.0
    # perf_counter() is monotonic and meant for timing short intervals.
    start = time.perf_counter()
    best = linear_search(ForBookMatrix, userProfile, 100)
    addTime += time.perf_counter()-start
    print(f"iter{i+1}|",best[:2])

print("="*100)
print(addTime/iterNum)

iter1| [(31847, 0.25), (45117, 0.25)]
iter2| [(8315, 0.4472135954999579), (11355, 0.4472135954999579)]
iter3| [(26846, 0.40824829046386296), (38573, 0.40824829046386296)]
iter4| [(39420, 0.2886751345948129), (38724, 0.20412414523193154)]
iter5| [(11181, 0.3333333333333333), (12297, 0.3333333333333333)]
iter6| [(17474, 0.31622776601683794), (21312, 0.31622776601683794)]
iter7| [(2374, 0.5773502691896258), (4363, 0.5773502691896258)]
iter8| [(25805, 0.33333333333333337), (11727, 0.2886751345948129)]
iter9| [(19226, 0.5), (37779, 0.5)]
iter10| [(46894, 0.31622776601683794), (29507, 0.2357022603955158)]
0.6068754911422729


### LSH를 통한 content based search

In [None]:
### Not needed: the linear search is already fast enough