In [13]:
import hashlib

def myHash(s):
    if s == None:
        return 0
    else:
        return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 9)

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

# create the Spark Session
spark = SparkSession.builder.appName("spark").getOrCreate()

# create the Spark Context
sc = spark.sparkContext



In [147]:
####################################
# [0] = book unique id -> 1
# [1:2856] = genre vector -> 2855
# [2856] = author -> 1
# 하나의 book profile 크기 = 2857
####################################
import time
import pickle
with open("../data/KorBookMatrix.pkl", "rb") as f:
    KorBookMatrix = pickle.load(f)
print(KorBookMatrix.shape)

import numpy as np
with open(f"../data/korean-all-genres-dic.pkl", "rb") as f:
    genres_dic = pickle.load(f)
print(len(genres_dic))

(43925, 2857)
2855


In [148]:
print(KorBookMatrix[0,0])
print(KorBookMatrix[0,2850])
print(KorBookMatrix[0,2856])
print(myHash("김훈"))

0
0
765562941
765562941


In [331]:
GENRE_DIM = 2855

# genre만 받았을 경우

### 일반적인 content based search

In [322]:
from numpy.linalg import norm

def cos_similarity(x :np.array, y :np.array):
    return ( (np.dot(x,y))/(norm(x)*norm(y)) )

In [335]:
def linear_search(KorBookMatrix, userProfile, topNum, genre_dim):
    similarity = map(
        lambda bookProfile: \
            (int(bookProfile[0]),cos_similarity(bookProfile[1:genre_dim+1], userProfile[0,1:]))
            ,KorBookMatrix)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [t[0] for t in best]
    # return [t for t in best]


In [336]:

# test genres
genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]

# 0을 남겨둠
userProfile = np.zeros((1,len(genres_dic)+1))
for genre in genres:
    userProfile[0,genres_dic[genre]] = 1.

for i in range(len(userProfile[0])):
    if userProfile[0,i] == 1.0:
        print(i,end=", ")

5, 27, 39, 287, 1010, 

In [337]:
start = time.time()
print(linear_search(KorBookMatrix, userProfile, 100, GENRE_DIM)[:10])
print("="*100)
print(time.time()-start,"(second)")

[0, 42, 129, 184, 198, 202, 205, 216, 246, 306]
1.0172805786132812 (second)


In [326]:
iterNum = 10
sizes = np.random.randint(low=1, high=10, size=iterNum)
addTime = 0
for i, size in enumerate(sizes):
    r = np.random.randint(low=1, high=2855, size=size)
    userProfile = np.zeros((1,len(genres_dic)+1))
    userProfile[0,r] = 1.0
    start = time.time()
    best = linear_search(KorBookMatrix, userProfile, 100)
    addTime += time.time()-start
    print(f"iter{i+1}|",best[:10])

print("="*100)
print("average time:",addTime/iterNum)

iter1| [4473, 4591, 4787, 6591, 4848, 51489, 11255, 28714, 25019, 0]
iter2| [37489, 37616, 46904, 46905, 46906, 46907, 46910, 46911, 46914, 46916]
iter3| [8397, 8419, 8434, 34954, 35024, 35092, 35122, 8423, 8452, 8454]
iter4| [15375, 15400, 15405, 15448, 15460, 15478, 15495, 15506, 15507, 15508]
iter5| [30211, 30212, 30228, 30232, 30233, 30238, 30239, 30242, 30250, 30257]
iter6| [54524, 54560, 38411, 38415, 38458, 38473, 38504, 38628, 38656, 54547]
iter7| [51489, 29420, 29625, 0, 1, 2, 3, 4, 5, 6]
iter8| [17368, 17370, 17371, 17372, 17373, 17375, 17376, 17377, 17380, 17384]
iter9| [38926, 38942, 38951, 38952, 39038, 39100, 39154, 39490, 39527, 38932]
iter10| [34906, 51950, 51957, 51990, 51996, 52020, 52078, 52133, 52142, 25562]
average time: 0.9456707239151001


### LSH를 통한 content based search

In [327]:
# k == the number of "row"
# L == the number of "band"
DIMENSION = 2855

def create_function(dimensions, thresholds):
    def f(v):
        boolarray = [v[dimensions[i]] >= thresholds[i] for i in range(len(dimensions))]
        return "".join(map(str, map(int, boolarray)))
    return f


def create_functions(k, L, num_dimensions = DIMENSION, min=0, max=1):
    functions = []
    for i in range(L):
        dimensions = np.random.randint(low = 0,
                                        high = num_dimensions,
                                        size = k)
        thresholds = np.random.randint(low = min,
                                        high = max+1,
                                        size = k)
        functions.append(create_function(dimensions, thresholds))
    return functions


# 하나의 shingling에 대한 hash_vector 추출
def hash_vector(functions, v):
    return np.array([f(v) for f in functions])


def hash_data(functions, A):
    # v(shape:(400,)) is a each row of A(matrix)(shape:(59499, 400))
    return np.array(list(map(lambda v: hash_vector(functions, v), A)))


# 하나의 band라도 맞으면 filtering됨.
def get_candidates(hashed_A, hashed_point):
    return filter(lambda i: any(hashed_point == hashed_A[i]), range(len(hashed_A)))


# return == (hash functions, signature matrix of A)
def lsh_setup(A, k = 20, L = 70):
    functions = create_functions(k = k, L = L)
    hashed_A = hash_data(functions, A)
    return (functions, hashed_A)


def lsh_search(A, hashed_A, functions, userProfile, topNum = 10):
    # A == KorBookProfile
    # hashed_A == signature matrix of KorBookProfile
    # 원래 데이터 vector에서 hash-vector 추출
    hashed_userProfile = hash_vector(functions, userProfile[0])
    candidate_row_nums = get_candidates(hashed_A, hashed_userProfile)
    similarity = map(lambda r: (A[r,0], cos_similarity(A[r,1:DIMENSION+1], userProfile[0,1:])), candidate_row_nums)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [int(t[0]) for t in best]
    # return [t for t in best]

In [328]:
A = KorBookMatrix[:,1:2856]
print(A.shape)
funcs, hashed_A = lsh_setup(A, k=1, L=1)
print(hashed_A.shape)

(43925, 2855)
(43925, 1)


In [329]:
# test genres
genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]

# 0을 남겨둠
userProfile = np.zeros((1,len(genres_dic)+1))
for genre in genres:
    userProfile[0,genres_dic[genre]] = 1.

for i in range(len(userProfile[0])):
    if userProfile[0,i] == 1.0:
        print(i,end=", ")

print()

5, 27, 39, 287, 1010, 


In [321]:
start = time.time()
print(lsh_search(KorBookMatrix, hashed_A, funcs, userProfile, 100)[:10])
print("="*100)
print(time.time()-start,"(second)")

[0, 42, 129, 184, 198, 202, 205, 216, 246, 306]
1.1160027980804443 (second)


In [141]:
x = np.array(lsh_search(KorBookMatrix, hashed_A, funcs, userProfile, 100))
y = np.array(linear_search(KorBookMatrix, userProfile, 100))
print("error:",cos_similarity(x, y))

error: 0.9999999999999998


In [330]:
iterNum = 10
sizes = np.random.randint(low=1, high=10, size=iterNum)
addTime = 0
# avgErr = 0
for i, size in enumerate(sizes):
    r = np.random.randint(low=1, high=DIMENSION, size=size)
    userProfile = np.zeros((1,len(genres_dic)+1))
    userProfile[0,r] = 1.0
    start = time.time()
    best = lsh_search(KorBookMatrix, hashed_A, funcs, userProfile, 100)
    addTime += time.time()-start
    x = np.array(best)
    y = np.array(linear_search(KorBookMatrix, userProfile, 100))
    print(f"iter{i+1}|",best[:10])
    print(f"iter{i+1}| similarity: ",cos_similarity(x, y))
    print("-"*50)

print("="*50)
print("average time:",addTime/iterNum)

iter1| [57399, 57400, 57401, 57402, 57403, 57404, 57405, 57406, 57407, 57409]
iter1| similarity:  1.0
--------------------------------------------------
iter2| [27263, 27264, 27292, 27294, 27305, 27346, 33907, 33909, 33910, 33912]
iter2| similarity:  1.0000000000000002
--------------------------------------------------
iter3| [13399, 13617, 33232, 33286, 33306, 33320, 33324, 33325, 33356, 50036]
iter3| similarity:  1.0
--------------------------------------------------
iter4| [10630, 10668, 50712, 42907, 42910, 42918, 42920, 42922, 42923, 42931]
iter4| similarity:  1.0
--------------------------------------------------
iter5| [10402, 10406, 10445, 10647, 10651, 10686, 10689, 10704, 10849, 13381]
iter5| similarity:  1.0000000000000002
--------------------------------------------------
iter6| [27667, 27672, 27679, 27692, 27698, 27701, 27712, 27719, 27731, 27740]
iter6| similarity:  1.0000000000000002
--------------------------------------------------
iter7| [18214, 17870, 17871, 17876, 1

# book과 genre를 같이 받았을 경우

In [None]:
import hashlib
def myHash(s):
    if s == None:
        return 0
    else:
        return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 9)

In [153]:
print(type(KorBookMatrix[0,1]))
print((KorBookMatrix[0,1]))

<class 'numpy.int64'>
0


### book과 genre 모두를 각각 vector로 하여 user profile을 matrix으로 만들어 search 

In [156]:
# from numpy.linalg import norm

# def mySimilarity(x :np.array, user :np.array):
#     similarity = 0
#     for i in user:
#         similarity += cos_similarity(i,x)
#     return ( similarity/user.shape[0] )

In [204]:
# # test setting
# genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]
# data = [
#     {
#         "author":"정지아",
#         "id":1,
#         "genres":["소설", "한국소설","한국소설일반"]
#     },
#     {
#         "author":"이미예",
#         "id":8,
#         "genres":["소설", "한국소설","판타지소설","장르소설"]
#     }
# ]

# # 0을 남겨둠
# usrPro01 = np.zeros((1, len(genres_dic)+2))
# for genre in genres:
#     usrPro01[0,genres_dic[genre]] = 1.

# userProfile = [usrPro01]
# for one in data:
#     temp = np.zeros((1, len(genres_dic)+2))
#     for genre in one["genres"]:
#         temp[0,genres_dic[genre]] = 1.
#     temp[0,0] = (one["id"])
#     temp[0,-1] = myHash(one["author"])
#     userProfile.append(temp)

# userProfile = np.concatenate(userProfile)
# print(userProfile.shape)
# print(userProfile[0,-1])
# print(userProfile[1,-1])
# print(myHash("정지아"))
# print(userProfile[2,-1])
# print(myHash("이미예"))

(3, 2857)
0.0
470449543.0
470449543
514441238.0
514441238


In [206]:
# print(userProfile[1,-1])
# print(myHash("정지아"))
# if userProfile[1,-1]==(myHash("정지아")):
#     print("true")

470449543.0
470449543
true


In [197]:
# temp = userProfile[:,1:]
# print(temp.shape)
# print(temp[0,-1])
# print(temp[1,-1])

(3, 2856)
0.0
470449543.0


In [198]:
# def linear_search(KorBookMatrix, userProfile, topNum):
#     similarity = map(
#         lambda bookProfile: \
#             (int(bookProfile[0]),mySimilarity(bookProfile[1:], userProfile[:,1:]))
#             ,KorBookMatrix)
#     best = sorted(similarity, key=lambda t: -t[1])[:topNum]
#     return [t[0] for t in best]
#     # return [t for t in best]


In [199]:
# ## 너무 오래 걸림
# start = time.time()
# print(linear_search(KorBookMatrix, userProfile, 100)[:10])
# print("="*100)
# print(time.time()-start,"(second)")

  after removing the cwd from sys.path.


KeyboardInterrupt: 

### book과 genre에 대한 정보를 하나로 합쳐 하나의 vector를 가지고 search

In [319]:
# test setting
genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]
data = [
    {
        "title":"아버지의 해방일지",
        "author":"정지아",
        "id":1,
        "genres":["소설", "한국소설","한국소설일반"]
    },
    {
        "title":"달러구트 꿈 백화점",
        "author":"이미예",
        "id":8,
        "genres":["소설", "한국소설","판타지소설","장르소설"]
    }
]

# 0을 남겨둠
genreVec = np.zeros((len(genres_dic)))
for genre in genres:
    genreVec[genres_dic[genre]] = 1.

idVec = []
authorVec = []
for one in data:
    for genre in one["genres"]:
        print(genres_dic[genre]-1)
        genreVec[genres_dic[genre]-1] += 1.
    idVec.append(one["id"])
    authorVec.append(myHash(one["author"]))

userProfile = [idVec, genreVec, authorVec]
print(type(userProfile))
print()
print(len(userProfile[0]))
print(userProfile[0])
print()
print(userProfile[1][5])
print(userProfile[1].shape)
print(userProfile[1].sum())
print()
print(len(userProfile[2]))
print(userProfile[2])

4
38
123
4
38
304
26
<class 'list'>

2
[1, 8]

1.0
(2855,)
12.0

2
[470449543, 514441238]


In [312]:
print(userProfile[1].shape)

(2855,)


In [313]:
print(KorBookMatrix[1,0])
print(userProfile[0])
print(KorBookMatrix[1,0]==userProfile[0])
print(any(KorBookMatrix[1,0]==userProfile[0]))
print()
print(KorBookMatrix[1,-1])
print(userProfile[2])
print(KorBookMatrix[1,-1]==userProfile[2])
print(any(KorBookMatrix[1,-1]==userProfile[2]))
print()
print(KorBookMatrix[2,-1])
print(userProfile[2])
print(KorBookMatrix[2,-1]==userProfile[2])
print(any(KorBookMatrix[2,-1]==userProfile[2]))


1
[1, 8]
[ True False]
True

470449543
[470449543, 514441238]
[ True False]
True

543238319
[470449543, 514441238]
[False False]
False


In [314]:
# temp = np.zeros((1, len(genres_dic)+1))
# print(temp[0,userProfile[1]])
from numpy.linalg import norm
from math import sqrt
x = np.array([0,0,1,0])
y = np.array([2.,0,1.,0])
print(np.dot(x,y))
# print(y[:-1])
# print((y>0).sum())
# print(norm((y>0)))
# print(sqrt(2))
# z = np.ones(tuple(y),dtype=bool)
# print(z)
print(userProfile[1].sum())
print((userProfile[1]>0).sum())

1.0
12.0
10


In [315]:
from numpy.linalg import norm


def mySimilarity(x :np.array, user :np.array):
    # print(user[2])
    # print(x[-1])
    if(any(x[0]==user[0])):
        return 0
    elif(any(x[-1]==user[2])):
        # print("001")
        # print(np.dot(x[:-1],user[1]))
        return 1.3*((np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))))
    else:
        # print("002")
        return ( (np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))) )

In [316]:
print(mySimilarity(KorBookMatrix[1,:],userProfile)) # 같은 북이 있어서 0이다.
print(cos_similarity(KorBookMatrix[1,1:2856],userProfile[1]))
print()

print(mySimilarity(KorBookMatrix[2,:],userProfile))
print(cos_similarity(KorBookMatrix[2,1:2856],userProfile[1]))
print()

print(KorBookMatrix[1,1:-1].shape)
print(KorBookMatrix[1,1:-1][4])
print(userProfile[1].shape)
print(userProfile[1][4])

0
0.7216878364870323

0.9128709291752769
0.7216878364870323

(2855,)
1
(2855,)
2.0


In [317]:
from numpy.linalg import norm

def mySimilarity(x :np.array, user :np.array):
    if(any(x[0]==user[0])):
        return 0
    elif(any(x[-1]==user[2])):
        return 1.3*((np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))))
    else:
        return ( (np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))) )

def my_search(KorBookMatrix, userProfile, topNum):
    similarity = map(
        lambda bookProfile: \
            (int(bookProfile[0]),mySimilarity(bookProfile, userProfile))
            ,KorBookMatrix)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [t[0] for t in best]
    # return [t for t in best]


In [318]:
start = time.time()
print(my_search(KorBookMatrix, userProfile, 100)[:10])
print("="*100)
print(time.time()-start,"(second)")

[11, 77, 87, 170, 74, 147, 213, 220, 262, 471]
1.6823859214782715 (second)
