In [6]:
import hashlib

def myHash(s):
    if s == None:
        return 0
    else:
        return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 9)

In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext

# create the Spark Session
spark = SparkSession.builder.appName("spark").getOrCreate()

# create the Spark Context
sc = spark.sparkContext



In [140]:
####################################
# [0] = book unique id -> 1
# [1:2856] = genre vector -> 2848
# [2856] = author -> 1
# 하나의 book profile 크기 = 2850
####################################
GENRE_DIM = 2848
import time
import pickle
with open("../data/KorBookMatrix.pkl", "rb") as f:
    KorBookMatrix = pickle.load(f)
print(KorBookMatrix.shape)

import numpy as np
with open(f"../data/korean-all-genres-dic.pkl", "rb") as f:
    genres_dic = pickle.load(f)
print(len(genres_dic))

(44909, 2850)
2848


In [14]:
print(KorBookMatrix[0,0])
print(KorBookMatrix[0,2844])
print(KorBookMatrix[0,2849])
print(myHash("김훈"))

0
0
765562941
765562941


# only get "genre" from user.

### general "content based linear search"

In [15]:
from numpy.linalg import norm

def cos_similarity(x :np.array, y :np.array):
    return ( (np.dot(x,y))/(norm(x)*norm(y)) )

In [16]:
def linear_search(KorBookMatrix, userProfile, topNum, genre_dim):
    similarity = map(
        lambda bookProfile: \
            (int(bookProfile[0]),cos_similarity(bookProfile[1:genre_dim+1], userProfile[0,1:]))
            ,KorBookMatrix)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [t[0] for t in best]
    # return [t for t in best]


In [17]:

# test genres
genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]

# 0을 남겨둠
userProfile = np.zeros((1,len(genres_dic)+1))
for genre in genres:
    userProfile[0,genres_dic[genre]] = 1.

for i in range(len(userProfile[0])):
    if userProfile[0,i] == 1.0:
        print(i,end=", ")

5, 27, 40, 282, 1001, 

In [18]:
start = time.time()
print(linear_search(KorBookMatrix, userProfile, 100, GENRE_DIM)[:10])
print("="*100)
print(time.time()-start,"(second)")

[0, 42, 122, 129, 184, 198, 202, 205, 216, 246]
1.0399692058563232 (second)


In [21]:
iterNum = 20
sizes = np.random.randint(low=1, high=10, size=iterNum)
addTime = 0
for i, size in enumerate(sizes):
    r = np.random.randint(low=1, high=GENRE_DIM, size=size)
    userProfile = np.zeros((1,len(genres_dic)+1))
    userProfile[0,r] = 1.0
    start = time.time()
    best = linear_search(KorBookMatrix, userProfile, 100, GENRE_DIM)
    addTime += time.time()-start
    print(f"iter{i+1}|",best[:10])

print("="*100)
print("average time:",addTime/iterNum) # linear search average time

iter1| [14877, 14910, 14912, 14928, 14930, 14937, 14944, 14964, 14968, 14970]
iter2| [46654, 46655, 46656, 46657, 46658, 46660, 46661, 46662, 46663, 46664]
iter3| [11372, 11373, 11377, 11394, 11407, 11410, 11414, 11418, 11420, 11431]
iter4| [31421, 31424, 31443, 31471, 31485, 31489, 31492, 31503, 31505, 31515]
iter5| [55662, 55669, 55674, 55675, 55679, 55680, 55681, 55694, 55696, 55704]
iter6| [54909, 54912, 54918, 54926, 54933, 54938, 54946, 54948, 54956, 54960]
iter7| [35409, 35415, 35468, 35520, 35540, 35541, 35594, 27428, 27429, 27433]
iter8| [11367, 12369, 12370, 12371, 12373, 12374, 12375, 12376, 12382, 12384]
iter9| [24941, 0, 1, 2, 6, 7, 12, 13, 14, 15]
iter10| [4370, 4371, 4372, 4373, 4374, 4375, 4376, 4377, 4378, 4380]
iter11| [13397, 13404, 13411, 13416, 13417, 13420, 13422, 13425, 13428, 13432]
iter12| [22926, 22936, 22964, 22966, 23006, 23015, 23024, 23046, 23056, 23071]
iter13| [24916, 24919, 24936, 24937, 24941, 24947, 24948, 24959, 24969, 24974]
iter14| [28945, 28959, 2

### LSH를 통한 content based search

In [22]:
# k == the number of "row"
# L == the number of "band"
def create_function(dimensions, thresholds):
    def f(v):
        boolarray = [v[dimensions[i]] >= thresholds[i] for i in range(len(dimensions))]
        return "".join(map(str, map(int, boolarray)))
    return f


def create_functions(k, L, num_dimensions = GENRE_DIM, min=0, max=1):
    functions = []
    for i in range(L):
        dimensions = np.random.randint(low = 0,
                                        high = num_dimensions,
                                        size = k)
        thresholds = np.random.randint(low = min,
                                        high = max+1,
                                        size = k)
        functions.append(create_function(dimensions, thresholds))
    return functions


# 하나의 shingling에 대한 hash_vector 추출
def hash_vector(functions, v):
    return np.array([f(v) for f in functions])


def hash_data(functions, A):
    # v(shape:(400,)) is a each row of A(matrix)(shape:(59499, 400))
    return np.array(list(map(lambda v: hash_vector(functions, v), A)))


# 하나의 band라도 맞으면 filtering됨.
def get_candidates(hashed_A, hashed_point):
    return filter(lambda i: any(hashed_point == hashed_A[i]), range(len(hashed_A)))


# return == (hash functions, signature matrix of A)
def lsh_setup(A, k = 20, L = 70):
    functions = create_functions(k = k, L = L)
    hashed_A = hash_data(functions, A)
    return (functions, hashed_A)


def lsh_search(A, hashed_A, functions, userProfile, topNum = 10):
    # A == KorBookProfile
    # hashed_A == signature matrix of KorBookProfile
    # 원래 데이터 vector에서 hash-vector 추출
    hashed_userProfile = hash_vector(functions, userProfile[0])
    candidate_row_nums = get_candidates(hashed_A, hashed_userProfile)
    similarity = map(lambda r: (A[r,0], cos_similarity(A[r,1:GENRE_DIM+1], userProfile[0,1:])), candidate_row_nums)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [int(t[0]) for t in best]
    # return [t for t in best]

In [23]:
A = KorBookMatrix[:,1:GENRE_DIM]
print(A.shape)
funcs, hashed_A = lsh_setup(A, k=1, L=1)
print(hashed_A.shape)

(43511, 2847)
(43511, 1)


In [24]:
# test genres
genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]

# 0을 남겨둠
userProfile = np.zeros((1,len(genres_dic)+1))
for genre in genres:
    userProfile[0,genres_dic[genre]] = 1.

for i in range(len(userProfile[0])):
    if userProfile[0,i] == 1.0:
        print(i,end=", ")

print()

5, 27, 40, 282, 1001, 


In [25]:
start = time.time()
print(lsh_search(KorBookMatrix, hashed_A, funcs, userProfile, 100)[:10])
print("="*100)
print(time.time()-start,"(second)")

[0, 42, 122, 129, 184, 198, 202, 205, 216, 246]
1.0741381645202637 (second)


In [27]:
x = np.array(lsh_search(KorBookMatrix, hashed_A, funcs, userProfile, 100))
y = np.array(linear_search(KorBookMatrix, userProfile, 100, GENRE_DIM))
print("error:",cos_similarity(x, y))

error: 1.0


In [29]:
iterNum = 10
sizes = np.random.randint(low=1, high=10, size=iterNum)
addTime = 0
# avgErr = 0
for i, size in enumerate(sizes):
    r = np.random.randint(low=1, high=GENRE_DIM, size=size)
    userProfile = np.zeros((1,len(genres_dic)+1))
    userProfile[0,r] = 1.0
    start = time.time()
    best = lsh_search(KorBookMatrix, hashed_A, funcs, userProfile, 100)
    addTime += time.time()-start
    x = np.array(best)
    y = np.array(linear_search(KorBookMatrix, userProfile, 100, GENRE_DIM))
    print(f"iter{i+1}|",best[:10])
    print(f"iter{i+1}| similarity: ",cos_similarity(x, y))
    print("-"*50)

print("="*50)
print("average time:",addTime/iterNum) # lsh search average time

iter1| [55564, 7404, 7467, 7508, 7627, 7634, 7661, 7867, 42910, 42940]
iter1| similarity:  1.0000000000000002
--------------------------------------------------
iter2| [30918, 30943, 30956, 30969, 30983, 31034, 31054, 50659, 50672, 50688]
iter2| similarity:  1.0000000000000002
--------------------------------------------------
iter3| [42283, 43163, 43248, 43283, 41663, 40158, 40165, 40239, 40245, 40252]
iter3| similarity:  1.0
--------------------------------------------------
iter4| [54682, 54741, 54747, 54750, 54762, 54765, 54770, 54775, 54785, 54791]
iter4| similarity:  1.0
--------------------------------------------------
iter5| [14870, 14871, 14872, 14877, 14882, 14883, 14887, 14891, 14893, 14894]
iter5| similarity:  0.9999999999999998
--------------------------------------------------
iter6| [28763, 28772, 28789, 28824, 28857, 28876, 28891, 34712, 8298, 28696]
iter6| similarity:  0.9999999999999998
--------------------------------------------------
iter7| [48654, 48655, 48656, 4

# book과 genre를 같이 받았을 경우

In [99]:
import hashlib
def myHash(s):
    if s == None:
        return 0
    else:
        return int(hashlib.sha1(s.encode("utf-8")).hexdigest(), 16) % (10 ** 9)

### book과 genre 모두를 각각 vector로 하여 user profile을 matrix으로 만들어 search 

In [101]:
def multi_linear_search(ForBookMatrix, userProfile, topNum, genre_dim):
    similarity = map(
        lambda bookProfile: (
            int(bookProfile[0]),
            np.max([ cos_similarity(bookProfile[1:genre_dim+1],userProfile[i,1:-1]) for i in range(len(userProfile)) ])
        )
        ,KorBookMatrix
    )
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [t[0] for t in best]
    # return [t for t in best]


In [108]:
# test setting
# data = [
#     {
#         "author":"정지아",
#         "id":1,
#         "genres":["소설", "한국소설","한국소설일반"]
#     },
#     {
#         "author":"이미예",
#         "id":8,
#         "genres":["소설", "한국소설","판타지소설","장르소설"]
#     }
# ]

import sqlite3

tableName = "Korean_book"
con = sqlite3.connect("../Books.db")
cur = con.cursor()

data = []
count = 0
SIZE_OF_USER_PROFILE = 5
rowid_list = np.random.randint(low=1, high=10000, size=100000)
for rowid in rowid_list:
    if count >= SIZE_OF_USER_PROFILE:
        break
    cur.execute(f"SELECT id, author, genres FROM {tableName} WHERE id=={rowid}")
    try:
        temp = cur.fetchall()[0]
    except:
        continue
    temp = {
        "id": temp[0],
        "author": temp[1],
        "genres": temp[2].split(",")[1:]
    }
    data.append(temp)
    count+=1

genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]
print(len(data))
print(count)


# 0을 남겨둠
usrPro01 = np.zeros((1, len(genres_dic)+2))
for genre in genres:
    usrPro01[0,genres_dic[genre]] = 1.

userProfile = [usrPro01]
for one in data:
    temp = np.zeros((1, len(genres_dic)+2))
    for genre in one["genres"]:
        temp[0,genres_dic[genre]] = 1.
    temp[0,0] = (one["id"])
    temp[0,-1] = myHash(one["author"])
    userProfile.append(temp)
userProfile = np.concatenate(userProfile)


print(userProfile.shape)
print()
print([int(userProfile[i,0]) for i in range(len(userProfile))])
print()
print([userProfile[i,-1] for i in range(len(userProfile))])
print()
print([int(userProfile[i,1:-1].sum()) for i in range(len(userProfile))])
print()

5
5
(6, 2850)

[0, 5153, 2675, 1893, 1808, 1246]

[0.0, 611530178.0, 268103269.0, 113137678.0, 132245888.0, 300236986.0]

[5, 6, 4, 3, 4, 4]



In [111]:
### vector가 만아질수록 linear하게 time이 증가한다!
import sqlite3

tableName = "Korean_book"
con = sqlite3.connect("../Books.db")
cur = con.cursor()

data = []
count = 0
list = [i for i in range(1,6)]
rowid_list = np.random.randint(low=1, high=10000, size=100000)

for sizeof_userProfile in list:
    for rowid in rowid_list:
        if count >= sizeof_userProfile:
            break
        cur.execute(f"SELECT id, author, genres FROM {tableName} WHERE id=={rowid}")
        try:
            temp = cur.fetchall()[0]
        except:
            continue
        temp = {
            "id": temp[0],
            "author": temp[1],
            "genres": temp[2].split(",")[1:]
        }
        data.append(temp)
        count+=1
    genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]
    usrPro01 = np.zeros((1, len(genres_dic)+2))
    for genre in genres:
        usrPro01[0,genres_dic[genre]] = 1.

    userProfile = [usrPro01]
    for one in data:
        temp = np.zeros((1, len(genres_dic)+2))
        for genre in one["genres"]:
            temp[0,genres_dic[genre]] = 1.
        temp[0,0] = (one["id"])
        temp[0,-1] = myHash(one["author"])
        userProfile.append(temp)
    userProfile = np.concatenate(userProfile)

    start = time.time()
    print("best: ",multi_linear_search(KorBookMatrix, userProfile, 100, GENRE_DIM)[:10])
    print("average time: ",time.time()-start,"(second)")
    print("shape of userProfile: ",userProfile.shape)
    print("="*100)

best:  [9373, 9375, 9378, 9380, 9382, 9384, 9386, 9387, 9394, 9396]
average time:  2.3276937007904053 (second)
shape of userProfile:  (2, 2850)
best:  [9373, 9375, 9378, 9380, 9382, 9384, 9386, 9387, 9394, 9396]
average time:  3.153485059738159 (second)
shape of userProfile:  (3, 2850)
best:  [9373, 9375, 9378, 9380, 9382, 9384, 9386, 9387, 9394, 9396]
average time:  3.9905173778533936 (second)
shape of userProfile:  (4, 2850)
best:  [9373, 9375, 9378, 9380, 9382, 9384, 9386, 9387, 9394, 9396]
average time:  5.025768518447876 (second)
shape of userProfile:  (5, 2850)
best:  [9373, 9375, 9378, 9380, 9382, 9384, 9386, 9387, 9394, 9396]
average time:  5.791831731796265 (second)
shape of userProfile:  (6, 2850)


### book과 genre에 대한 정보를 하나로 합쳐 하나의 vector를 가지고 search

In [134]:
# test setting
genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]
data = [
    {
        "title":"아버지의 해방일지",
        "author":"정지아",
        "id":1,
        "genres":["소설", "한국소설","한국소설일반"]
    },
    {
        "title":"달러구트 꿈 백화점",
        "author":"이미예",
        "id":8,
        "genres":["소설", "한국소설","판타지소설","장르소설"]
    }
]

# 0을 남겨둠
genreVec = np.zeros((len(genres_dic)))
for genre in genres:
    genreVec[genres_dic[genre]] = 1.

idVec = []
authorVec = []
for one in data:
    for genre in one["genres"]:
        print(genres_dic[genre]-1)
        genreVec[genres_dic[genre]-1] += 1.
    idVec.append(one["id"])
    authorVec.append(myHash(one["author"]))

userProfile = [idVec, genreVec, authorVec]
print(type(userProfile))
print()
print(len(userProfile[0]))
print(userProfile[0])
print()
print(userProfile[1][5])
print(userProfile[1].shape)
print(userProfile[1].sum())
print()
print(len(userProfile[2]))
print(userProfile[2])

4
39
125
4
39
312
26
<class 'list'>

2
[1, 8]

1.0
(2848,)
12.0

2
[470449543, 514441238]


In [135]:
from numpy.linalg import norm

def mySimilarity(x :np.array, user :np.array):
    if(any(x[0]==user[0])):
        return 0
    elif(any(x[-1]==user[2])):
        return 1.3*((np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))))
    else:
        return ( (np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))) )

In [136]:
print(mySimilarity(KorBookMatrix[1,:],userProfile)) # 같은 북이 있어서 0이다.
print(cos_similarity(KorBookMatrix[1,1:GENRE_DIM+1],userProfile[1]))
print()

print(mySimilarity(KorBookMatrix[2,:],userProfile)) # 같은 북이 아니지만 author같아 값이 높아짐.
print(cos_similarity(KorBookMatrix[2,1:GENRE_DIM+1],userProfile[1]))
print()

0
0.7216878364870323

0.9128709291752769
0.7216878364870323



In [137]:
from numpy.linalg import norm

def mySimilarity(x :np.array, user :np.array):
    if(any(x[0]==user[0])):
        return 0
    elif(any(x[-1]==user[2])):
        return 1.3*((np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))))
    else:
        return ( (np.dot(x[1:-1],user[1]))/(norm(x[1:-1])*norm((user[1]>0))) )

def my_search(Matrix, userProfile, topNum):
    similarity = map(
        lambda bookProfile: \
            (int(bookProfile[0]),mySimilarity(bookProfile, userProfile))
            ,Matrix)
    best = sorted(similarity, key=lambda t: -t[1])[:topNum]
    return [t[0] for t in best]
    # return [t for t in best]


In [127]:
### vector가 만아질수록 time이 증가하지 않는다!!!
import sqlite3

tableName = "Korean_book"
con = sqlite3.connect("../Books.db")
cur = con.cursor()

data = []
count = 0
list = [i for i in range(1,6)]
rowid_list = np.random.randint(low=1, high=10000, size=100000)

for sizeof_userProfile in list:
    for rowid in rowid_list:
        if count >= sizeof_userProfile:
            break
        cur.execute(f"SELECT id, author, genres FROM {tableName} WHERE id=={rowid}")
        try:
            temp = cur.fetchall()[0]
        except:
            continue
        temp = {
            "id": temp[0],
            "author": temp[1],
            "genres": temp[2].split(",")[1:]
        }
        data.append(temp)
        count+=1
    genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"] # user에게서 받은 장르라 가정
    genreVec = np.zeros((len(genres_dic)))
    for genre in genres:
        genreVec[genres_dic[genre]] = 1.

    idVec = []
    authorVec = []
    for one in data:
        for genre in one["genres"]:
            genreVec[genres_dic[genre]-1] += 1.
        idVec.append(one["id"])
        authorVec.append(myHash(one["author"]))
    userProfile = [idVec, genreVec, authorVec]

    start = time.time()
    print("original size of userProfile: ", sizeof_userProfile+1)
    print("best: ",my_search(KorBookMatrix, userProfile, 100)[:10])
    print("average time: ",time.time()-start,"(second)")
    print("shape of genreVec: ", genreVec.shape)
    print("="*100)

original size of userProfile:  2
best:  [6548, 6555, 6825, 4960, 4974, 4998, 5002, 5050, 5085, 5123]
average time:  1.6758043766021729 (second)
shape of genreVec:  (2848,)
original size of userProfile:  3
best:  [6548, 6555, 6825, 4960, 4974, 4998, 5002, 5050, 5085, 5123]
average time:  1.5315368175506592 (second)
shape of genreVec:  (2848,)
original size of userProfile:  4
best:  [6548, 6555, 6825, 4960, 4974, 4998, 5002, 5050, 5085, 5123]
average time:  1.5557630062103271 (second)
shape of genreVec:  (2848,)
original size of userProfile:  5
best:  [6548, 6555, 6825, 4960, 4974, 4998, 5002, 5050, 5085, 5123]
average time:  1.5238525867462158 (second)
shape of genreVec:  (2848,)
original size of userProfile:  6
best:  [6548, 6555, 6825, 4960, 4974, 4998, 5002, 5050, 5085, 5123]
average time:  1.5219948291778564 (second)
shape of genreVec:  (2848,)


In [139]:
### 비교
## 확실히 my idea가
import sqlite3

tableName = "Korean_book"
con = sqlite3.connect("../Books.db")
cur = con.cursor()

data = []
count = 0
list = [i for i in range(1,6)]
rowid_list = np.random.randint(low=1, high=10000, size=100000)

for sizeof_userProfile in list:
    for rowid in rowid_list:
        if count >= sizeof_userProfile:
            break
        cur.execute(f"SELECT id, author, genres FROM {tableName} WHERE id=={rowid}")
        try:
            temp = cur.fetchall()[0]
        except:
            continue
        temp = {
            "id": temp[0],
            "author": temp[1],
            "genres": temp[2].split(",")[1:]
        }
        data.append(temp)
        count+=1
    genres = ["소설", "한국소설", "역사/대하소설", "장르소설", "역사소설"]


    genreVec = np.zeros((len(genres_dic)))
    for genre in genres:
        genreVec[genres_dic[genre]] = 1.
    idVec = []
    authorVec = []
    for one in data:
        for genre in one["genres"]:
            genreVec[genres_dic[genre]-1] += 1.
        idVec.append(one["id"])
        authorVec.append(myHash(one["author"]))
    MySearchUserProfile = [idVec, genreVec, authorVec]


    usrPro01 = np.zeros((1, len(genres_dic)+2))
    for genre in genres:
        usrPro01[0,genres_dic[genre]] = 1.
    userProfile = [usrPro01]
    for one in data:
        temp = np.zeros((1, len(genres_dic)+2))
        for genre in one["genres"]:
            temp[0,genres_dic[genre]] = 1.
        temp[0,0] = (one["id"])
        temp[0,-1] = myHash(one["author"])
        userProfile.append(temp)
    LinearUserProfile = np.concatenate(userProfile)

    start = time.time()
    print("best: ",multi_linear_search(KorBookMatrix, LinearUserProfile, 100, GENRE_DIM)[:10])
    print("average time: ",time.time()-start,"(second)")
    print("shape of userProfile: ",LinearUserProfile.shape)
    print("-"*100)
    start = time.time()
    print("best: ",my_search(KorBookMatrix, MySearchUserProfile, 100)[:10])
    print("average time: ",time.time()-start,"(second)")
    print("shape of userProfile: ", genreVec.shape[0]+2)
    print("#"*100)

best:  [2992, 2999, 3007, 3014, 3029, 3054, 3075, 3079, 3086, 3091]
average time:  2.191152811050415 (second)
shape of userProfile:  (2, 2850)
----------------------------------------------------------------------------------------------------
best:  [3173, 3353, 3360, 3365, 3153, 3287, 2992, 2999, 3007, 3014]
average time:  1.5144336223602295 (second)
shape of userProfile:  2850
####################################################################################################
best:  [2992, 2999, 3007, 3014, 3029, 3054, 3075, 3079, 3086, 3091]
average time:  3.085005044937134 (second)
shape of userProfile:  (3, 2850)
----------------------------------------------------------------------------------------------------
best:  [3173, 3353, 3360, 3365, 3153, 3287, 2992, 2999, 3007, 3014]
average time:  1.5324220657348633 (second)
shape of userProfile:  2850
####################################################################################################
best:  [2992, 2999, 3007, 3014, 