In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
import pandas as pd

# Build the notebook's SparkSession (getOrCreate returns the existing one if present).
spark = (
    SparkSession.builder
    .appName("spark")
    .getOrCreate()
)

# SparkContext of that session, used below for the RDD API.
sc = spark.sparkContext



## check the total number of genres

In [2]:
import sqlite3
from contextlib import closing

# Fetch the `genres` column of every row in the Foreign_book table.
# closing() guarantees the connection is released even if the query raises;
# the original left the connection open for the lifetime of the kernel.
with closing(sqlite3.connect("../Books.db")) as con:
    cur = con.cursor()
    cur.execute("SELECT genres FROM Foreign_book")
    data = cur.fetchall()  # list of 1-tuples: one (genres,) tuple per row

# Quick look at the size and shape of what came back.
print(len(data))
print(type(data))
print(type(data[0]))
print(data[0])

52478
<class 'list'>
<class 'tuple'>
('Young Adult, Fiction, Dystopia, Fantasy, Science Fiction, Romance, Adventure, Teen, Post Apocalyptic, Action',)


In [28]:
# Distribute the fetched rows and unwrap each 1-tuple to its genre string.
rows_rdd = sc.parallelize(data)
rdd = rows_rdd.map(lambda row: row[0])

first_row = rdd.take(1)
print(first_row)

['Young Adult, Fiction, Dystopia, Fantasy, Science Fiction, Romance, Adventure, Teen, Post Apocalyptic, Action']


## select representative genres

In [4]:
# Split every ", "-separated genre string into a tuple of genre names.
# A row whose genre string is empty becomes ('',), so '' shows up as a "genre" downstream.
rdd1 = rdd.map(lambda genre_str: tuple(genre_str.split(", ")))

sample_row = rdd1.take(1)
print(sample_row)
row_total = rdd1.count()
print(row_total)

[('Young Adult', 'Fiction', 'Dystopia', 'Fantasy', 'Science Fiction', 'Romance', 'Adventure', 'Teen', 'Post Apocalyptic', 'Action')]
52478


In [5]:
# Flatten the per-book tuples into a single RDD of individual genre names.
rdd2 = rdd1.flatMap(lambda genre_tuple: genre_tuple)

leading_genres = rdd2.take(5)
print(leading_genres)
genre_mentions = rdd2.count()
print(genre_mentions)

['Young Adult', 'Fiction', 'Dystopia', 'Fantasy', 'Science Fiction']
412341


In [6]:
from collections import Counter
from itertools import islice

## Doing this with reduceBy was too slow, so the tally is done in plain Python.
# Counter replaces the hand-written if/else tally (and the unused enumerate index).
# It keeps keys in first-seen order, so the preview below prints the same pairs.
genre_dic = dict(Counter(rdd2.collect()))

print("total genres:", len(genre_dic))

# Preview up to five (genre, count) pairs; islice simply stops early if there
# are fewer than five, where repeated next() calls would raise StopIteration.
for genre_count in islice(genre_dic.items(), 5):
    print(genre_count)

total genres: 983
('Young Adult', 11869)
('Fiction', 31638)
('Dystopia', 1692)
('Fantasy', 15046)
('Science Fiction', 5374)


In [24]:
from itertools import islice

TOP_N = 30  # number of most frequent genres to display

# Genres ordered by descending frequency. sorted() is stable even with
# reverse=True, so genres with equal counts keep their original relative order.
temp = dict(sorted(genre_dic.items(), key=lambda item: item[1], reverse=True))

# islice over items() replaces the manual counter/break and the redundant
# zip(keys(), values()); each printed element is still a (genre, count) tuple.
for genre_count in islice(temp.items(), TOP_N):
    print(genre_count)

# temp = {"name":"hae", "age":12}
# print(temp.items())
# print(temp)

('Fiction', 31638)
('Romance', 15495)
('Fantasy', 15046)
('Young Adult', 11869)
('Contemporary', 10520)
('Nonfiction', 8251)
('Adult', 8246)
('Novels', 7805)
('Mystery', 7702)
('Historical Fiction', 7665)
('Audiobook', 7307)
('Classics', 6902)
('Adventure', 6452)
('Historical', 6383)
('Paranormal', 6030)
('Literature', 5836)
('Science Fiction', 5374)
('Childrens', 5226)
('', 4623)
('Thriller', 4587)
('Magic', 4248)
('Humor', 4227)
('History', 3685)
('Crime', 3675)
('Contemporary Romance', 3624)
('Suspense', 3474)
('Urban Fantasy', 3458)
('Middle Grade', 3389)
('Chick Lit', 3358)
('Science Fiction Fantasy', 3302)


In [12]:
# Currently holds only a single empty string.
# NOTE(review): looks like a placeholder for the hand-picked genres — confirm.
compressed_genre = [""]

# Print the first 25 (genre, count) pairs of the frequency-sorted `temp` mapping.
# NOTE(review): the saved output under this cell is a different dict, so it appears stale.
count = 0
for i in temp.items():
    if count >= 25:
        break
    print(i)
    count += 1

{'min': 2, 'sung': 2, 'jae': 1, 'han': 1, 'Lee': 1}


In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three toy sentences for trying out TF-IDF weighting.
corpus = [
    'you know I want your love',
    'I like you',
    'what should I do ',
]

# Fit the vectorizer on the corpus, then show the dense TF-IDF matrix
# followed by the learned vocabulary mapping.
tfidfv = TfidfVectorizer()
tfidfv.fit(corpus)
tfidf_matrix = tfidfv.transform(corpus)
print(tfidf_matrix.toarray())
print(tfidfv.vocabulary_)

In [22]:
# Read the genres column straight from the CSV rather than from `data`:
# on a top-to-bottom run `data` is still the list of tuples returned by the
# sqlite query, so data["genres"] raises TypeError. The redundant .iloc[:]
# (a full-range slice) is dropped as well.
books_df = pd.read_csv("./data/foreign-books.csv")
genres_list = books_df["genres"].tolist()

In [27]:
# import sqlite3
# conn = sqlite3.connect("../Books.db")
# cur = conn.cursor()
# cur.execute("SELECT genres FROM Foreign_book")
# data = cur.fetchall()
# print(data)
# conn.commit()
# conn.close()

In [23]:
import numpy as np
from scipy.spatial import distance

# Toy vectors for comparing cosine distances.
x, y, z = (
    np.array(values)
    for values in ([10, 0, 1, 1], [9, 1, 0, 0], [3, 0, 1, 1])
)

# Cosine distance for each pair of interest, one result per line.
for left, right in ((x, y), (x, x), (x, z), (y, z)):
    print(distance.cosine(left, right))

0.01590846210843544
0.0
0.04466970555754857
0.10099833641376377


In [28]:
import math
def consine_similarity(x, y):
    """Return the cosine similarity of two 1-D vectors.

    Computed as dot(x, y) / (||x|| * ||y||). The misspelled name is kept so
    existing calls keep working.

    Parameters
    ----------
    x, y : array-like of numbers
        Vectors of equal length. Plain lists/tuples are accepted as well as
        numpy arrays; both are converted to float arrays first.

    Returns
    -------
    float
        The cosine similarity, or NaN when either vector has zero norm
        (the similarity is undefined there).
    """
    # Converting up front lets lists/tuples work (they have no .dot) and
    # keeps the squared sums in floating point.
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    norm_product = np.sqrt((x ** 2).sum()) * np.sqrt((y ** 2).sum())
    if norm_product == 0:
        # Return NaN explicitly instead of triggering a 0/0 RuntimeWarning.
        return np.float64("nan")
    return x.dot(y) / norm_product

# Similarity of x against each of the other demo vectors.
for other in (y, z):
    print(consine_similarity(x, other))
# print(consine_similarity(y,z))


0.9840915378915646
0.9553302944424514


In [35]:
# Small (name, number) sample for trying out RDD -> DataFrame conversion.
dept = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]
temprdd = sc.parallelize(dept)

# No column names are supplied, so the columns show up as _1 and _2.
df = temprdd.toDF()
df.show()

+---------+---+
|       _1| _2|
+---------+---+
|  Finance| 10|
|Marketing| 20|
|    Sales| 30|
|       IT| 40|
+---------+---+



In [6]:
import pandas as pd

# NOTE: this rebinds `data` to a DataFrame; the sqlite cell earlier in the
# notebook bound the same name to a list of tuples.
data = pd.read_csv("./data/foreign-books.csv")

# Fields of the first record.
title = data["title"].iloc[0]

# Strip square brackets and single quotes from the genres value.
# NOTE(review): presumably the column is stored like a Python list literal — confirm.
genres = data["genres"].iloc[0]
for unwanted in ("[", "]", "\'"):
    genres = genres.replace(unwanted, "")

author = data["author"].iloc[0]
rating = data["rating"].iloc[0]
publisher = data["publisher"].iloc[0]
p_date = data["publishDate"].iloc[0]
pages = data["pages"].iloc[0]
language = data["language"].iloc[0]
# These two use label-based lookup ([0]) rather than positional .iloc[0].
description = data["description"][0]
imgUrl = data["coverImg"][0]

# Print every extracted field on its own line, then the number of rows.
for field in (
    title,
    genres,
    author,
    rating,
    publisher,
    p_date,
    pages,
    language,
    description,
    imgUrl,
):
    print(field)
print(len(data))

The Hunger Games
Young Adult, Fiction, Dystopia, Fantasy, Science Fiction, Romance, Adventure, Teen, Post Apocalyptic, Action
Suzanne Collins
4.33
Scholastic Press
09/14/08
374
English
WINNING MEANS FAME AND FORTUNE.LOSING MEANS CERTAIN DEATH.THE HUNGER GAMES HAVE BEGUN. . . .In the ruins of a place once known as North America lies the nation of Panem, a shining Capitol surrounded by twelve outlying districts. The Capitol is harsh and cruel and keeps the districts in line by forcing them all to send one boy and once girl between the ages of twelve and eighteen to participate in the annual Hunger Games, a fight to the death on live TV.Sixteen-year-old Katniss Everdeen regards it as a death sentence when she steps forward to take her sister's place in the Games. But Katniss has been close to dead before—and survival, for her, is second nature. Without really meaning to, she becomes a contender. But if she is to win, she will have to start making choices that weight survival against human