In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
import pandas as pd

# Build the SparkSession for this notebook. getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("spark")
    .getOrCreate()
)

# Keep a handle on the session's SparkContext for RDD-level operations.
sc = spark.sparkContext



## Check the total number of genres and select the representative genres

In [2]:
import sqlite3
from contextlib import closing

# Load the `genres` column of every row in the Korean_book table.
# closing() guarantees the connection is released even if the query raises;
# sqlite3's own `with conn:` only manages the transaction and does not close.
# The statement is a plain SELECT, so there is nothing to commit.
with closing(sqlite3.connect("../Books.db")) as conn:
    check = conn.execute("SELECT genres FROM Korean_book").fetchall()

# Each fetched row is a 1-tuple (single selected column); peek at the first few.
print(check[0:3])


[('국내도서,소설,한국소설,역사/대하소설,장르소설,역사소설',), ('국내도서,소설,한국소설,한국소설일반',), ('국내도서,소설,한국소설,한국소설일반',)]


In [3]:
# Split the comma-separated string in each row of `check` into a list of
# genre names (one list per row).
genre_lists = sc.parallelize(check).map(lambda row: row[0].split(","))
print(genre_lists.count())  # number of rows

# Flatten to one (genre, 1) pair per genre occurrence so the pairs can be
# summed by key later.
rdd = genre_lists.flatMap(lambda names: [(name, 1) for name in names])
print(rdd.take(5))
print(rdd.count())  # total number of genre occurrences over all rows


44749
[('국내도서', 1), ('소설', 1), ('한국소설', 1), ('역사/대하소설', 1), ('장르소설', 1)]
244815


In [27]:
# Sum the per-occurrence 1s into a count per genre. This is done once and
# cached so that both orderings below reuse the same aggregated data instead
# of each running its own reduceByKey over `rdd`.
genre_counts = rdd.reduceByKey(lambda x, y: x + y).cache()
rdd1 = genre_counts.sortBy(lambda x: -x[1])  # descending by count
rdd2 = genre_counts.sortBy(lambda x: x[1])   # ascending by count

# The -1 excludes the '국내도서' entry from the number of distinct genres.
total_genres_num = rdd1.count() - 1
print("total genres:", total_genres_num)

total genres: 2860


In [17]:
top = 30

# Show the first `top` (genre, count) pairs of rdd1 with their position.
# take(top) returns only those leading elements, so the full RDD is not
# collected to the driver and no manual countdown/break is needed.
for i, value in enumerate(rdd1.take(top)):
    print(f"{i}| {value}")




0| ('국내도서', 44749)
1| ('인문', 7794)
2| ('대학교재', 7241)
3| ('경제/경영', 5368)
4| ('외국어', 4323)
5| ('소설', 3211)
6| ('청소년', 3074)
7| ('컴퓨터/IT', 2829)
8| ('시/에세이', 2815)
9| ('기술/공학', 2793)
10| ('예술/대중문화', 2783)
11| ('취미/실용/스포츠', 2562)
12| ('과학', 2415)
13| ('역사/문화', 2300)
14| ('정치/사회', 2277)
15| ('자기계발', 2221)
16| ('요리', 1879)
17| ('건강', 1725)
18| ('종교', 1638)
19| ('컴퓨터', 1458)
20| ('가정/육아', 1091)
21| ('교육학', 1079)
22| ('고전소설/문학선', 917)
23| ('기술공학', 887)
24| ('나라별에세이', 885)
25| ('인물/자전적에세이', 882)
26| ('장르소설', 874)
27| ('중/고등참고서', 863)
28| ('철학', 785)
29| ('인문학일반', 767)


In [26]:
# Positions within rdd1's ordering of the genres to keep as "representative".
# Presumably picked by eye from the top-30 listing printed earlier -- TODO confirm.
genre_idx = [1,2,3,4,5,7,8,9,10,11,12,13,14,15,16,17,18,20,22,28]
print(len(genre_idx))

# Only the first `top` entries are examined, so fetch just those with take()
# instead of collecting the whole RDD and counting down by hand.
top = 30
representative_genres = [
    value[0]
    for i, value in enumerate(rdd1.take(top))
    if i in genre_idx
]

print(representative_genres)
print(len(representative_genres))

# One genre name per line, UTF-8 encoded. newline="\n" disables platform
# newline translation so the bytes on disk are the same on every OS.
# NOTE(review): this writes under "../../data/", while another cell in this
# notebook reads the same file name from "../data/" -- confirm which is right.
with open("../../data/korean-representative-genres.txt", "w", encoding="UTF-8", newline="\n") as f:
    for value in representative_genres:
        f.write(value + "\n")

20
['인문', '대학교재', '경제/경영', '외국어', '소설', '컴퓨터/IT', '시/에세이', '기술/공학', '예술/대중문화', '취미/실용/스포츠', '과학', '역사/문화', '정치/사회', '자기계발', '요리', '건강', '종교', '가정/육아', '고전소설/문학선', '철학']


In [30]:
# Every genre in rdd1 except the entry at position 0 and the positions listed
# in genre_idx, kept in rdd1's order.
others_genres = [
    entry[0]
    for pos, entry in enumerate(rdd1.collect())
    if not (pos == 0 or pos in genre_idx)
]

print(others_genres[0:5])
# The two numbers printed next are meant to be compared by eye:
# (others + representative) versus the total computed earlier.
print(len(others_genres) + len(representative_genres))
print(total_genres_num)

# One genre name per line, UTF-8 encoded.
# NOTE(review): the file is called "all-genres" but only others_genres is
# written here (the representative genres are not included) -- confirm intent.
with open("../../data/korean-all-genres.txt", "wb") as f:
    f.write("".join(name + "\n" for name in others_genres).encode("UTF-8"))

['청소년', '컴퓨터', '교육학', '기술공학', '나라별에세이']
2860
2860


## Practice

In [4]:
# Scratch experiment with RDD.distinct(): the first and third tuples are
# identical, so one of them should be dropped.
temp = sc.parallelize([('이재정', '박희윤'), ("김민형", "이재정"), ("이재정", "박희윤")])
temp.take(1)  # result is not used or displayed (not the cell's last expression)
temp1 = temp.distinct()

# Print each remaining tuple on its own line.
for pair in temp1.collect():
    print(pair)


박희윤
김민형
이상원
이재정


In [58]:
# Read the representative-genre file back as UTF-8 text.
# NOTE(review): this reads from "../data/", while the cell that writes this
# file name uses "../../data/" -- confirm both point at the same file.
with open("../data/korean-representative-genres.txt", "rb") as f:
    raw_text = f.read().decode("UTF-8")

# Split on newlines and drop the final element (the empty string that follows
# the last "\n" when the file ends with a newline).
value = raw_text.split("\n")[:-1]
print(value)
print(len(value))
    

['인문', '대학교재', '경제/경영', '외국어', '소설', '컴퓨터/IT', '시/에세이', '기술/공학', '예술/대중문화', '취미/실용/스포츠', '과학', '역사/문화', '정치/사회', '자기계발', '요리', '건강', '종교', '가정/육아', '고전소설/문학선', '철학']
20
