# Import PySpark

In [1]:
# Bootstrap PySpark: findspark.init() is pointed at the local Spark
# installation so that the `pyspark` imports below can be resolved.
import findspark
findspark.init('/home/hadoop/spark-2.2.2-bin-hadoop2.7')
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext

# Run as application "miniProject" against the master at spark://master:7077.
conf = SparkConf().setAppName("miniProject").setMaster("spark://master:7077")
# getOrCreate() hands back the context that is already running (if any), so
# re-executing this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)
sqlcontext = SQLContext(sc)

# Load Term Frequency

In [2]:
# Load the stored (title_id, token, count) term-frequency table from HDFS
# and preview its first five rows.
token_count = sqlcontext.read.parquet(
    'hdfs://master:9000/user/hadoop/token_count'
)
token_count.show(5)

+--------+-----+-----+
|title_id|token|count|
+--------+-----+-----+
|  119874|  수집이|    3|
|  119874| 전쟁터가|    3|
|  119874|  푸시고|    3|
|  119874|  미간이|    3|
|  119874|  바다에|    3|
+--------+-----+-----+
only showing top 5 rows



# Calculate Document Frequency

In [3]:
import pyspark.sql.functions as F

# Per-token corpus statistics:
#   DF         - number of distinct title_id values the token occurs in
#   sum(count) - total occurrences of the token (column left unaliased)
frequencies = (
    token_count
    .groupBy('token')
    .agg(
        F.countDistinct('title_id').alias('DF'),
        F.sum('count'),
    )
)
frequencies.show(10)

+-----+---+----------+
|token| DF|sum(count)|
+-----+---+----------+
|  무라로|  4|         5|
|노예대하듯|  1|         1|
|  자시길|  5|         5|
|  불이익|106|       318|
|  대처를| 79|       123|
|  구수한| 21|        22|
|   외면|200|       671|
|  큰행운|  1|         1|
|   것진|  1|         1|
|   보신|273|       953|
+-----+---+----------+
only showing top 10 rows



In [4]:
# |D|: the number of distinct documents (title_id values) in the corpus,
# pulled back to the driver as a plain Python value.
D_count = (
    token_count
    .agg(F.countDistinct('title_id').alias('D_count'))
    .first()['D_count']
)
D_count

618

In [10]:
# Attach each token's document frequency to every (title_id, token, count)
# row and add the corpus size as a constant D_count column, so that all
# inputs of the TF-IDF formula sit side by side in one frame.
freq_frame = (
    token_count
    .join(frequencies, on='token')
    .select('title_id', 'token', 'count', 'DF')
    .withColumn('D_count', F.lit(D_count))
)
freq_frame.show(5)

+--------+-----+-----+---+-------+
|title_id|token|count| DF|D_count|
+--------+-----+-----+---+-------+
|  502673|가게였지만|    1|  1|    618|
|  710751|  가격후|    1|  1|    618|
|  732955|  가계에|    1|  5|    618|
|  727838|  가계에|    1|  5|    618|
|  616239|  가계에|    1|  5|    618|
+--------+-----+-----+---+-------+
only showing top 5 rows



In [12]:
# The raw per-document count is the term frequency; rename it accordingly.
freq_frame = freq_frame.withColumnRenamed("count", "TF")

# TF_IDF = TF * ln(D_count / DF)  (F.log with one argument is the natural log).
# Rows are sorted so that each title's highest-scoring tokens come first.
tf_idf_frame = (
    freq_frame
    .withColumn(
        'TF_IDF',
        F.col('TF') * F.log(F.col('D_count') / F.col('DF')),
    )
    .select('title_id', 'token', 'TF_IDF')
    .orderBy('title_id', F.desc('TF_IDF'))
)
tf_idf_frame.show(5)

+--------+-----+------------------+
|title_id|token|            TF_IDF|
+--------+-----+------------------+
|   15441|  엄친아| 174.0557530848959|
|   15441|   워니|130.06036471563695|
|   15441|  워니님| 115.9244642157694|
|   15441|골방환상곡|113.62293383527297|
|   15441|  포켓몬|59.750041766894654|
+--------+-----+------------------+
only showing top 5 rows



In [15]:
from pyspark.sql.window import Window
# One window per title, tokens ordered by descending TF-IDF. title_id is
# constant inside a partition, so it is not needed as a sort key.
window = Window.partitionBy('title_id').orderBy(F.desc('TF_IDF'))

# Keep the rows ranked 1..100 within each title.
# NOTE: rank() assigns the same rank to equal TF_IDF values, so a title can
# retain more than 100 rows when scores tie at or before the cutoff.
trunked_tf_idf_frame = (
    tf_idf_frame
    .withColumn('rank', F.rank().over(window))
    .where(F.col('rank') <= 100)
)

trunked_tf_idf_frame.show()

+--------+-----+-------------------+----+
|title_id|token|             TF_IDF|rank|
+--------+-----+-------------------+----+
|   25517|  감찰국| 3.6538997352179097|   1|
|   25517|  벗으며| 3.3354460040993748|   2|
|   25517|   고라| 2.6198259676873707|   3|
|   25517|  찹쌀떡| 2.5978470609685957|   4|
|   25517|  모른단| 2.5552874465497997|   5|
|   25517|   있대| 2.0957551171713598|   6|
|   25517|   식스| 1.7630493633456237|   7|
|   25517|  맙소사| 1.5589540070021084|   8|
|   25517|   대장| 1.2617024835341768|   9|
|   25517|   나야| 1.0651962917482658|  10|
|   25517|  참신한| 1.0328609111053293|  11|
|   25517|  누군데| 1.0238110755854113|  12|
|   25517|  누군지| 0.5156918134171636|  13|
|   25517|   슬프|0.48106784885111564|  14|
|   25517|   센스| 0.4450742462032102|  15|
|   25517|   녀석|0.43252703015112154|  16|
|   25517|   화가|0.28844342112336857|  17|
|   25517|   죽을|0.22597928341500068|  18|
|   25517|   그게|0.18997321512799922|  19|
|   25517|  아직도|0.17265964588221772|  20|
+--------+-----+-------------------+----+
only showing top 20 rows

In [17]:
# Persist the per-title top-ranked TF-IDF rows in both Parquet and CSV form.
# mode('overwrite') replaces the output of a previous run: with the default
# save mode the write raises an error when the target path already exists,
# which made this cell fail whenever the notebook was re-run.
# Each write is a separate action and re-evaluates the frame's lineage.
trunked_tf_idf_frame.write.mode('overwrite').parquet('hdfs://master:9000/user/hadoop/tf_idf')
trunked_tf_idf_frame.write.mode('overwrite').csv('hdfs://master:9000/user/hadoop/tf_idf.csv')

In [18]:
# Vocabulary size: the number of distinct tokens across the whole corpus.
# (This is a term count, not the document count |D| held in D_count.)
Term_count = (
    token_count
    .agg(F.countDistinct('token').alias('Term_count'))
    .collect()[0]
    .Term_count
)
Term_count

1653840

# DF 기반 재분석 (Re-analysis based on DF)

In [31]:
# Keep only tokens whose document frequency lies strictly between 62 and 492.
# NOTE(review): the bounds are hard-coded; presumably they were chosen
# relative to the 618 documents in the corpus - confirm, and derive them
# from D_count if so.
# show() only prints and returns None, so it must not be chained into the
# assignment: bind the filtered DataFrame first, then display it.
frequencies = frequencies.filter(' 62 < DF and DF  < 492 ')
frequencies.show()

+-------+---+----------+
|  token| DF|sum(count)|
+-------+---+----------+
|     목줄| 74|       339|
|     외면|200|       671|
|    이뻐요|174|       554|
|     짤리|131|       335|
|    죽여도|173|       600|
|     있엌|218|       741|
|     상하|101|       220|
|  기가막히게| 81|       123|
|     기적|130|       395|
|    즐거운|241|      1070|
|존경스럽습니다|105|       265|
|    반복하|139|       277|
|     맞게|322|      1465|
|     쉬지|153|       406|
|    중심적| 71|       111|
|   처절하게| 69|       150|
|     지름|127|       246|
|    바지에|214|       693|
|     왔니| 73|       122|
|    쿨하게|170|       441|
+-------+---+----------+
only showing top 20 rows

