In [6]:
# List the contents of the /labs/laba01/ml-100k directory on HDFS.
!hdfs dfs -ls /labs/laba01/ml-100k

Found 23 items
-rw-r--r--   3 hdfs hdfs       6750 2020-09-05 20:38 /labs/laba01/ml-100k/README
-rw-r--r--   3 hdfs hdfs        716 2020-09-05 20:38 /labs/laba01/ml-100k/allbut.pl
-rw-r--r--   3 hdfs hdfs        643 2020-09-05 20:38 /labs/laba01/ml-100k/mku.sh
-rw-r--r--   3 hdfs hdfs    1979173 2020-09-05 20:38 /labs/laba01/ml-100k/u.data
-rw-r--r--   3 hdfs hdfs        202 2020-09-05 20:38 /labs/laba01/ml-100k/u.genre
-rw-r--r--   3 hdfs hdfs         36 2020-09-05 20:38 /labs/laba01/ml-100k/u.info
-rw-r--r--   3 hdfs hdfs     236344 2020-09-05 20:38 /labs/laba01/ml-100k/u.item
-rw-r--r--   3 hdfs hdfs        193 2020-09-05 20:38 /labs/laba01/ml-100k/u.occupation
-rw-r--r--   3 hdfs hdfs      22628 2020-09-05 20:38 /labs/laba01/ml-100k/u.user
-rw-r--r--   3 hdfs hdfs    1586544 2020-09-05 20:38 /labs/laba01/ml-100k/u1.base
-rw-r--r--   3 hdfs hdfs     392629 2020-09-05 20:38 /labs/laba01/ml-100k/u1.test
-rw-r--r--   3 hdfs hdfs    1583948 2020-09-05 20:38 /labs/laba01/ml-1

In [252]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# pass `--num-executors 3` to the pyspark shell launcher.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Prepend Spark's Python sources and its bundled py4j archive to the import
# path; the py4j archive ends up first, followed by the `python` directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [253]:
from pyspark import SparkContext, SparkConf

# Spark configuration for this notebook; only the application name is set.
conf = SparkConf()
conf.set("spark.app.name", "ZK Spark RDD app")

# getOrCreate() returns the already-running SparkContext if there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

In [254]:
from pyspark.sql import SparkSession
# Get (or reuse) a SparkSession built from `conf` with the same app name.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("ZK Spark RDD app")
    .getOrCreate()
)

In [255]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [256]:
# Load u.data as an RDD of text lines (one element per line of the file).
data = sc.textFile("/labs/laba01/ml-100k/u.data")

In [257]:
# Peek at the first 10 raw lines; each line holds four tab-separated fields.
data.take(10)

['196\t242\t3\t881250949',
 '186\t302\t3\t891717742',
 '22\t377\t1\t878887116',
 '244\t51\t2\t880606923',
 '166\t346\t1\t886397596',
 '298\t474\t4\t884182806',
 '115\t265\t2\t881171488',
 '253\t465\t5\t891628467',
 '305\t451\t3\t886324817',
 '6\t86\t3\t883603013']

In [258]:
# Break every raw line into its tab-separated string fields.
data_splited = data.map(lambda line: line.split("\t"))
data_splited.take(5)

[['196', '242', '3', '881250949'],
 ['186', '302', '3', '891717742'],
 ['22', '377', '1', '878887116'],
 ['244', '51', '2', '880606923'],
 ['166', '346', '1', '886397596']]

In [259]:
# DataFrame schema: four integer columns, in the order the fields appear in
# each record.
schema = StructType(fields=[
    StructField(column_name, IntegerType())
    for column_name in ("user_id", "movie_id", "score", "timestamp")
])

In [260]:
# Convert the first four fields of every record to int so the rows match
# `schema`, then build the DataFrame from the resulting tuples.
data_splited = data_splited.map(
    lambda fields: tuple(int(fields[i]) for i in range(4))
)
df = spark.createDataFrame(data_splited, schema=schema)

df.show(5)

+-------+--------+-----+---------+
|user_id|movie_id|score|timestamp|
+-------+--------+-----+---------+
|    196|     242|    3|881250949|
|    186|     302|    3|891717742|
|     22|     377|    1|878887116|
|    244|      51|    2|880606923|
|    166|     346|    1|886397596|
+-------+--------+-----+---------+
only showing top 5 rows



In [92]:
# Display the DataFrame's column names and types.
# NOTE(review): the saved output for this cell has an out-of-order execution
# count and lists a fourth column named `ip`, while `schema` names it
# `timestamp` -- the output looks stale; re-run to refresh it.
df

DataFrame[user_id: int, movie_id: int, score: int, ip: int]

In [111]:
# Show up to five rows whose movie_id is 302.
# NOTE(review): the saved output header shows the fourth column as `na`,
# not `timestamp` -- it appears to come from an earlier run; re-run to refresh.
df.where("movie_id = 302").show(5)

+-------+--------+-----+---------+
|user_id|movie_id|score|       na|
+-------+--------+-----+---------+
|    186|     302|    3|891717742|
|    191|     302|    4|891560253|
|     49|     302|    4|888065432|
|     54|     302|    4|880928519|
|     62|     302|    3|879371909|
+-------+--------+-----+---------+
only showing top 5 rows



In [157]:
# Number of rows for every (movie_id, score) pair, ordered by movie_id and
# then by score.
data_grouped = (
    df.groupBy('movie_id', 'score')
    .count()
    .orderBy("movie_id", "score")
)

data_grouped.show(5)

+--------+-----+-----+
|movie_id|score|count|
+--------+-----+-----+
|       1|    1|    8|
|       1|    2|   27|
|       1|    3|   96|
|       1|    4|  202|
|       1|    5|  119|
+--------+-----+-----+
only showing top 5 rows



In [261]:
from pyspark.sql.functions import col,when,count,sum

# For each score value:
#   count_total - rows with a non-null movie_id
#   count_302   - rows whose movie_id is 302 (when() without otherwise()
#                 yields NULL for every other row, and count() skips NULLs)
df2 = (
    df.groupBy("score")
    .agg(
        count("movie_id").alias("count_total"),
        count(when(col("movie_id") == 302, 1)).alias("count_302"),
    )
    .sort("score")
)

df2.show(10)

+-----+-----------+---------+
|score|count_total|count_302|
+-----+-----------+---------+
|    1|       6110|        2|
|    2|      11370|       10|
|    3|      27145|       46|
|    4|      34174|      119|
|    5|      21201|      120|
+-----+-----------+---------+



In [262]:
from pyspark.sql import HiveContext
from pyspark.sql import functions as F

In [202]:
# Superseded attempt, kept commented out: a plain collect_set / collect_list
# aggregation. Its saved output is not ordered by score, and collect_set
# would also drop duplicate counts, so a different approach is needed.
# (df2.agg(
#     F.collect_set("count_302").alias("hist_film"),
#     F.collect_list("count_total").alias("hist_all"))
#   .show(truncate=False))

+---------------------+----------------------------------+
|hist_film            |hist_all                          |
+---------------------+----------------------------------+
|[46, 2, 119, 120, 10]|[6110, 11370, 34174, 27145, 21201]|
+---------------------+----------------------------------+



In [263]:
from pyspark.sql import Window
# One unpartitioned window ordered by score. With an ORDER BY and no explicit
# frame, each row's window covers every row up to and including its own score,
# so collect_list() produces a running list in score order.
w = Window.orderBy('score')

# The running lists are prefixes of one another, so max() (arrays compare
# element by element, and a prefix sorts before the longer array) selects the
# complete, score-ordered list for each column.
sorted_list_df = (
    df2
    .withColumn('hist_film', F.collect_list('count_302').over(w))
    .withColumn('hist_all', F.collect_list('count_total').over(w))
    .groupBy()
    .agg(
        F.max('hist_film').alias('hist_film'),
        F.max('hist_all').alias('hist_all'),
    )
)
sorted_list_df.show(truncate=False)
    

+---------------------+----------------------------------+
|hist_film            |hist_all                          |
+---------------------+----------------------------------+
|[2, 10, 46, 119, 120]|[6110, 11370, 27145, 34174, 21201]|
+---------------------+----------------------------------+



In [286]:
# Placeholder for the result dictionary; it is rebound to the real values in
# the next cell.
dict_sample = {}

In [312]:
# Fetch the single aggregated row once: every .first() call runs a separate
# Spark job, so calling it per field computed the same result twice.
hist_row = sorted_list_df.first()
dict_sample = {"hist_film": hist_row['hist_film'], "hist_all": hist_row['hist_all']}

In [313]:
# Inspect the dictionary before it is written to disk.
dict_sample

{'hist_film': [2, 10, 46, 119, 120],
 'hist_all': [6110, 11370, 27145, 34174, 21201]}

In [314]:
# `json` is not imported in any visible cell of this notebook, so import it
# here; without it this cell raises NameError on a fresh kernel.
import json

# Serialise dict_sample to lab01.json in the current working directory.
with open('lab01.json', 'w') as fp:
    json.dump(dict_sample, fp)

In [None]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()