In [1]:
import os
import sys
# Point PySpark at the interpreter and Spark installation to use, and
# pass the spark-submit arguments used when the session is launched.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Each entry is pushed to the front of sys.path, so the py4j archive
# (inserted last) ends up ahead of Spark's own python directory.
for rel_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel_path))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Empty SparkConf: no settings are added here beyond whatever Spark picks up by default.
conf = SparkConf()

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName("Pankov lab01")
    .getOrCreate()
)

In [3]:
# Film id whose rating histogram is computed below; it is compared against
# the second field of each record (the `film_id` column in the DataFrame part).
ID = 22

### RDD

In [4]:
# Read the raw ratings file as an RDD of text lines (one record per line).
rdd = spark.sparkContext.textFile("/labs/laba01/ml-100k/u.data")

In [5]:
# Split every tab-separated line into its list of string fields.
data = rdd.map(lambda line: line.split("\t"))

In [6]:
data.take(5)

[['196', '242', '3', '881250949'],
 ['186', '302', '3', '891717742'],
 ['22', '377', '1', '878887116'],
 ['244', '51', '2', '880606923'],
 ['166', '346', '1', '886397596']]

For id == 22 step by step

In [7]:
# Keep only the records whose second field (the film id) equals ID.
# The fields are strings, so ID is converted before comparing.
data_filtered = data.filter(lambda record: record[1] == str(ID))

In [8]:
data_filtered.take(5)

[['269', '22', '1', '891448072'],
 ['8', '22', '5', '879362183'],
 ['90', '22', '4', '891384357'],
 ['222', '22', '5', '878183285'],
 ['313', '22', '3', '891014870']]

In [9]:
# Extract the rating (third field) of every record for the selected film.
# map() is used instead of flatMap(): flatMap over a string iterates its
# characters, so it returned the rating only because every rating happens
# to be a single character.
rating = data_filtered.map(lambda x: x[2])

In [10]:
rating.take(5)

['1', '5', '4', '5', '3']

In [11]:
# Count how many times each rating value occurs and list the counts ordered
# by rating. countByValue() counts the elements themselves; countByKey()
# is meant for (key, value) pairs and only worked on these plain strings
# because the first character of a one-character string is the string itself.
my_rating_rdd = [count for _, count in sorted(rating.countByValue().items())]

In [12]:
my_rating_rdd

[5, 14, 46, 98, 134]

For all ids

In [13]:
# Rating histogram over the whole data set: take the rating (third field) of
# every record, count occurrences of each value, and list the counts ordered
# by rating. map()/countByValue() replace flatMap()/countByKey(), which gave
# the right answer only because each rating is a one-character string.
all_rating_rdd = [
    count
    for _, count in sorted(
        data
        .map(lambda x: x[2])
        .countByValue()
        .items()
    )
]

In [14]:
all_rating_rdd

[6110, 11370, 27145, 34174, 21201]

### DF

In [15]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

In [16]:
# Explicit schema for the four columns of the ratings file; only `rating`
# is parsed as an integer, the other columns are kept as strings.
schema = (
    StructType()
    .add("user_id", StringType())
    .add("film_id", StringType())
    .add("rating", IntegerType())
    .add("timestamp", StringType())
)

In [17]:
# Load the tab-separated ratings file with the explicit schema and cache it,
# since the DataFrame is queried several times below.
df = (
    spark.read
    .schema(schema)
    .option("sep", "\t")
    .csv("/labs/laba01/ml-100k/u.data")
    .cache()
)

In [18]:
df.show(5)

+-------+-------+------+---------+
|user_id|film_id|rating|timestamp|
+-------+-------+------+---------+
|    196|    242|     3|881250949|
|    186|    302|     3|891717742|
|     22|    377|     1|878887116|
|    244|     51|     2|880606923|
|    166|    346|     1|886397596|
+-------+-------+------+---------+
only showing top 5 rows



In [19]:
import pyspark.sql.functions as f

In [20]:
# Number of ratings per rating value for the film selected by ID,
# ordered by rating. GroupedData.count() names its result column "count".
my_rating = (
    df
    .where(f.col('film_id') == ID)
    .groupby('film_id', 'rating')
    .count()
    .orderBy('rating')
)

In [21]:
my_rating.show()

+-------+------+-----+
|film_id|rating|count|
+-------+------+-----+
|     22|     1|    5|
|     22|     2|   14|
|     22|     3|   46|
|     22|     4|   98|
|     22|     5|  134|
+-------+------+-----+



In [22]:
# Pull the ordered counts to the driver as a plain Python list.
# Item access is used because `Row.count` would resolve to the tuple method.
my_rating = [row['count'] for row in my_rating.select('count').collect()]

In [23]:
my_rating

[5, 14, 46, 98, 134]

In [24]:
# Number of ratings per rating value over the whole data set, ordered by rating.
all_rating = (
    df
    .groupby('rating')
    .count()
    .orderBy('rating')
)

In [25]:
all_rating.show()

+------+-----+
|rating|count|
+------+-----+
|     1| 6110|
|     2|11370|
|     3|27145|
|     4|34174|
|     5|21201|
+------+-----+



In [26]:
# Pull the ordered counts to the driver as a plain Python list.
# Item access is used because `Row.count` would resolve to the tuple method.
all_rating = [row['count'] for row in all_rating.select('count').collect()]

In [27]:
all_rating

[6110, 11370, 27145, 34174, 21201]

In [28]:
my_rating_rdd == my_rating

True

In [29]:
all_rating_rdd == all_rating

True

### RESULT

In [30]:
# Final answer: the per-film histogram and the histogram over all films.
result = dict(hist_film=my_rating, hist_all=all_rating)

In [31]:
result

{'hist_film': [5, 14, 46, 98, 134],
 'hist_all': [6110, 11370, 27145, 34174, 21201]}

In [32]:
import json

In [33]:
# Write the histograms to disk as JSON. The file handle is deliberately not
# called `f`: that name is bound to pyspark.sql.functions in this notebook,
# and rebinding it to a file object breaks any later re-run of the DataFrame
# cells that call f.col / f.count.
with open('/data/home/ivan.pankov/lab01.json', 'w') as out_file:
    json.dump(result, out_file)

In [34]:
# Shut down the Spark session now that the results have been written.
spark.stop()