## Aggregate statistics
### Preparing Data
#### Downloading the dataset

In [1]:
# !wget http://files.grouplens.org/datasets/movielens/ml-100k.zip  

#### Unzip

In [2]:
# !unzip ml-100k.zip -d ~/laba06

#### Checking that the file can be read

In [3]:
# Print the first two lines of the unpacked ratings file to confirm it is readable.
# NOTE(review): this absolute path is machine-specific; the unzip step targets
# ~/laba06 -- confirm both refer to the same directory.
!head -n 2 /data/home/user/laba06/ml-100k/u.data

196	242	3	881250949
186	302	3	891717742


#### Copy files to HDFS

In [4]:
# !hadoop fs -put laba06/ml-100k

### Data processing
#### Import packages

In [5]:
import os
import sys
import json

#### Connecting to Spark

In [6]:
# Arguments handed to spark-submit; they must be in the environment before the
# PySpark shell script is executed below.
# NOTE(review): spark-csv_2.10 is a Scala 2.10 artifact and nothing in this
# notebook appears to read CSV through it -- confirm whether it is still needed.
os.environ["PYSPARK_SUBMIT_ARGS"] = '--packages com.databricks:spark-csv_2.10:1.2.0 pyspark-shell'

# Fail with a clear message instead of a TypeError from `None + "/python"`.
spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise EnvironmentError(
        "SPARK_HOME is not set; point it at the Spark installation directory "
        "before running this cell."
    )

# Make the PySpark sources bundled with this Spark installation importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))

# Run PySpark's interactive shell bootstrap in this namespace; the later cells
# rely on the `sc` / `spark` names it leaves behind.
# exec(compile(...)) is used instead of execfile() so the cell also works on
# Python 3, where execfile() no longer exists.
pyspark_shell = os.path.join(spark_home, 'python', 'pyspark', 'shell.py')
with open(pyspark_shell) as shell_source:
    exec(compile(shell_source.read(), pyspark_shell, 'exec'))

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.2.0
      /_/

Using Python version 2.7.6 (default, Oct 26 2016 20:30:19)
SparkSession available as 'spark'.


#### Loading the file as a Resilient Distributed Dataset (RDD)

In [7]:
# Load the ratings file as an RDD of raw text lines (one element per line).
# The path is relative -- presumably resolved against the HDFS home directory
# populated by the earlier `hadoop fs -put`; confirm against the cluster config.
rdd = sc.textFile(name='ml-100k/u.data')

In [8]:
# Sanity check: fetch a single raw line to the driver; fields are tab-separated.
rdd.take(1)

[u'196\t242\t3\t881250949']

#### Distribution of ratings for the movie with ID 98

In [9]:
# Histogram of ratings for movie 98.
#
# Each line of u.data is tab-separated; fields[1] is compared against the movie
# ID and fields[2] is the rating (MovieLens 100K column order: user id, item id,
# rating, timestamp).
#
# countByValue() returns a {rating: count} dict on the driver, which avoids the
# shuffle + sort + take(10) of the original pipeline for at most five keys.
film_rating_counts = (
    rdd
    .map(lambda line: line.split("\t"))
    .filter(lambda fields: fields[1] == '98')
    .map(lambda fields: fields[2])
    .countByValue()
)

# Emit exactly one slot per rating value 1..5, in ascending order. A rating that
# never occurs yields 0 instead of silently shortening the list, so position i
# always corresponds to rating i + 1.
hist_film = [film_rating_counts.get(str(rating), 0) for rating in range(1, 6)]

In [10]:
# Display the per-rating counts for movie 98, ordered by ascending rating.
hist_film

[6, 10, 30, 163, 181]

#### Distribution of ratings of all movies

In [11]:
# Histogram of ratings across all movies.
#
# Each line of u.data is tab-separated and fields[2] is the rating (MovieLens
# 100K column order: user id, item id, rating, timestamp).
#
# countByValue() returns a {rating: count} dict on the driver, which avoids the
# shuffle + sort + take(10) of the original pipeline for at most five keys.
all_rating_counts = (
    rdd
    .map(lambda line: line.split("\t"))
    .map(lambda fields: fields[2])
    .countByValue()
)

# Emit exactly one slot per rating value 1..5, in ascending order. A rating that
# never occurs yields 0, so position i always corresponds to rating i + 1.
hist_all = [all_rating_counts.get(str(rating), 0) for rating in range(1, 6)]

In [12]:
# Display the per-rating counts over the whole dataset, ordered by ascending rating.
hist_all

[6110, 11370, 27145, 34174, 21201]

#### Writing the results to a JSON file

In [13]:
# Persist both histograms to lab06.json in the current working directory.
# The handle is named `results_file` rather than `file` so the Python 2 builtin
# `file` is not rebound in the notebook's global namespace.
with open('lab06.json', 'w') as results_file:
    json.dump({'hist_film': hist_film, 'hist_all': hist_all}, results_file)