# Запускаем контекст

In [1]:
import os
import sys

# Cluster-specific settings; they are exported before PySpark is loaded below.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the PySpark and py4j sources shipped with the Spark client importable.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Execute PySpark's shell bootstrap script in this namespace.
# NOTE(review): later cells use `sc` without defining it, so it presumably
# comes from this script together with `spark` - confirm.
# The script is read inside a `with` block so the file handle is closed
# deterministically instead of being left to the garbage collector.
shell_path = os.path.join(spark_home, 'python/pyspark/shell.py')
with open(shell_path) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import json

# Empty configuration object: no options are set on it in this cell.
conf = SparkConf()

# Obtain a session named "test" through the builder.
# NOTE(review): a session was already started by the bootstrap cell, so
# getOrCreate() presumably returns that existing session - confirm.
spark = SparkSession.builder.config(conf=conf).appName("test").getOrCreate()

## Подгружаем данные

In [3]:
# List the contents of the dataset directory in HDFS (shell command run via IPython's `!`).
!hdfs dfs -ls /labs/laba01/ml-100k

Found 23 items
-rw-r--r--   3 hdfs hdfs       6750 2022-01-06 18:46 /labs/laba01/ml-100k/README
-rw-r--r--   3 hdfs hdfs        716 2022-01-06 18:46 /labs/laba01/ml-100k/allbut.pl
-rw-r--r--   3 hdfs hdfs        643 2022-01-06 18:46 /labs/laba01/ml-100k/mku.sh
-rw-r--r--   3 hdfs hdfs    1979173 2022-01-06 18:46 /labs/laba01/ml-100k/u.data
-rw-r--r--   3 hdfs hdfs        202 2022-01-06 18:46 /labs/laba01/ml-100k/u.genre
-rw-r--r--   3 hdfs hdfs         36 2022-01-06 18:46 /labs/laba01/ml-100k/u.info
-rw-r--r--   3 hdfs hdfs     236344 2022-01-06 18:46 /labs/laba01/ml-100k/u.item
-rw-r--r--   3 hdfs hdfs        193 2022-01-06 18:46 /labs/laba01/ml-100k/u.occupation
-rw-r--r--   3 hdfs hdfs      22628 2022-01-06 18:46 /labs/laba01/ml-100k/u.user
-rw-r--r--   3 hdfs hdfs    1586544 2022-01-06 18:46 /labs/laba01/ml-100k/u1.base
-rw-r--r--   3 hdfs hdfs     392629 2022-01-06 18:46 /labs/laba01/ml-100k/u1.test
-rw-r--r--   3 hdfs hdfs    1583948 2022-01-06 18:46 /labs/laba01/ml-1

In [4]:
# Load u.item as an RDD of raw text lines; the lines are not parsed here.
films = sc.textFile("/labs/laba01/ml-100k/u.item")

In [5]:
films.take(1)

['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0']

### Формат u.data: user id | item id | rating | timestamp (поля разделены табуляцией)

In [6]:
# Ratings file: every line is split on tabs into a list of string fields.
data = (
    sc.textFile("/labs/laba01/ml-100k/u.data")
    .map(lambda line: line.split("\t"))
)

In [7]:
data.take(1)

[['196', '242', '3', '881250949']]

## Оставляем данные с нужным фильмом

In [8]:
# Id of the film whose rating histogram is built. It is kept as a string
# because the rows come from str.split and the fields are never converted.
FILM_ID = '328'

# Keep only the rating rows whose second field (item id) is the chosen film.
data_my_film = data.filter(lambda row: row[1] == FILM_ID)

In [9]:
data_my_film.count()

295

In [10]:
data_my_film.take(10)

[['166', '328', '5', '886397722'],
 ['119', '328', '4', '876923913'],
 ['38', '328', '4', '892428688'],
 ['229', '328', '1', '891632142'],
 ['111', '328', '4', '891679939'],
 ['223', '328', '3', '891548959'],
 ['173', '328', '5', '877557028'],
 ['181', '328', '3', '878961227'],
 ['195', '328', '4', '884420059'],
 ['292', '328', '3', '877560833']]

In [11]:
data_my_film.getNumPartitions()

2

In [12]:
# Count how many times each rating (third field of a row) occurs for the film.
# countByValue() is used because the mapped elements are plain rating strings,
# not (key, value) pairs: countByKey() would key on element[0], i.e. only the
# first character of the string, which is correct only for one-character
# ratings. The stray positional `1` (preservesPartitioning) is dropped as well.
estimates = data_my_film.map(lambda row: row[2]).countByValue().items()

## Формируем словарь

In [13]:
# Materialise the (rating, count) pairs as a plain dictionary.
estimates_dict = {rating: count for rating, count in estimates}

In [14]:
estimates_dict

{'5': 40, '4': 109, '1': 12, '3': 94, '2': 40}

In [16]:
# Rebuild the dictionary with its keys in ascending (string) order.
sorted_est = {rating: estimates_dict[rating] for rating in sorted(estimates_dict)}

In [17]:
sorted_est

{'1': 12, '2': 40, '3': 94, '4': 109, '5': 40}

In [18]:
# Build the histogram with a fixed bin for every rating from 1 to 5.
# Reading the counts through .get(..., 0) keeps the list five elements long,
# in rating order, even when a rating value is absent for this film;
# list(sorted_est.values()) would silently produce a shorter list then.
# Keys are rating strings ('1'..'5'), hence the str() conversion.
est_list = [sorted_est.get(str(rating), 0) for rating in range(1, 6)]
print(est_list)

[12, 40, 94, 109, 40]


In [19]:
# Wrap the per-film histogram under the key it will have in the output JSON.
hist_film = dict(hist_film=est_list)

In [20]:
hist_film

{'hist_film': [12, 40, 94, 109, 40]}

## Сделаем то же самое, только для всех фильмов

In [21]:
# Ratings file split on tabs into lists of string fields.
# NOTE(review): this repeats the definition of `data` from the earlier cell
# verbatim; it rebinds the name to an equivalent RDD.
data = sc.textFile("/labs/laba01/ml-100k/u.data").map(lambda x: x.split("\t"))

In [22]:
# Count how many times each rating (third field of a row) occurs over all films.
# countByValue() is used because the mapped elements are plain rating strings,
# not (key, value) pairs: countByKey() would key on element[0], i.e. only the
# first character of the string, which is correct only for one-character
# ratings. The stray positional `1` (preservesPartitioning) is dropped as well.
all_films = data.map(lambda row: row[2]).countByValue().items()

In [23]:
all_films

dict_items([('3', 27145), ('1', 6110), ('2', 11370), ('4', 34174), ('5', 21201)])

In [24]:
# Materialise the (rating, count) pairs as a plain dictionary.
# NOTE(review): `all_films` is rebound to a different dictionary in a later
# cell, so re-running this cell after that one gives a different result.
all_films_dict = dict(all_films)

In [25]:
# Rebuild the dictionary with its keys in ascending (string) order.
sorted_films = {rating: all_films_dict[rating] for rating in sorted(all_films_dict)}

In [27]:
# Build the histogram with a fixed bin for every rating from 1 to 5.
# Reading the counts through .get(..., 0) keeps the list five elements long,
# in rating order, even if some rating value never occurs;
# list(sorted_films.values()) would silently produce a shorter list then.
# Keys are rating strings ('1'..'5'), hence the str() conversion.
est_list_all = [sorted_films.get(str(rating), 0) for rating in range(1, 6)]
print(est_list_all)

[6110, 11370, 27145, 34174, 21201]


In [28]:
# Wrap the overall histogram under the key it will have in the output JSON.
# NOTE: this rebinds `all_films`, which an earlier cell used for the raw
# (rating, count) pairs.
all_films = dict(hist_all=est_list_all)

In [29]:
all_films

{'hist_all': [6110, 11370, 27145, 34174, 21201]}

## Объединяем в один словарь

In [30]:
# Merge both single-key dictionaries into one; on a key clash the entry
# from `all_films` wins, and insertion order is hist_film first.
union_all = {**hist_film, **all_films}

In [31]:
union_all

{'hist_film': [12, 40, 94, 109, 40],
 'hist_all': [6110, 11370, 27145, 34174, 21201]}

## Запись в файл

In [32]:
import json

# Serialise the merged histograms and write them to lab01.json in the
# current working directory, replacing any existing file.
with open('lab01.json', 'w') as report_file:
    report_file.write(json.dumps(union_all))

In [33]:
# Shut down the Spark context; cells that use `sc` cannot be re-run after
# this until the bootstrap cell is executed again.
sc.stop()