In [1]:
import os
import sys
import pandas as pd
import numpy as np
import json
# Point PySpark at the cluster's Python interpreter and Spark installation,
# and pass the submit arguments (three executors) used when the shell starts.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Prepend Spark's bundled Python sources to sys.path so `import pyspark`
# resolves: the py4j zip ends up first, followed by the pyspark directory.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark.sql import SparkSession
from pyspark import SparkContext, SparkConf

# Build the Spark configuration; the application name is set both on the
# conf object and through the session builder.
conf = SparkConf().set("spark.app.name", "Nabat_lab1")

# Create the session for this notebook, or reuse one that already exists.
session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.appName("Nabat_lab1").getOrCreate()

In [3]:
# Grab the SparkContext behind the session; it is used to build the RDD and is stopped at the end.
sc = spark.sparkContext

In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import pyspark.sql.functions as f

In [8]:
# List the ml-100k lab directory on HDFS to see which files are available.
!hdfs dfs -ls /labs/laba01/ml-100k

Found 23 items
-rw-r--r--   3 hdfs hdfs       6750 2020-09-05 20:38 /labs/laba01/ml-100k/README
-rw-r--r--   3 hdfs hdfs        716 2020-09-05 20:38 /labs/laba01/ml-100k/allbut.pl
-rw-r--r--   3 hdfs hdfs        643 2020-09-05 20:38 /labs/laba01/ml-100k/mku.sh
-rw-r--r--   3 hdfs hdfs    1979173 2020-09-05 20:38 /labs/laba01/ml-100k/u.data
-rw-r--r--   3 hdfs hdfs        202 2020-09-05 20:38 /labs/laba01/ml-100k/u.genre
-rw-r--r--   3 hdfs hdfs         36 2020-09-05 20:38 /labs/laba01/ml-100k/u.info
-rw-r--r--   3 hdfs hdfs     236344 2020-09-05 20:38 /labs/laba01/ml-100k/u.item
-rw-r--r--   3 hdfs hdfs        193 2020-09-05 20:38 /labs/laba01/ml-100k/u.occupation
-rw-r--r--   3 hdfs hdfs      22628 2020-09-05 20:38 /labs/laba01/ml-100k/u.user
-rw-r--r--   3 hdfs hdfs    1586544 2020-09-05 20:38 /labs/laba01/ml-100k/u1.base
-rw-r--r--   3 hdfs hdfs     392629 2020-09-05 20:38 /labs/laba01/ml-100k/u1.test
-rw-r--r--   3 hdfs hdfs    1583948 2020-09-05 20:38 /labs/laba01/ml-1

In [9]:
# Print the end of u.data to check its layout (tab-separated fields, one record per line).
!hdfs dfs -tail /labs/laba01/ml-100k/u.data

91363685
823	134	5	878438232
130	93	5	874953665
130	121	5	876250746
537	778	3	886031106
655	913	4	891817521
889	2	3	880182460
865	1009	5	880144368
851	979	3	875730244
833	474	5	875122675
394	380	4	881132876
193	690	4	889123221
621	809	4	880740136
766	91	5	891310125
650	479	5	891372339
429	199	5	882386006
847	596	3	878938982
934	216	1	891191511
788	556	2	880871128
897	369	4	879993713
936	287	4	886832419
936	766	3	886832597
449	120	1	879959573
661	762	2	876037121
721	874	3	877137447
821	151	4	874792889
764	596	3	876243046
537	443	3	886031752
618	628	2	891308019
487	291	3	883445079
113	975	5	875936424
943	391	2	888640291
864	685	4	888891900
750	323	3	879445877
279	64	1	875308510
646	750	3	888528902
654	370	2	887863914
617	582	4	883789294
913	690	3	880824288
660	229	2	891406212
421	498	4	892241344
495	1091	4	888637503
806	421	4	882388897
676	538	4	892685437
721	262	3	877137285
913	209	2	881367150
378	78	3	880056976
880	476	3	880175444
716	204

# RDD solution

In [10]:
# Load u.data as an RDD of raw text lines; cache() is requested because two separate jobs read it below.
rdd_row = sc.textFile("/labs/laba01/ml-100k/u.data").cache()

In [11]:
# Overall rating histogram: count how often each value of the third
# tab-separated field (the rating) occurs across the whole file.
rating_counts = (
    rdd_row
    .map(lambda line: (line.split("\t")[2], 1))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)
# Order the (rating, count) pairs by rating key, then keep only the counts.
rating_counts.sort()
hist_all_list = [count for _, count in rating_counts]
print(hist_all_list)

[6110, 11370, 27145, 34174, 21201]


In [12]:
# Rating histogram for a single film (item_id "96").
# Each u.data line is "user_id<TAB>item_id<TAB>rating<TAB>timestamp".
film_rating_counts = dict(
    rdd_row
    .map(lambda line: line.split("\t"))
    # Keep only this film's rows before building the (rating, 1) pairs.
    .filter(lambda fields: fields[1] == '96')
    .map(lambda fields: (fields[2], 1))
    .reduceByKey(lambda x, y: x + y)
    .collect()
)
# Emit exactly one bucket per rating 1..5 (the ml-100k rating scale), using 0
# for a rating nobody gave. Sorting only the keys that happen to be present
# would drop empty buckets and shift the remaining counts to wrong positions.
hist_film_list = [film_rating_counts.get(str(rating), 0) for rating in range(1, 6)]
print(hist_film_list)

[6, 20, 43, 123, 103]


In [13]:
# Result payload: the per-film histogram and the overall histogram (RDD version).
d = dict(hist_film=hist_film_list, hist_all=hist_all_list)

In [14]:
# Serialise the result dictionary to the lab answer file.
with open('/data/home/ilya.nabatchikov/lab01.json', 'w') as fp:
    fp.write(json.dumps(d))

In [15]:
# Read the answer file back and show it, as a check on what was written.
with open("/data/home/ilya.nabatchikov/lab01.json", "r") as read_file:
    data = json.loads(read_file.read())
print(data)

{'hist_film': [6, 20, 43, 123, 103], 'hist_all': [6110, 11370, 27145, 34174, 21201]}


# DataFrame solution

In [16]:
# Explicit schema for u.data: the ids and the timestamp are kept as strings,
# the rating is parsed as an integer.
schema = StructType(fields=[
    StructField(column_name, column_type())
    for column_name, column_type in (
        ("user_id", StringType),
        ("item_id", StringType),
        ("rating", IntegerType),
        ("timestamp", StringType),
    )
])

In [17]:
# Read the tab-separated u.data file with the explicit schema; cache() is requested because two aggregations run over it.
df = spark.read.csv("/labs/laba01/ml-100k/u.data", sep="\t", schema=schema).cache()

In [18]:
# Overall rating histogram: number of rows per rating, ordered by rating,
# keeping only the count column before converting to pandas.
rating_totals = (
    df.groupBy("rating")
    .agg(f.count("*").alias("hist_all"))
    .orderBy("rating")
    .select("hist_all")
)
df_hist_all = rating_totals.toPandas()

In [19]:
# Rating histogram for film 96 via the DataFrame API.
# item_id is declared as StringType in the schema, so it is compared with the
# string "96" rather than the integer 96, which would rely on an implicit cast.
df_hist_film = (
    df.where(f.col("item_id") == "96")
    .groupBy("rating")
    .agg(f.count("*").alias("hist_film"))
    .toPandas()
    .set_index("rating")
    # Force exactly one row per rating 1..5 (the ml-100k rating scale), in
    # order, with 0 for a rating nobody gave; otherwise empty buckets would
    # vanish and the remaining counts would shift position.
    .reindex(range(1, 6), fill_value=0)
    # Drop the rating index so only the hist_film column remains, as before.
    .reset_index(drop=True)
)

In [20]:
# Result payload built from the DataFrame results: pandas columns become plain lists.
d = dict(
    hist_film=df_hist_film["hist_film"].tolist(),
    hist_all=df_hist_all["hist_all"].tolist(),
)

In [21]:
# Overwrite the lab answer file with the DataFrame-based result.
with open('/data/home/ilya.nabatchikov/lab01.json', 'w') as fp:
    fp.write(json.dumps(d))

In [23]:
# Read the answer file back and show it, as a check on what was written.
with open("/data/home/ilya.nabatchikov/lab01.json", "r") as read_file:
    data = json.loads(read_file.read())
print(data)

{'hist_film': [6, 20, 43, 123, 103], 'hist_all': [6110, 11370, 27145, 34174, 21201]}


In [None]:
# Shut down the SparkContext now that both solutions have written their results.
sc.stop()