In [1]:
import os
import sys
from datetime import datetime

# Configure the interpreter, Spark client location and submit arguments
# before the PySpark shell bootstrap below is executed.
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Make the bundled pyspark and py4j packages importable from this kernel.
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

# Run PySpark's shell bootstrap inside this kernel (it prints the banner and
# exposes `spark`). The context manager closes the script file once it has
# been read instead of leaving the handle open for the garbage collector.
with open(os.path.join(spark_home, 'python/pyspark/shell.py')) as shell_file:
    exec(shell_file.read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark import Row
import numpy as np
import json
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower
import re
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import FloatType, ArrayType, StringType
from pyspark.sql.functions import udf 
from pyspark.sql.types import FloatType


# Build a SparkConf carrying the application name, then obtain a session
# through the builder using that conf and the same name.
conf = SparkConf().set("spark.app.name", "NV Spark Dataframe app")

spark = (
    SparkSession.builder
    .appName("NV Spark Dataframe app")
    .config(conf=conf)
    .getOrCreate()
)

In [3]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number
from pyspark.sql.functions import split

In [4]:
# List the lab's input directory on HDFS to see which CSV files are available.
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [41]:
# Explicit column layout for the training CSV: three integer columns.
schema = StructType([
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType()),
])

# Read the training file with that schema; the first line of the file is a header.
user = spark.read.csv(
    "/labs/slaba03/laba03_train.csv",
    schema=schema,
    header=True,
)

In [45]:
# Explicit column layout for the tab-separated items catalogue.
# The datetime_* columns are kept as raw strings (no timestamp parsing here).
schema = StructType(fields=[
    StructField("item_id", IntegerType()),
    StructField("channel_id", IntegerType()),
    StructField("datetime_availability_start", StringType()),
    StructField("datetime_availability_stop", StringType()),
    StructField("datetime_show_start", StringType()),
    StructField("datetime_show_stop", StringType()),
    StructField("content_type", IntegerType()),
    StructField("title", StringType(), nullable = True),
    StructField("year", FloatType(), nullable = True),
    StructField("genres", StringType()),
    # The name previously carried a trailing space ("region_id "), which made
    # the column addressable only under that malformed name.
    StructField("region_id", IntegerType())
])


# Load the items file: tab-delimited, first line is a header.
items = (
    spark.read
    .schema(schema)
    .format("csv")
    .option("header", True)
    .option("sep", "\t")
    .load("/labs/slaba03/laba03_items.csv")
)

In [46]:
# Read the test pairs with inferred column types. Without inferSchema every
# column is loaded as a string, so ordering by user_id / item_id would be
# lexicographic and joins on the id columns would depend on implicit casts.
# NOTE(review): inferSchema costs one extra pass over the file; switch to an
# explicit schema once the test file's column list is confirmed.
test = spark.read.csv("/labs/slaba03/laba03_test.csv", header = True, inferSchema = True)

In [47]:
# Read the training interactions with explicit integer column types. Without
# a schema every column is loaded as a string, so averaging `purchase` and
# joining on the id columns would depend on implicit string-to-number casts.
train = spark.read.csv(
    "/labs/slaba03/laba03_train.csv",
    schema = "user_id INT, item_id INT, purchase INT",
    header = True,
)

In [48]:
# Enrich the training rows with item attributes (left join keeps every
# training row) and add `first_genre`: the first comma-separated entry of `genres`.
train = (
    train
    .join(items, on='item_id', how='left')
    .withColumn("first_genre", f.split(f.col("genres"), ",")[0])
)

In [49]:
# Enrich the test rows with item attributes (left join keeps every test row)
# and add `first_genre`: the first comma-separated entry of `genres`.
test = (
    test
    .join(items, on='item_id', how='left')
    .withColumn("first_genre", f.split(f.col("genres"), ",")[0])
)

In [50]:
# Per-user average of `purchase` over the training rows, stored under the
# same column name `purchase`; cached because it is reused by later cells.
mean_target = (
    train
    .groupBy('user_id')
    .agg(f.avg('purchase').alias('purchase'))
    .cache()
)

In [88]:
# Attach each user's average purchase value to every (user_id, item_id) pair
# of the test set and order the rows by user, then item.
# The left join keeps test users missing from `mean_target`; their `purchase` is null.
test_target = (
    test
    .select('user_id', 'item_id')
    .join(mean_target, on='user_id', how='left')
    .sort('user_id', 'item_id')
)

In [53]:
# Preview the first two prediction rows.
test_target.show(n=2)

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
|   1654| 100026|0.001947040498442...|
|   1654| 100029|0.001947040498442...|
+-------+-------+--------------------+
only showing top 2 rows



In [54]:
# Row count of the prediction frame (triggers a full evaluation of the joins).
test_target.count()

2156840

In [89]:
# Collapse to a single partition so the result is written as one CSV part
# file (with header) under `vn_lab3`, replacing any previous output there.
(
    test_target
    .coalesce(1)
    .write
    .csv('vn_lab3', mode='overwrite', header=True)
)

In [90]:
# List the notebook's local working directory.
!ls

'ALS (2).ipynb'			 Lab02_s.ipynb
'Clustering (1).ipynb'		 Lab03_best_solution-Copy1.ipynb
 dataframes.ipynb		 Lab03_best_solution.ipynb
 experiments_vasilyeva.ipynb	 Lab03.ipynb
 Graphless			 Lab_2-Copy1.ipynb
'HDFS CLI Examples.ipynb'	 mp_analysis.ipynb
 lab01.json			'spark intro.ipynb'
 Lab01_Natalia_Vasilyeva.ipynb	'Spark ML Pipelines.ipynb'
 Lab02.ipynb			 spark_scala_api.ipynb
 lab02.json			'vectors (1).ipynb'


In [91]:
# List HDFS with no path argument (the user's HDFS home) to confirm the output directory exists.
!hdfs dfs -ls

Found 3 items
drwx------   - natalia.vasilyeva natalia.vasilyeva          0 2022-10-13 21:00 .Trash
drwxr-xr-x   - natalia.vasilyeva natalia.vasilyeva          0 2022-11-01 18:24 .sparkStaging
drwxr-xr-x   - natalia.vasilyeva natalia.vasilyeva          0 2022-11-01 19:59 vn_lab3


In [92]:
# Inspect the files Spark wrote under the vn_lab3 output directory on HDFS.
!hdfs dfs -ls vn_lab3

Found 2 items
-rw-r--r--   3 natalia.vasilyeva natalia.vasilyeva          0 2022-11-01 19:59 vn_lab3/_SUCCESS
-rw-r--r--   3 natalia.vasilyeva natalia.vasilyeva   68506595 2022-11-01 19:59 vn_lab3/part-00000-e8632981-d786-4a8c-8c17-f283a30aae82-c000.csv


In [93]:
# Copy the vn_lab3 output directory from HDFS to the local filesystem
# (no destination given, so it lands in the current working directory).
!hdfs dfs -copyToLocal vn_lab3

In [94]:
# Stop the Spark session at the end of the notebook.
spark.stop()