In [1]:
import os
import sys
from datetime import datetime

os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 2 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
import numpy as np
import json
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, Normalizer, StopWordsRemover
from pyspark.ml import Pipeline
from pyspark.sql.functions import udf, col, isnan, isnull, broadcast, desc, lower, split
import re
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import FloatType, ArrayType, StringType
from pyspark.sql.functions import udf 
from pyspark.sql.types import FloatType


conf = SparkConf()
conf.set("spark.app.name", "Lab3") 

spark = SparkSession.builder.config(conf=conf).appName("Lab3").getOrCreate()

In [4]:
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [5]:
schema = StructType(fields=[
    StructField("item_id", IntegerType()),
    StructField("channel_id", IntegerType()),
    StructField("datetime_availability_start", StringType()),
    StructField("datetime_availability_stop", StringType()),
    StructField("datetime_show_start", StringType()),
    StructField("datetime_show_stop", StringType()),
    StructField("content_type", IntegerType()),
    StructField("title", StringType(), nullable = True),
    StructField("year", FloatType(), nullable = True),
    StructField("genres", StringType()),
    StructField("region_id ", IntegerType())
])

In [6]:
items = spark.read\
          .schema(schema)\
          .format("csv")\
          .option("header", True)\
          .option("sep", "\t")\
          .load("/labs/slaba03/laba03_items.csv")

In [7]:
train = spark.read.csv("/labs/slaba03/laba03_train.csv", header = True).join(items, on = 'item_id', how = 'left')
train = train.withColumn("first_genre", split(F.col("genres"), ",").getItem(0))

In [8]:
test = spark.read.csv("/labs/slaba03/laba03_test.csv", header = True).join(items, on = 'item_id', how = 'left')
test = test.withColumn("first_genre", split(F.col("genres"), ",").getItem(0))

In [9]:
mean_target = train.groupby('user_id').agg(F.mean('purchase').alias('purchase')).cache()

In [10]:
test_target = test.select('user_id', 'item_id').join(mean_target, on='user_id', how='left')

In [22]:
test_target.orderBy('user_id', 'item_id').coalesce(1).write.mode('overwrite').csv("lab03", header=True)

In [23]:
!hdfs dfs -copyToLocal lab03

copyToLocal: `lab03/_SUCCESS': File exists


In [24]:
spark.stop()