# Spark connection

In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 --executor-memory 2g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
sc

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, FloatType
from pyspark.sql.functions import col,array_contains
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark import Row
from pyspark.sql.functions import count
import json

In [4]:
from numpy import allclose
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
from pyspark.sql.types import *

# Check Data

In [5]:
!hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2022-01-06 18:46 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2022-01-06 18:46 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2022-01-06 18:46 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2022-01-06 18:46 /labs/slaba03/laba03_views_programmes.csv


In [6]:
schema = StructType() \
      .add("user_id", IntegerType(), True) \
      .add("item_id", IntegerType(), True) \
      .add("purchase", IntegerType(), True)
      
df_user = spark.read.format("csv") \
      .option("header", True) \
      .schema(schema) \
      .load("/labs/slaba03/laba03_train.csv")

## Test Data

In [7]:
from pyspark.sql.types import *

schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

train = spark.read.csv('/labs/slaba03/laba03_train.csv', sep=',', header=True, schema=schema)
train

DataFrame[user_id: int, item_id: int, purchase: int]

## Items dataset

In [8]:
items = spark.read.csv('/labs/slaba03/laba03_items.csv', sep='\t', header=True, inferSchema=True)
items.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- channel_id: double (nullable = true)
 |-- datetime_availability_start: timestamp (nullable = true)
 |-- datetime_availability_stop: timestamp (nullable = true)
 |-- datetime_show_start: timestamp (nullable = true)
 |-- datetime_show_stop: timestamp (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- year: double (nullable = true)
 |-- genres: string (nullable = true)
 |-- region_id: double (nullable = true)



In [9]:
items = items[['item_id', 'title', 'year', 'genres']].na.fill({'year': -9999, 'genres': 'General'})
items

DataFrame[item_id: int, title: string, year: double, genres: string]

## Programs dataset

In [10]:
schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("ts_start", IntegerType()),
    StructField("ts_end", IntegerType()),
    StructField("item_type", StringType())
])

programs = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header=True, schema=schema)
programs

DataFrame[user_id: int, item_id: int, ts_start: int, ts_end: int, item_type: string]

## Test dataset

In [11]:
schema = StructType(fields=[
    StructField("user_id", IntegerType()),
    StructField("item_id", IntegerType()),
    StructField("purchase", IntegerType())
])

test = spark.read.csv('/labs/slaba03/laba03_test.csv', sep=',', header=True, schema=schema)
test

DataFrame[user_id: int, item_id: int, purchase: int]

### Feats from train: UserPurchase, ItemPurchase, RateFilm

In [12]:
from pyspark.sql import functions as f
from pyspark.ml.feature import CountVectorizer, Tokenizer, HashingTF
from pyspark.ml import Transformer, Estimator, Pipeline
import pyspark

##-- сколько покупок совершил пользователь
class UserPurchase(Estimator):
    def __init__(self):
        Transformer.__init__(self)
        self.userPurchase = None
        
    def fit(self, df: pyspark.sql.dataframe.DataFrame):
        self.userPurchase = df.groupBy("user_id").agg(f.sum('purchase').alias('SumUserPurchases'))
        return self
    
    def transform(self, df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
        return df.join(self.userPurchase, on='user_id', how='inner')

In [13]:
##-- сколько раз купили фильм
class ItemPurchase(Estimator):
    def __init__(self):
        Transformer.__init__(self)
        self.itemPurchase = None
        
    def fit(self, df: pyspark.sql.dataframe.DataFrame):
        self.itemPurchase = df.groupBy("item_id").agg(f.sum('purchase').alias('SumFilmPurchases'))
        return self
    
    def transform(self, df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
        return df.join(self.itemPurchase, on='item_id', how='inner')

In [14]:
##-- доля покупок фильма на пользователя
class RateFilmBuys(Transformer):
    def transform(self, df: pyspark.sql.dataframe.DataFrame):
        return df.withColumn("rateFilmBuys", df.SumUserPurchases / df.SumFilmPurchases).na.fill(0)

In [15]:
pipelineTrain = Pipeline(stages=[
    UserPurchase(),
    ItemPurchase()
])

trainFeats = pipelineTrain.fit(train)
train = trainFeats.transform(train)
test = trainFeats.transform(test)

In [16]:
class JoinItems(Transformer):
    def transform(self, df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
        return df.join(items, on='item_id', how='inner')
    
class AddGenresSplit(Transformer):
    def transform(self, df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
        df = df.withColumn("genresSplit", f.split(f.col('genres'), ','))
        cv = CountVectorizer(inputCol='genresSplit', outputCol='genresCV')
        model = cv.fit(df)
        return model.transform(df)
    
class AddTitle(Transformer):
    def transform(self, df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
        tk = Tokenizer(inputCol='title', outputCol='TkTitle')
        df = tk.transform(df)
        cv = CountVectorizer(inputCol='TkTitle', outputCol='TitleCV')
        model = cv.fit(df)
        return model.transform(df)

In [17]:
pipeline_items = Pipeline(stages=[
    RateFilmBuys(),
    JoinItems(),
    AddGenresSplit(),
    AddTitle()
])

itemsFeats = pipeline_items.fit(train)
train = itemsFeats.transform(train)
test = itemsFeats.transform(test)

In [18]:
train.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- SumUserPurchases: long (nullable = true)
 |-- SumFilmPurchases: long (nullable = true)
 |-- rateFilmBuys: double (nullable = false)
 |-- title: string (nullable = true)
 |-- year: double (nullable = false)
 |-- genres: string (nullable = false)
 |-- genresSplit: array (nullable = false)
 |    |-- element: string (containsNull = true)
 |-- genresCV: vector (nullable = true)
 |-- TkTitle: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- TitleCV: vector (nullable = true)



In [19]:
class JoinPrograms(Estimator):
    def __init__(self):
        Transformer.__init__(self)
        self.userWatch = None
        
    def fit(self, df: pyspark.sql.dataframe.DataFrame):
        df = df.withColumn('timeWatch', df.ts_end - df.ts_start)
        self.userWatch = df.groupBy('user_id')\
                                .pivot('item_type')\
                                .agg(f.sum('timeWatch').alias('timeWatchSumByUser'),
                                     f.mean('timeWatch').alias('timeWatchAvgByUser')).na.fill(0)
        return self
    
    def transform(self, df: pyspark.sql.dataframe.DataFrame) -> pyspark.sql.dataframe.DataFrame:
        df = df.join(self.userWatch, on='user_id', how='left').na.fill(0)
        df = df.withColumn('UserSumWatch', df.live_timeWatchSumByUser + df.pvr_timeWatchSumByUser)
        return df

In [20]:
pipeline_programs = Pipeline(stages=[JoinPrograms()])
timeWatch = pipeline_programs.fit(programs)
train = timeWatch.transform(train).cache()
test = timeWatch.transform(test).cache()

In [21]:
dropFeats = ['purchase', 
              'genres', 
              'genresSplit', 
              'title', 
              'TkTitle']
trainColumns = list(set(train.columns) - set(dropFeats))
trainColumns

['SumFilmPurchases',
 'pvr_timeWatchAvgByUser',
 'UserSumWatch',
 'genresCV',
 'user_id',
 'pvr_timeWatchSumByUser',
 'TitleCV',
 'live_timeWatchAvgByUser',
 'rateFilmBuys',
 'live_timeWatchSumByUser',
 'item_id',
 'SumUserPurchases',
 'year']

In [22]:
%%time

from datetime import datetime
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier

print(datetime.now())

inputCols = trainColumns
assembler = VectorAssembler(inputCols=inputCols, outputCol='features')
gbt = GBTClassifier(featuresCol='features', labelCol='purchase', maxDepth=3, maxIter=20, seed=42)
 
pipeline_model = Pipeline(stages=[
    assembler,
    gbt
])

print("Start fitting...")
model = pipeline_model.fit(train)

2022-10-30 18:17:31.420322
Start fitting...
CPU times: user 1.29 s, sys: 365 ms, total: 1.65 s
Wall time: 2h 22min 55s


In [23]:
predictions = model.transform(test)

In [24]:
pred = predictions.select(['user_id','item_id','probability'])\
                        .orderBy(['user_id','item_id']).cache()

In [25]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

firstelement=udf(lambda v:float(v[1]),FloatType())
pred = pred.select(pred.user_id, pred.item_id, firstelement('probability').alias("purchase")).cache()
pred

DataFrame[user_id: int, item_id: int, purchase: float]

In [26]:
%%time

answer = pred.toPandas()

CPU times: user 18.6 s, sys: 962 ms, total: 19.6 s
Wall time: 3min 13s


In [27]:
answer.head()

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.043932
1,1654,678,0.043932
2,1654,691,0.043932
3,1654,696,0.043932
4,1654,763,0.043932


In [28]:
answer.purchase.value_counts()

0.043811    1384450
0.044534      97606
0.044333      74167
0.043932      73074
0.044036      49667
0.044063      48793
0.047735      25868
0.045057      24137
0.045650      22992
0.045318      21408
0.044922      20861
0.044701      20567
0.045882      16436
0.045237      15023
0.046660      13715
0.047386      11997
0.045567      11543
0.044926      10684
0.044843      10178
0.047251       9752
0.049255       9743
0.050780       7936
0.045833       7791
0.045317       7597
0.046171       7046
0.045197       5737
0.046922       4851
0.046497       4759
0.045222       4051
0.044762       3548
             ...   
0.071845          4
0.105894          4
0.102315          4
0.162129          4
0.219778          4
0.154455          3
0.134805          3
0.099057          3
0.220537          3
0.311462          3
0.124793          3
0.091620          3
0.118027          3
0.404339          2
0.218353          2
0.102988          2
0.091879          2
0.077475          2
0.208500          1


In [32]:
%%time

answer.to_csv('lab03.csv')

CPU times: user 7.12 s, sys: 84.4 ms, total: 7.2 s
Wall time: 7.21 s


In [33]:
import pandas as pd

test_answer = pd.read_csv('lab03.csv')

In [34]:
test_answer.head()

Unnamed: 0.1,Unnamed: 0,user_id,item_id,purchase
0,0,1654,336,0.043932
1,1,1654,678,0.043932
2,2,1654,691,0.043932
3,3,1654,696,0.043932
4,4,1654,763,0.043932


In [35]:
sc.stop()