In [41]:
import os
import sys
import json
import re
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [42]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f

from pyspark.ml import Pipeline, Transformer, Estimator

from pyspark.sql.window import Window
from pyspark.ml.feature import MinMaxScaler, VectorAssembler, CountVectorizer

from pyspark.ml.classification import GBTClassifier




import pickle
import pandas as pd
import numpy as np
import pyspark.sql.functions as f
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import StructType, StructField, IntegerType, StringType 
from pyspark.sql.types import TimestampType, LongType, FloatType, ArrayType
from pyspark.ml import Estimator
from pyspark.ml.linalg import VectorUDT

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StandardScaler

from sklearn.linear_model import LogisticRegression
from pyspark import keyword_only
from pyspark.ml import Model, Estimator
from pyspark.ml.param import Param, Params, TypeConverters
from pyspark.ml.param.shared import HasFeaturesCol, HasLabelCol, HasPredictionCol


from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier, LogisticRegression
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [43]:
spark_config = SparkConf()
spark = SparkSession.builder\
                    .config(conf=spark_config)\
                    .getOrCreate()

In [44]:
train_path = '/labs/slaba03/laba03_train.csv'
test_path = '/labs/slaba03/laba03_test.csv'
items_path = '/labs/slaba03/laba03_items.csv'
users_path = '/labs/slaba03/laba03_views_programmes.csv'

In [45]:
main_schema = StructType(fields=[StructField('user_id', IntegerType()), 
                                              StructField('item_id', IntegerType()),
                                              StructField('purchase', IntegerType(), nullable=True), 
                                             ])

In [46]:
items_schema = StructType(fields=[StructField('item_id', IntegerType()), 
                                       StructField('channel_id', IntegerType()),
                                       StructField('datetime_availability_start', StringType()),
                                       StructField('datetime_availability_stop', StringType()),
                                       StructField('datetime_show_start', StringType()),
                                       StructField('datetime_show_stop', StringType()),
                                       StructField('content_type', IntegerType()),
                                       StructField('title', StringType(), nullable=True),
                                       StructField('year', FloatType(), nullable=True),
                                       StructField('genres', StringType()),
                                       StructField('region_id', IntegerType()),
                                      ])

In [47]:
users_schema = StructType(fields=[StructField('user_id', IntegerType()), 
                                       StructField('item_id', IntegerType()),
                                       StructField('ts_start', IntegerType()),
                                       StructField('ts_end', IntegerType()),
                                       StructField('item_type', StringType()),
                                      ])

In [48]:
df_train = spark.read\
                .format('csv')\
                .schema(main_schema)\
                .option("header", "true")\
                .load(train_path)\
                .cache()  


In [49]:
df_test = spark.read\
                .format('csv')\
                .schema(main_schema)\
                .option("header", "true")\
                .load(test_path)\
                 .cache()  


In [50]:
df_items = spark.read\
                .format('csv')\
                .schema(items_schema)\
                .option("header", "true")\
                .option("delimiter", "\\t")\
                .load(items_path)\
                 .select(['item_id', 'content_type', 'year', 'genres'])\
                 .na.fill({'year': -999, 'genres': 'unknown'}).cache() 

In [51]:
df_users =  spark.read\
                .format('csv')\
                .schema(users_schema)\
                .option("header", "true")\
                .load(users_path)\
                .cache()  

In [52]:
df_train=df_train.join(df_items[["item_id","content_type", "year", "genres"]], ["item_id"])

In [53]:
df_test=df_test.join(df_items[["item_id","content_type", "year", "genres"]], ["item_id"])

In [54]:
df_mean_user_purchase=df_train.groupBy("user_id").mean("purchase")
df_mean_item_purchase=df_train.groupBy("item_id").mean("purchase")

In [55]:
df_mean_user_purchase=df_mean_user_purchase.withColumnRenamed("avg(purchase)","mean_user_purchase")
df_mean_item_purchase=df_mean_item_purchase.withColumnRenamed("avg(purchase)","mean_item_purchase")

In [56]:
df_mean_user_purchase.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- mean_user_purchase: double (nullable = true)



In [57]:
df_mean_item_purchase.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- mean_item_purchase: double (nullable = true)



In [58]:
df_train=df_train.join(df_mean_user_purchase, ["user_id"])
df_train=df_train.join(df_mean_item_purchase, ["item_id"])

In [59]:
df_train.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- year: float (nullable = false)
 |-- genres: string (nullable = false)
 |-- mean_user_purchase: double (nullable = true)
 |-- mean_item_purchase: double (nullable = true)



In [60]:
df_test=df_test.join(df_mean_user_purchase, ["user_id"])
df_test=df_test.withColumnRenamed("avg(purchase)","mean_user_purchase")
df_test=df_test.join(df_mean_item_purchase, ["item_id"])
df_test=df_test.withColumnRenamed("avg(purchase)","mean_item_purchase")

In [71]:
gbt = GBTClassifier(labelCol="purchase", featuresCol="features", maxIter=50, maxDepth=3)

In [74]:
lr = LogisticRegression(labelCol="purchase", featuresCol="features")

In [76]:
lr = LogisticRegression(labelCol="purchase", featuresCol="features", maxIter=100, regParam=0.01, elasticNetParam=0.1)

In [77]:
features_col = ["mean_user_purchase", "mean_item_purchase", "content_type", "year"]

In [78]:
assembler = VectorAssembler(inputCols=features_col, outputCol="features")

In [79]:
from pyspark.ml.feature import StringIndexer
qualification_indexer = StringIndexer(inputCol="genres", outputCol="genres_index")

In [80]:
pipeline = Pipeline(stages=[assembler, lr]) # 

In [81]:
model = pipeline.fit(df_train)

In [82]:
predictions = model.transform(df_test)

In [33]:
total_df_1 = predictions.select(['user_id','item_id','probability'])\
                    .withColumnRenamed("probability","purchase")\
                        .orderBy(['user_id','item_id'])

In [34]:
total_df_1.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: vector (nullable = true)



In [35]:
total_df_1.show(1)

+-------+-------+--------------------+
|user_id|item_id|            purchase|
+-------+-------+--------------------+
|   1654|    336|[0.99869163353666...|
+-------+-------+--------------------+
only showing top 1 row



In [83]:
total_df = predictions.select(['user_id','item_id','probability'])\
                        .orderBy(['user_id','item_id']).toPandas()
total_df['purchase'] = total_df['probability'].apply(lambda x: x[1])
total_df = total_df.drop('probability', axis=1)
total_df.to_csv('lab03.csv')

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [84]:
spark.stop()