In [1]:
import os
import sys
# Environment for PySpark: interpreter used by the workers, the Spark
# installation directory, and the arguments passed to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ["SPARK_HOME"]

# Put Spark's bundled Python packages at the front of the import path.
# Insertion order matters: the py4j archive ends up ahead of the `python` dir.
for rel_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, rel_path))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Build the Spark configuration and start (or reuse) the session.
# NOTE(review): the application name is given twice with different values
# ("laba3 app" via conf, "laba3" via appName()); presumably only one of them
# takes effect — confirm and drop the other.
conf = SparkConf().set("spark.app.name", "laba3 app")

session_builder = SparkSession.builder.config(conf=conf).appName("laba3")
spark = session_builder.getOrCreate()

In [3]:
# Display the session object as the cell output.
spark

In [4]:
# Keep a handle to the SparkContext that backs the session.
sc = spark.sparkContext

# train

In [116]:
# import pyspark.sql.functions as f
# from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType, DataType, FloatType
# schema_train = StructType(fields=[
#     StructField("user_id", IntegerType()),
#     StructField("item_id", IntegerType()),
#     StructField("purchase", IntegerType())
# ])

# df_train = spark.read.csv('/labs/slaba03/laba03_train.csv', schema=schema_train)
# df_train = df_train.filter(df_train.user_id.isNotNull() & df_train.item_id.isNotNull() & df_train.purchase.isNotNull())

In [5]:
# Read the training CSV (comma-separated, with a header row); column types
# are inferred from the data.
train = (
    spark.read
    .format("csv")
    .options(sep=",", header=True, inferSchema=True)
    .load('/labs/slaba03/laba03_train.csv')
)

In [6]:
# Preview the first 5 rows of the training DataFrame.
train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



# test

In [79]:
# schema_test = StructType(fields=[
#     StructField("user_id", IntegerType()),
#     StructField("item_id", IntegerType())
# ])
# df_test = spark.read.csv('/labs/slaba03/laba03_test.csv', schema=schema_test)
# df_test = df_test.filter(df_test.user_id.isNotNull() & df_test.item_id.isNotNull())

In [56]:
# Read the test CSV with the same reader options as the training file.
test = (
    spark.read
    .format("csv")
    .options(sep=",", header=True, inferSchema=True)
    .load('/labs/slaba03/laba03_test.csv')
)

In [57]:
# Keep only the two key columns of the test set.
test = test.select(['user_id', 'item_id'])

In [58]:
# Preview the first 5 rows of the test DataFrame.
test.show(5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



# Добавление признаков и объединение с train

In [14]:
from pyspark.sql import functions as f
from pyspark.ml.feature import VectorAssembler

In [15]:
# Preview train once more before the aggregate features are added.
train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
|   1654|  89901|       0|
|   1654| 100504|       0|
+-------+-------+--------+
only showing top 5 rows



In [16]:
# Per-user purchase rate: the mean of `purchase` over each user's rows in train.
mean_user_id = (
    train
    .groupBy('user_id')
    .agg(f.avg('purchase').alias('mean_user_id'))
)
mean_user_id.show(5)

+-------+--------------------+
|user_id|        mean_user_id|
+-------+--------------------+
| 927211|3.916960438699569E-4|
| 928140|3.869969040247678E-4|
| 867850|3.829950210647261...|
| 870928|7.674597083653108E-4|
| 879401|0.004283489096573208|
+-------+--------------------+
only showing top 5 rows



In [17]:
# Per-item purchase rate: the mean of `purchase` over each item's rows in train.
mean_item_id = (
    train
    .groupBy('item_id')
    .agg(f.avg('purchase').alias('mean_item_id'))
)
mean_item_id.show(5)

+-------+--------------------+
|item_id|        mean_item_id|
+-------+--------------------+
|   8638|0.001450326323422...|
|  95940|   7.097232079489E-4|
|  78113|0.001468428781204...|
|  74757|7.358351729212656E-4|
|  94819|7.163323782234957E-4|
+-------+--------------------+
only showing top 5 rows



In [18]:
# Attach both aggregates to every train row; left joins keep all train rows.
train = (
    train
    .join(mean_user_id, on='user_id', how='left')
    .join(mean_item_id, on='item_id', how='left')
)

In [19]:
# Preview train with the mean_user_id / mean_item_id columns joined in.
train.show(5)

+-------+-------+--------+--------------------+--------------------+
|item_id|user_id|purchase|        mean_user_id|        mean_item_id|
+-------+-------+--------+--------------------+--------------------+
|   8389| 754230|       0|0.027575641516660282|0.005979073243647235|
|   8389| 780033|       0|7.757951900698216E-4|0.005979073243647235|
|   8389| 798454|       0|3.840245775729646...|0.005979073243647235|
|   8389| 825061|       0|0.001931247585940...|0.005979073243647235|
|   8389| 833685|       0|0.007500986971969996|0.005979073243647235|
+-------+-------+--------+--------------------+--------------------+
only showing top 5 rows



In [22]:
# Columns packed into the single `features` vector consumed by the model.
feature_columns = ["item_id", "user_id", "mean_user_id", "mean_item_id"]
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

In [24]:
# Add the assembled `features` column to train (rebinds `train`).
train = vector_assembler.transform(train)

In [28]:
# Hold out ~20% of train for validation (test_x_y); fit on the other ~80% (train_x_y).
# A fixed seed makes the split, and therefore the reported metric, reproducible.
# NOTE(review): mean_user_id / mean_item_id were aggregated over all of train
# before this split, so hold-out labels leak into the features and the
# validation ROC AUC is likely optimistic.
test_x_y, train_x_y = train.randomSplit(weights=[0.2, 0.8], seed=42)

# Обучение модели и расчёт метрики (ROC AUC)

In [26]:
from pyspark.ml.classification import LogisticRegression

In [27]:
# Logistic-regression estimator: `features` vector in, `purchase` as the label,
# capped at 20 iterations.
logreg = LogisticRegression(
    maxIter=20,
    labelCol='purchase',
    featuresCol='features',
)

In [31]:
# Fit on the ~80% split. Note that `logreg` is rebound from the estimator to
# the result of fit(), which the following cells use via .transform().
logreg = logreg.fit(train_x_y)

In [32]:
# Score the held-out ~20% split with the fitted model.
prediction = logreg.transform(test_x_y)

In [36]:
# Keep the keys, the true label and the predicted probability vector.
prediction = prediction.select(['item_id', 'user_id', 'purchase', 'probability'])

In [40]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [42]:
# Area-under-ROC evaluator; the `probability` column is supplied as the raw
# prediction column and `purchase` as the label.
metrics_roc_auc = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol='purchase',
    rawPredictionCol='probability',
)

In [46]:
# Compute area under ROC on the hold-out predictions.
metrics_roc_auc.evaluate(prediction)

0.8984384790299953

In [None]:
# Retrain on the entire train dataset

In [47]:
# Refit with the same settings, this time on every row of train;
# `logreg` ends up bound to the result of fit().
logreg = LogisticRegression(
    featuresCol='features', labelCol='purchase', maxIter=20
).fit(train)

# Получение предсказания для test

In [59]:
# Preview the test DataFrame before the aggregate features are joined in.
test.show(5)

+-------+-------+
|user_id|item_id|
+-------+-------+
|   1654|  94814|
|   1654|  93629|
|   1654|   9980|
|   1654|  95099|
|   1654|  11265|
+-------+-------+
only showing top 5 rows



In [60]:
# Attach the per-user and per-item purchase rates computed from train.
# A left join leaves nulls for user/item ids that never occur in train, and
# VectorAssembler raises on null inputs by default, so those rows fall back
# to the overall purchase rate of train.
global_purchase_rate = train.agg(f.mean('purchase')).first()[0]
test = (
    test
    .join(mean_user_id, on=['user_id'], how='left')
    .join(mean_item_id, on=['item_id'], how='left')
    .fillna(global_purchase_rate, subset=['mean_user_id', 'mean_item_id'])
)

In [61]:
# Preview test with the mean_user_id / mean_item_id columns joined in.
test.show(5)

+-------+-------+--------------------+--------------------+
|item_id|user_id|        mean_user_id|        mean_item_id|
+-------+-------+--------------------+--------------------+
|   8389| 886063|0.002698535080956...|0.005979073243647235|
|   8389| 900335|0.004615384615384616|0.005979073243647235|
|   8389| 936359|7.613247049866769E-4|0.005979073243647235|
|   8389| 901323|3.846153846153846E-4|0.005979073243647235|
|   8389| 928231|7.584376185058779E-4|0.005979073243647235|
+-------+-------+--------------------+--------------------+
only showing top 5 rows



In [67]:
# Add the assembled `features` column to test with the same assembler used for train.
test = vector_assembler.transform(test)

In [68]:
# Score the test set; rebinds `prediction`, which previously held the validation scores.
prediction = logreg.transform(test)

In [72]:
# Keep the keys and the predicted probability vector.
prediction = prediction.select(['item_id', 'user_id', 'probability'])

In [73]:
# Preview the first 5 scored test rows.
prediction.show(5)

+-------+-------+--------------------+
|item_id|user_id|         probability|
+-------+-------+--------------------+
|   8389| 901323|[0.99763726387370...|
|   8389| 928231|[0.99763832845755...|
|   8389| 852684|[0.99752570951973...|
|   8389| 853468|[0.99755620396093...|
|   8389| 877109|[0.99743339728720...|
+-------+-------+--------------------+
only showing top 5 rows



In [81]:
# Order by user then item and collect to the driver as a pandas DataFrame.
# From here on `prediction` is a pandas object, not a Spark DataFrame.
prediction = prediction.orderBy("user_id", "item_id").toPandas()

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [82]:
# Put the columns in the output order: user, item, probability.
prediction = prediction.loc[:, ['user_id', 'item_id', 'probability']]

In [85]:
# The probability column is published under the name `purchase`.
column_renames = {'probability': 'purchase'}
prediction = prediction.rename(columns=column_renames)

In [87]:
# Each cell holds a probability vector; keep only the element at index 1.
prediction['purchase'] = prediction['purchase'].map(lambda vec: vec[1])

In [88]:
# Preview the final pandas frame.
prediction.head()

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.002307
1,1654,678,0.002305
2,1654,691,0.002305
3,1654,696,0.002613
4,1654,763,0.00245


In [None]:
# Kept as a comment: as bare code this line is a SyntaxError and breaks Run All.
# Presumably the expected header of the output CSV (leading empty index column):
# ,user_id,item_id,purchase

In [89]:
# Write the result; the row index is written as the first, unnamed column.
prediction.to_csv('/data/home/yakov.zakharov/lab03.csv', index=True)

In [90]:
import pickle

In [None]:
# Persist the fitted model object with pickle.
# The handle is named `model_file` rather than `f` so that it does not rebind
# the `f` alias of pyspark.sql.functions used by the feature-engineering cells.
# NOTE(review): `logreg` comes from a Spark ML fit(); pickling such objects may
# fail — consider `logreg.write().overwrite().save(<path>)` instead. TODO confirm.
with open("/data/home/yakov.zakharov/lec-4/logistic_model.pk", "wb") as model_file:
    pickle.dump(logreg, model_file)