In [1]:
import os
import sys
# Environment for the Spark launcher; set here, before pyspark is imported in
# the following cell.
os.environ.update({
    "PYSPARK_SUBMIT_ARGS": '--num-executors 9 pyspark-shell',
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
})

spark_home = os.environ.get('SPARK_HOME', None)

# Put Spark's bundled Python sources at the front of the import path:
# the py4j archive first, then the pyspark package directory.
sys.path[0:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Spark configuration carrying only the application name.
conf = SparkConf().set("spark.app.name", "roman_matiiv_lab3")

# Reuse an already running session if there is one, otherwise start a new one.
spark = (SparkSession.builder
                     .config(conf=conf)
                     .getOrCreate())

In [19]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LogisticRegression

In [4]:
# Training set, read from CSV with a header row and inferred column types.
# NOTE: the variable is spelled `df_trian` (sic); later cells refer to it
# under this exact name.
df_trian = spark.read.csv("/labs/slaba03/laba03_train.csv",
                          header=True,
                          inferSchema=True)

In [5]:
# Test set, read from CSV with a header row and inferred column types.
df_test = spark.read.csv("/labs/slaba03/laba03_test.csv",
                         header=True,
                         inferSchema=True)

In [6]:
# Print the inferred schema of the test set.
# NOTE: in the recorded output `purchase` is inferred as string here, while
# the training set has it as integer.
df_test.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: string (nullable = true)



## EDA 

In [7]:
# Print the inferred schema of the training set.
df_trian.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- purchase: integer (nullable = true)



In [8]:
# Preview the first three rows of the training set.
df_trian.show(3)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|       0|
|   1654|  89249|       0|
|   1654|  99982|       0|
+-------+-------+--------+
only showing top 3 rows



In [9]:
# Total number of rows in the training set.
df_trian.count()

5032624

In [10]:
# Number of distinct users in the training set.
df_trian.select("user_id").distinct().count()

1941

In [11]:
# Number of distinct items in the training set.
df_trian.select("item_id").distinct().count()

3704

## Генерация фичей 

### train

In [12]:
# Per-user features: how intensively each user buys, as the mean and the sum
# of `purchase` over that user's training rows.
df_user_purch_stat = df_trian.groupBy("user_id").agg(
    F.mean("purchase").alias("user_avg_purch"),
    F.sum("purchase").alias("user_sum_purch"),
)

In [13]:
# Per-item features: how intensively each item is bought, as the mean and the
# sum of `purchase` over that item's training rows.
df_item_purch_stat = df_trian.groupBy("item_id").agg(
    F.mean("purchase").alias("item_avg_purch"),
    F.sum("purchase").alias("item_sum_purch"),
)

In [14]:
# Attach the per-user and then the per-item aggregates to every training row.
# NOTE(review): the aggregates are computed from the same `purchase` values
# that are later used as the label; confirm this is intended.
with_user_stats = df_trian.join(df_user_purch_stat, "user_id", "left")
df_train = with_user_stats.join(df_item_purch_stat, "item_id", "left")

In [15]:
# Names of the aggregate columns that are fed to the model.
features = ["user_avg_purch",
            "user_sum_purch",
            "item_avg_purch",
            "item_sum_purch"]

# Packs the columns listed above into a single vector column named "features".
vectorAssembler = VectorAssembler(inputCols=features,
                                  outputCol="features")

# `df_train` is reassigned in place, so on a second run of this cell it would
# already contain a "features" column and the transform would fail because the
# output column exists. Dropping it first (a no-op when the column is absent)
# makes the cell safe to re-run.
df_train = vectorAssembler.transform(df_train.drop("features"))

### test

In [16]:
# Columns contributed by the two aggregate tables, i.e. everything except the
# join keys.
stat_cols = ([c for c in df_user_purch_stat.columns if c != "user_id"]
             + [c for c in df_item_purch_stat.columns if c != "item_id"])

# Attach the train-derived user/item aggregates to the test rows.
#  * drop(*stat_cols): `df_test` is reassigned in place, so a second run of
#    this cell would otherwise join the same columns again and leave
#    ambiguous duplicates. Dropping absent columns is a no-op.
#  * fillna(0, ...): the left joins leave nulls for users or items that have
#    no rows in the training set, and the vector assembler applied afterwards
#    rejects null inputs. Zero is used as the "no purchases observed" value.
#    NOTE(review): confirm 0 is the desired default for unseen ids.
df_test = (df_test.drop(*stat_cols)
                  .join(df_user_purch_stat, on="user_id", how="left")
                  .join(df_item_purch_stat, on="item_id", how="left")
                  .fillna(0, subset=stat_cols))

In [17]:
# Build the "features" vector column for the test rows with the same assembler
# that was used for the training set. `df_test` is reassigned in place, so a
# stale "features" column from a previous run is dropped first (a no-op when
# the column is absent); otherwise re-running this cell fails because the
# output column already exists.
df_test = vectorAssembler.transform(df_test.drop("features"))

## Обучение модели

In [45]:
# Logistic-regression estimator: reads the assembled "features" vector,
# uses `purchase` as the label and is capped at 10000 iterations.
lr = LogisticRegression(
    maxIter=10000,
    featuresCol='features',
    labelCol='purchase',
    predictionCol='prediction',
)

In [46]:
# Fit the estimator on the training set.
# NOTE(review): this rebinds `lr` from the estimator to the fitted model, and
# the next cell relies on that. Re-running this cell on its own will
# presumably fail, because `lr` is then a model rather than an estimator;
# re-run the estimator cell first.
lr = lr.fit(df_train)

In [47]:
# Score the test set with the fitted model (`lr` was rebound to the model by
# the fit cell). The `probability` column of the result is read further below.
predict = lr.transform(df_test)

In [48]:
# Sort the scored rows by user and then by item.
predict = predict.orderBy("user_id", "item_id")

In [49]:
# Collect the ids and the probability vector to the driver as a pandas frame.
# The vector column is not Arrow-convertible, so Spark falls back to the
# non-Arrow path (see the warning in the recorded output).
pred_pd_df = (predict.select("user_id", "item_id", "probability")
                     .toPandas())

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [50]:
# Keep only element 1 of each probability vector, presumably the probability
# of the positive class (purchase == 1).
pred_pd_df["purchase"] = pred_pd_df["probability"].apply(
    lambda proba_vec: proba_vec[1]
)

In [51]:
# Final submission columns, in output order.
a = pred_pd_df.loc[:, ["user_id", "item_id", "purchase"]]

In [52]:
# Write the submission file.
# NOTE(review): `index` is not passed, so pandas' default applies and the row
# index is written as an unnamed first column; confirm the consumer of this
# file expects it. The destination is a hard-coded absolute path.
a.to_csv("/data/home/roman.matiiv/lab03.csv")

In [53]:
# Stop the Spark session created at the top of the notebook.
spark.stop()