In [1]:
import os
import sys
# Tell PySpark which Python interpreter and Spark installation to use, and
# pass `--num-executors 3` to the pyspark shell launcher.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 3 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Make the Spark-bundled Python sources and the py4j archive importable.
# Every entry is inserted at position 0, so the py4j archive ends up ahead of
# the `python` directory on sys.path.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Configure the application name and obtain the Spark session used by the
# rest of the notebook.
conf = SparkConf().setAll([("spark.app.name", "lab3")])

spark = (SparkSession.builder
                     .config(conf=conf)
                     .getOrCreate())

In [3]:
from pyspark.sql import functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LogisticRegression

In [4]:
# Load the training set: the first row is treated as the header and column
# types are inferred from the data.
df_train = spark.read.csv("/labs/slaba03/laba03_train.csv",
                          header=True,
                          inferSchema=True)

In [5]:
# Load the test set with the same reader settings as the training set:
# header row present, column types inferred.
df_test = spark.read.csv("/labs/slaba03/laba03_test.csv",
                         header=True,
                         inferSchema=True)

## EDA 

In [9]:
# Total number of rows in the training set.
df_train.count()

5032624

In [10]:
# Number of distinct `user_id` values in the training set.
df_train.select("user_id").dropDuplicates().count()

1941

In [11]:
# Number of distinct `item_id` values in the training set.
df_train.select("item_id").dropDuplicates().count()

3704

## Генерация фич

### train

In [12]:
# Features describing how intensively each user purchases: the mean and the
# sum of `purchase` over all of the user's rows in the training set.
user_purch_aggs = [
    F.mean("purchase").alias("user_avg_purch"),
    F.sum("purchase").alias("user_sum_purch"),
]
df_user_purch_stat = df_train.groupBy("user_id").agg(*user_purch_aggs)

In [13]:
# Features describing how intensively each item is purchased: the mean and
# the sum of `purchase` over all of the item's rows in the training set.
item_purch_aggs = [
    F.mean("purchase").alias("item_avg_purch"),
    F.sum("purchase").alias("item_sum_purch"),
]
df_item_purch_stat = df_train.groupBy("item_id").agg(*item_purch_aggs)

In [14]:
# Attach the per-user and per-item purchase statistics to every training row
# (left joins, so no training row is dropped).
# Note: `df_train` is rebound to the joined frame, so running this cell a
# second time would join the statistics onto the already-joined frame.
train_with_user_stat = df_train.join(df_user_purch_stat, on="user_id", how="left")
df_train = train_with_user_stat.join(df_item_purch_stat, on="item_id", how="left")

In [15]:
# Columns fed to the model, in the order they appear in the assembled vector.
features = [
    "user_avg_purch",
    "user_sum_purch",
    "item_avg_purch",
    "item_sum_purch",
]

# Pack the feature columns into a single vector column named "features".
vectorAssembler = (VectorAssembler()
                   .setInputCols(features)
                   .setOutputCol("features"))

# Note: `df_train` is rebound to the frame that carries the "features" column.
df_train = vectorAssembler.transform(df_train)

### test

In [16]:
# Attach the purchase statistics computed on the training set to the test rows.
# A user or item that does not occur in the training set has no matching
# statistics row, so the left join leaves nulls in its feature columns.
# VectorAssembler raises an error on null inputs by default, so those nulls
# are replaced with 0 (no purchases observed) before the vector is assembled.
df_test = (df_test.join(df_user_purch_stat, on="user_id", how="left")
                  .join(df_item_purch_stat, on="item_id", how="left")
                  .fillna(0, subset=["user_avg_purch",
                                     "user_sum_purch",
                                     "item_avg_purch",
                                     "item_sum_purch"]))

In [17]:
# Build the "features" vector column for the test set using the same
# assembler that was applied to the training set.
df_test = vectorAssembler.transform(df_test)

## Обучение модели

In [18]:
# Fit a logistic regression on the assembled training features. After this
# statement `lr` holds the fitted model, not the estimator.
lr = LogisticRegression(
    featuresCol='features',
    labelCol='purchase',
    predictionCol='prediction',
    maxIter=10000,
).fit(df_train)

# Score the test set and sort the result by user, then by item.
predict = lr.transform(df_test).orderBy("user_id", "item_id")

# Bring only the identifier columns and the probability vector to the driver.
pred_pd_df = predict.select("user_id", "item_id", "probability").toPandas()

# Element 1 of the probability vector becomes the `purchase` score
# (presumably the probability of purchase == 1 — confirm the label encoding).
pred_pd_df["purchase"] = pred_pd_df["probability"].apply(lambda proba: proba[1])

submission_columns = ["user_id", "item_id", "purchase"]
final = pred_pd_df[submission_columns]

In [27]:
# Display the submission frame (user_id, item_id, purchase score).
final

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.001193
1,1654,678,0.001193
2,1654,691,0.001193
3,1654,696,0.001388
4,1654,763,0.001271
5,1654,795,0.002052
6,1654,861,0.001269
7,1654,1137,0.001634
8,1654,1159,0.001356
9,1654,1428,0.001279


In [25]:
# Write the submission frame to a CSV file in the author's home directory.
# NOTE(review): the path is absolute and user-specific, and `index=False` is
# not passed, so pandas also writes the row index as an unnamed first column —
# confirm that this is the format the consumer of the file expects.
final.to_csv("/data/home/olga.pogodina/lab03.csv")

In [53]:
# Shut down the Spark session once the results have been written.
spark.stop()