In [103]:
import glob
import os
import sys
import json

# Point PySpark at the cluster's Python interpreter and Spark installation.
os.environ["PYSPARK_PYTHON"] = '/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"] = '/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"] = '--num-executors 3 pyspark-shell'

spark_home = os.environ["SPARK_HOME"]

# Find the bundled py4j archive instead of pinning one exact version; fall back to the
# previously hard-coded file name when nothing matches (e.g. the directory is not mounted).
_py4j_archives = sorted(glob.glob(os.path.join(spark_home, 'python', 'lib', 'py4j-*-src.zip')))
_py4j_zip = (
    _py4j_archives[-1]
    if _py4j_archives
    else os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip')
)

# Prepend both locations so `import pyspark` resolves; skip entries that are already
# present so re-running the cell does not pile up duplicates. The py4j archive is
# inserted last and therefore ends up in front of the `python` directory.
for _entry in (os.path.join(spark_home, 'python'), _py4j_zip):
    if _entry not in sys.path:
        sys.path.insert(0, _entry)

In [104]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# The application name is set both on the SparkConf and on the builder.
app_name = "baryshev konstantin"

conf = SparkConf()
conf.set("spark.app.name", app_name)

# Reuse an existing session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .appName(app_name)
    .getOrCreate()
)

In [183]:
# Echo the SparkSession object as this cell's output.
spark

### Данные

In [105]:
from pyspark.sql.types import (
    FloatType,
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
    TimestampType)

In [106]:
# Column layout shared by the train and test CSV files: three nullable integer columns.
schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("item_id", IntegerType(), True)
    .add("purchase", IntegerType(), True)
)

In [107]:
# Training interactions: comma-separated CSV with a header row, parsed with `schema`.
df_train = spark.read.csv(
    "/labs/slaba03/laba03_train.csv",
    schema=schema,
    header=True,
    sep=",",
)

In [108]:
# Test interactions: same file layout and schema as the training file.
df_test = spark.read.csv(
    "/labs/slaba03/laba03_test.csv",
    schema=schema,
    header=True,
    sep=",",
)

In [109]:
# Column layout of the items catalogue; every column is nullable.
# NOTE: this rebinds the name `schema` that the train/test readers used above.
_item_columns = [
    ("item_id", IntegerType()),
    ("channel_id", IntegerType()),
    ("datetime_availability_start", TimestampType()),
    ("datetime_availability_stop", TimestampType()),
    ("datetime_show_start", TimestampType()),
    ("datetime_show_stop", TimestampType()),
    ("content_type", IntegerType()),
    ("title", StringType()),
    ("year", DoubleType()),
    ("genres", StringType()),
    ("region_id", IntegerType()),
]
schema = StructType([StructField(name, dtype, True) for name, dtype in _item_columns])

In [110]:
# Items catalogue: tab-separated file with a header row, parsed with the items `schema`.
df_items = spark.read.csv(
    "/labs/slaba03/laba03_items.csv",
    schema=schema,
    header=True,
    sep="\t",
)

In [9]:
# Preview the first catalogue record, one field per line (vertical layout).
df_items.show(1, truncate=True, vertical=True)

-RECORD 0-------------------------------------------
 item_id                     | 65667                
 channel_id                  | null                 
 datetime_availability_start | 1970-01-01 03:00:00  
 datetime_availability_stop  | 2018-01-01 03:00:00  
 datetime_show_start         | null                 
 datetime_show_stop          | null                 
 content_type                | 1                    
 title                       | на пробах только ... 
 year                        | 2013.0               
 genres                      | Эротика              
 region_id                   | null                 
only showing top 1 row



In [111]:
# Column layout of the programme-views log; every column is nullable.
# NOTE: rebinds `schema` once more, so the reader cell below must run right after this one.
schema = (
    StructType()
    .add("user_id", IntegerType(), True)
    .add("item_id", IntegerType(), True)
    .add("ts_start", IntegerType(), True)
    .add("ts_end", IntegerType(), True)
    .add("item_type", StringType(), True)
)

In [112]:
# Programme-views log: comma-separated CSV with a header row, parsed with the views `schema`.
df_views_programmes = spark.read.csv(
    "/labs/slaba03/laba03_views_programmes.csv",
    schema=schema,
    header=True,
    sep=",",
)

In [163]:
# Preview two rows.
# NOTE(review): the recorded output lists user_id / avg_time_view / sum_time_view, i.e. this
# cell was last executed after `df_views_programmes` had been reassigned to the per-user
# aggregate further down. On a fresh top-to-bottom run it would show the raw view rows.
df_views_programmes.show(2)

+-------+-------------+-------------+
|user_id|avg_time_view|sum_time_view|
+-------+-------------+-------------+
| 561425|       7539.8|        37699|
| 612390|       4406.0|         4406|
+-------+-------------+-------------+
only showing top 2 rows



In [113]:
# Columns inspected by the missing-value check in the next cell.
features_ = [
    # keys and target
    'user_id', 'item_id', 'purchase',
    # per-item columns
    'avg_cnt_buy_item', 'cnt_buys_item', 'flag_buys_item', 'year',
    # per-user columns
    'cnt_buys_user', 'flag_buys_user', 'avg_cnt_buy_user',
    'avg_time_view', 'sum_time_view',
]

In [None]:
# Count missing values per column of `features_` in df_train_full.
# A value counts as missing when it is NaN or null. Previously the NaN count was computed
# and immediately overwritten by the null-only count, so NaNs were never reported.
# NOTE: `df_train_full` is built further down in the notebook; run this cell after it.
from pyspark.sql.functions import col, count, isnan, when

result = df_train_full.select(
    [
        count(when(isnan(c) | col(c).isNull(), c)).alias(c)
        for c in features_
    ]
)
result.show()

### 2. Фичи инжиниринг

In [115]:
from pyspark.sql.functions import udf


def _positive_flag(value):
    """Return 1 when `value` is greater than zero, otherwise 0.

    A missing value (None, i.e. a SQL null) yields 0 instead of raising TypeError
    on the `None > 0` comparison.
    """
    return 1 if value is not None and value > 0 else 0


# Spark UDF: integer 0/1 flag telling whether the input count is positive.
flag_convert = udf(_positive_flag, IntegerType())

In [182]:
# Inspect the full feature row of one (user, item) pair, untruncated and in vertical layout.
# NOTE: uses `f` and `df_train_full`, both of which are defined in cells further down.
is_target_pair = (f.col("item_id") == 74107) & (f.col("user_id") == 1654)
df_train_full.where(is_target_pair).show(n=1, truncate=False, vertical=True)

-RECORD 0----------------------------------
 user_id          | 1654                   
 item_id          | 74107                  
 purchase         | 0                      
 avg_cnt_buy_item | 7.575757575757576E-4   
 cnt_buys_item    | 1                      
 flag_buys_item   | 1                      
 year             | 2011.0                 
 genres_array     | [Драмы, Зарубежные]    
 cnt_buys_user    | 5                      
 flag_buys_user   | 1                      
 avg_cnt_buy_user | 0.0019470404984423676  
 avg_time_view    | 1844.3989898989898     
 sum_time_view    | 365191                 
 genres_vector    | (80,[32,45],[1.0,1.0]) 



#### 2.1 ITEMS features

In [13]:
# Number of distinct users that appear in the training interactions.
n_train_users = df_train.select('user_id').distinct().count()
print(f"Число уник-ых пользователей в train: {n_train_users}")

Число уник-ых пользователей в train: 1941


In [116]:
from pyspark.sql.functions import col
import pyspark.sql.functions as f

In [117]:
# Per-item purchase statistics computed from the training interactions.
purchases_by_item = df_train.groupby("item_id")

# Mean of `purchase` per item.
avg_buys_items = (
    purchases_by_item
    .agg({"purchase": 'avg'})
    .withColumnRenamed('avg(purchase)', "avg_cnt_buy_item")
)

# Sum of `purchase` per item, plus a 0/1 flag telling whether that sum is positive.
cnt_buys_items = (
    purchases_by_item
    .sum("purchase")
    .withColumnRenamed('sum(purchase)', "cnt_buys_item")
)
cnt_buys_items = cnt_buys_items.withColumn(
    "flag_buys_item",
    flag_convert(cnt_buys_items.cnt_buys_item),
)

In [118]:
# Replace missing catalogue values with sentinels: -999 for `year`, 'unknown' for `genres`.
item_defaults = {'year': -999, 'genres': 'unknown'}
df_items = df_items.fillna(item_defaults)

In [119]:
# Split the comma-separated `genres` string of each item into an array column.
genre_tokens = f.split(f.col("genres"), ',')
df_items = df_items.withColumn("genres_array", genre_tokens)

In [120]:
# Per-item feature table: purchase statistics joined (inner, on item_id) with the
# catalogue's `year` and `genres_array`.
item_catalogue_cols = df_items.select('item_id', 'year', 'genres_array')
items_features = (
    avg_buys_items
    .join(cnt_buys_items, on=['item_id'])
    .join(item_catalogue_cols, on=['item_id'])
)

In [144]:
# Preview five rows of the per-item feature table.
items_features.show(5)

+-------+--------------------+-------------+--------------+------+--------------------+
|item_id|    avg_cnt_buy_item|cnt_buys_item|flag_buys_item|  year|        genres_array|
+-------+--------------------+-------------+--------------+------+--------------------+
|   8389|0.005979073243647235|            8|             1|1981.0|[Мультфильмы, Дет...|
|   8638|0.001450326323422...|            2|             1|2012.0|[Ужасы, Комедии, ...|
|  10817|7.380073800738007E-4|            1|             1|2013.0|[Документальные, ...|
|  72820|7.390983000739098E-4|            1|             1|2016.0|[Драмы, Мелодрамы...|
|  74757|7.358351729212656E-4|            1|             1|2012.0|[Мистические, Ужа...|
+-------+--------------------+-------------+--------------+------+--------------------+
only showing top 5 rows



#### 2.2 User features

In [122]:
# Mean of `purchase` per user over the training interactions.
avg_buys_users = (
    df_train
    .groupby("user_id")
    .agg({"purchase": 'avg'})
    .withColumnRenamed('avg(purchase)', "avg_cnt_buy_user")
)

In [123]:
# Sum of `purchase` per user, plus a 0/1 flag telling whether that sum is positive.
cnt_buys_users = (
    df_train
    .groupby("user_id")
    .sum("purchase")
    .withColumnRenamed('sum(purchase)', 'cnt_buys_user')
)
cnt_buys_users = cnt_buys_users.withColumn(
    "flag_buys_user",
    flag_convert(cnt_buys_users.cnt_buys_user),
)

In [124]:
# Add the length of every view: ts_end minus ts_start.
view_length = f.col("ts_end") - f.col("ts_start")
df_views_programmes = df_views_programmes.withColumn('ts_delta', view_length)

In [125]:
# Collapse the view log to one row per user: mean and total of `ts_delta`.
# NOTE: rebinds `df_views_programmes` to the aggregate; the raw log is no longer reachable
# under this name afterwards.
df_views_programmes = (
    df_views_programmes
    .groupby('user_id')
    .agg(
        f.mean('ts_delta').alias('avg_time_view'),
        f.sum('ts_delta').alias('sum_time_view'),
    )
)

In [126]:
# Per-user feature table: purchase statistics (inner join) plus viewing-time statistics.
# The views are left-joined, so users without any view rows keep nulls in those columns.
users_features = (
    cnt_buys_users
    .join(avg_buys_users, ["user_id"])
    .join(df_views_programmes, ["user_id"], how='left')
)

In [23]:
# Preview two rows of the per-user feature table.
users_features.show(2)

+-------+-------------+--------------+--------------------+------------------+-------------+
|user_id|cnt_buys_user|flag_buys_user|    avg_cnt_buy_user|     avg_time_view|sum_time_view|
+-------+-------------+--------------+--------------------+------------------+-------------+
| 754230|           72|             1|0.027575641516660282|1938.5352233676977|      2256455|
| 761341|            1|             1|3.875968992248062E-4| 2555.931818181818|       112461|
+-------+-------------+--------------+--------------------+------------------+-------------+
only showing top 2 rows



In [24]:
# Row count of the per-user feature table (its inputs are all grouped by user_id).
users_features.count()

1941

In [25]:
# Mark the per-user feature table for caching; it is joined into both the train and test frames.
users_features.cache()

DataFrame[user_id: int, cnt_buys_user: bigint, flag_buys_user: int, avg_cnt_buy_user: double, avg_time_view: double, sum_time_view: bigint]

#### ITEM FEATURES + USER FEATURES

In [26]:
# Preview two rows of the per-item feature table before joining.
items_features.show(2)

+-------+--------------------+-------------+--------------+------+--------------------+
|item_id|    avg_cnt_buy_item|cnt_buys_item|flag_buys_item|  year|        genres_array|
+-------+--------------------+-------------+--------------+------+--------------------+
|   8389|0.005979073243647235|            8|             1|1981.0|[Мультфильмы, Дет...|
|   8638|0.001450326323422...|            2|             1|2012.0|[Ужасы, Комедии, ...|
+-------+--------------------+-------------+--------------+------+--------------------+
only showing top 2 rows



In [167]:
# Preview two rows of the per-user feature table before joining.
users_features.show(2)

+-------+-------------+--------------+--------------------+------------------+-------------+
|user_id|cnt_buys_user|flag_buys_user|    avg_cnt_buy_user|     avg_time_view|sum_time_view|
+-------+-------------+--------------+--------------------+------------------+-------------+
| 754230|           72|             1|0.027575641516660282|1938.5352233676977|      2256455|
| 761341|            1|             1|3.875968992248062E-4| 2555.931818181818|       112461|
+-------+-------------+--------------+--------------------+------------------+-------------+
only showing top 2 rows



In [127]:
# Training frame: every (user, item, purchase) row enriched with its item features
# and its user features (both inner joins).
df_train_full = (
    df_train
    .join(items_features, ["item_id"])
    .join(users_features, ["user_id"])
)

In [128]:
# Preview two rows; positional arguments are n=2, truncate=False, vertical=True.
df_train_full.show(2, False, True)

-RECORD 0----------------------------------------
 user_id          | 754230                       
 item_id          | 9782                         
 purchase         | 0                            
 avg_cnt_buy_item | 0.002191380569758948         
 cnt_buys_item    | 3                            
 flag_buys_item   | 1                            
 year             | 2015.0                       
 genres_array     | [Ужасы, Детективы, Триллеры] 
 cnt_buys_user    | 72                           
 flag_buys_user   | 1                            
 avg_cnt_buy_user | 0.027575641516660282         
 avg_time_view    | 1938.5352233676977           
 sum_time_view    | 2256455                      
-RECORD 1----------------------------------------
 user_id          | 754230                       
 item_id          | 10208                        
 purchase         | 0                            
 avg_cnt_buy_item | 0.003668378576669112         
 cnt_buys_item    | 5                            


In [145]:
# List the columns of the training frame.
# NOTE(review): the recorded output already contains 'genres_vector', which is only added by
# the HashingTF cell below, so this cell was last executed out of order.
df_train_full.columns

['user_id',
 'item_id',
 'purchase',
 'avg_cnt_buy_item',
 'cnt_buys_item',
 'flag_buys_item',
 'year',
 'genres_array',
 'cnt_buys_user',
 'flag_buys_user',
 'avg_cnt_buy_user',
 'avg_time_view',
 'sum_time_view',
 'genres_vector']

In [129]:
# Encode each item's genre list as an 80-dimensional vector with the hashing trick.
from pyspark.ml.feature import HashingTF

hasher = HashingTF(
    inputCol="genres_array",
    outputCol="genres_vector",
    numFeatures=80,
    binary=False,
)
df_train_full = hasher.transform(df_train_full)

In [130]:
# Mark the enriched training frame for caching.
df_train_full.cache()

DataFrame[user_id: int, item_id: int, purchase: int, avg_cnt_buy_item: double, cnt_buys_item: bigint, flag_buys_item: int, year: double, genres_array: array<string>, cnt_buys_user: bigint, flag_buys_user: int, avg_cnt_buy_user: double, avg_time_view: double, sum_time_view: bigint, genres_vector: vector]

In [131]:
# Replace remaining nulls in numeric columns with 0 (e.g. viewing-time statistics of users
# that had no rows in the views log).
df_train_full = df_train_full.fillna(0)

In [132]:
# Row count of the enriched training frame.
df_train_full.count()

5032624

In [133]:
# Preview two rows after encoding and null filling (n=2, truncate=False, vertical=True).
df_train_full.show(2, False, True)

-RECORD 0----------------------------------------------------------------
 user_id          | 754230                                               
 item_id          | 8389                                                 
 purchase         | 0                                                    
 avg_cnt_buy_item | 0.005979073243647235                                 
 cnt_buys_item    | 8                                                    
 flag_buys_item   | 1                                                    
 year             | 1981.0                                               
 genres_array     | [Мультфильмы, Детские, Союзмультфильм, Наши]         
 cnt_buys_user    | 72                                                   
 flag_buys_user   | 1                                                    
 avg_cnt_buy_user | 0.027575641516660282                                 
 avg_time_view    | 1938.5352233676977                                   
 sum_time_view    | 2256455           

In [None]:
#df_train_full.write.parquet("df_train_full_v2.parquet")

### Моделирование

In [134]:
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator #for ROC AUC
from pyspark.ml.feature import VectorAssembler

In [135]:
# ROC AUC evaluator that reads scores from `probability` and labels from `purchase`.
# NOTE(review): no cell in this notebook appears to call it.
evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    labelCol="purchase",
    rawPredictionCol="probability",
)

In [184]:
# Model inputs, listed explicitly so they do not depend on whichever columns
# `df_train_full` happens to carry when this cell runs. Deriving them from
# `df_train_full.columns` would also feed the raw `user_id` / `item_id` keys to the model.
# These four match the recorded feature list and the 4-dimensional feature importances;
# extend the list deliberately to experiment with more columns.
feature_cols = [
    'avg_cnt_buy_item',
    'cnt_buys_item',
    'cnt_buys_user',
    'avg_cnt_buy_user',
]

# `ignore` is kept for the reporting cell below: filtering the frame's columns by it
# yields exactly `feature_cols`.
ignore = [c for c in df_train_full.columns if c not in feature_cols]

assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')

In [None]:
# Encode the genre lists as 80-dimensional hashed vectors.
# The training frame normally already has `genres_vector` from the earlier HashingTF cell,
# and transforming again would fail because the output column exists, so only apply the
# encoder when the column is missing.
hasher = HashingTF(numFeatures=80, binary=False, inputCol="genres_array", outputCol="genres_vector")
if "genres_vector" not in df_train_full.columns:
    df_train_full = hasher.transform(df_train_full)

In [186]:
# Assemble the feature vector and keep only what the classifier needs: label and features.
assembled_train = assembler.transform(df_train_full)
train_data = assembled_train.select("purchase", "features")

In [187]:
# Report which columns of the training frame are not excluded by `ignore`.
used_columns = [x for x in df_train_full.columns if x not in ignore]
print(f'Признаки, участвующие в обучении: {used_columns}')

Признаки, участвующие в обучении: ['avg_cnt_buy_item', 'cnt_buys_item', 'cnt_buys_user', 'avg_cnt_buy_user']


In [198]:
# Gradient-boosted trees classifier; the seed is fixed so training is repeatable.
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="purchase",
    maxDepth=4,
    minInstancesPerNode=3,
    maxBins=50,
    seed=10,
)

In [199]:
%%time
# Fit the classifier on the assembled training data; the recorded wall time is about 9.5 minutes.
model = gbt.fit(train_data)

CPU times: user 3.23 s, sys: 1.53 s, total: 4.76 s
Wall time: 9min 37s


### Скоринг df_test

In [200]:
# Test frame: every test (user, item) row enriched with item and user features.
# NOTE(review): both joins are inner and the feature tables are built from the training
# data, so test rows whose item or user never occurs in train would be dropped here.
# Confirm that the test set only contains known users and items.
df_test_full = (
    df_test
    .join(items_features, ["item_id"])
    .join(users_features, ["user_id"])
)

In [50]:
# Encode the genre lists of the test frame with the same hashing settings as for train.
hasher = HashingTF(
    inputCol="genres_array",
    outputCol="genres_vector",
    numFeatures=80,
    binary=False,
)
df_test_full = hasher.transform(df_test_full)

In [52]:
# Replace remaining nulls in numeric columns of the test frame with 0, as done for train.
df_test_full = df_test_full.fillna(0)

In [201]:
# Score the test frame. The (user_id, item_id) keys are kept next to the feature vector so
# that every prediction row stays attached to the pair it was computed for; selecting
# only "features" would force matching scores back to keys by row position.
predictions = model.transform(
    assembler.transform(df_test_full).select("user_id", "item_id", "features")
)

In [202]:
# UDF that takes the entry at index 1 of the `probability` vector (presumably the score for
# label 1 — confirm against the model's class ordering) and returns it as a float.
firstelement = udf(lambda v: float(v[1]), FloatType())

# Name the score column explicitly. Renaming the auto-generated '<lambda>(probability)'
# column is fragile: withColumnRenamed silently does nothing when that name does not match.
targets = predictions.select(firstelement('probability').alias("purchase"))

In [203]:
from pyspark.sql.functions import monotonically_increasing_id

In [204]:
# Attach a generated `id` column to the test keys and to the predicted scores so that the
# two frames can be joined on it.
# NOTE(review): this pairs the right rows only if both frames generate identical ids
# row-for-row. monotonically_increasing_id() is, to my knowledge, only documented as unique
# and increasing, with values that depend on partitioning — confirm, or carry
# user_id/item_id through the model transform instead.
df1 = df_test_full.select("user_id","item_id").withColumn("id", monotonically_increasing_id())
df2 = targets.withColumn("id", monotonically_increasing_id())

In [205]:
# Build the submission from one scoring pass that carries the keys along, instead of
# joining two frames on independently generated row ids (which does not guarantee that a
# score is paired with the (user_id, item_id) it was computed for).
# Resulting columns, in order: purchase (float score), user_id, item_id.
submit = (
    model.transform(assembler.transform(df_test_full))
    .select(
        firstelement("probability").alias("purchase"),
        "user_id",
        "item_id",
    )
)

In [206]:
# Mark the submission frame for caching; it is used by the summary and the export below.
submit.cache()

DataFrame[purchase: float, user_id: int, item_id: int]

In [207]:
from pyspark.sql.functions import col, avg

# Show the mean predicted score per user for the first ten users.
print("Распределение средней вероятности покупки по пользователям:")
mean_score_by_user = submit.groupby('user_id').agg(avg(col("purchase")))
mean_score_by_user.show(10)

Распределение средней вероятности покупки по пользователям:
+-------+--------------------+
|user_id|       avg(purchase)|
+-------+--------------------+
| 754230| 0.07240736743218835|
| 761341| 0.04390896864739177|
| 776188| 0.04409278008230809|
| 780033| 0.04399668344595928|
| 798454|0.043918690197169784|
| 825061|0.044525370108707066|
| 833685| 0.04794281916446303|
| 846231| 0.04445510645527646|
| 851486| 0.04388509640103132|
| 867850| 0.04390323007341273|
+-------+--------------------+
only showing top 10 rows



In [208]:
#print(f"Максимальная вероятность: {submit.agg({'purchase': 'max'}).collect()[0]}")

In [171]:
# Preview two rows of the label/features frame used for training.
train_data.show(2)

+--------+--------------------+
|purchase|            features|
+--------+--------------------+
|       0|[0.00597907324364...|
|       1|[0.00145032632342...|
+--------+--------------------+
only showing top 2 rows



### SAVE

In [None]:
#Отсортируем submit по возрастанию идентификаторов пользователей (user_id), 
#а затем — по возрастанию идентификаторов передач (item_id)

In [209]:
#!подходит только для файлов небольшого размера
submit.select("user_id", "item_id","purchase")\
      .orderBy(col("user_id"),col("item_id"))\
      .toPandas().to_csv("/data/home/konstantin.baryshev/lab03.csv", header="true")

In [210]:
# Show the fitted model's feature importances; the recorded output has four entries.
model.featureImportances

SparseVector(4, {0: 0.1647, 1: 0.3672, 2: 0.3661, 3: 0.102})