In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --driver-memory 3g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

print("LOCAL")

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.
LOCAL


In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, HashingTF, StopWordsRemover, IDF, Normalizer, Imputer, VectorAssembler, MinMaxScaler, CountVectorizer
from pyspark.sql import functions as f
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, DoubleType, FloatType
from pyspark.sql.functions import udf
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import numpy as np
import re
import itertools
from pyspark.ml import Pipeline
conf = SparkConf()
conf.set("spark.app.name", "Dmitry Zh app") 

spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
import itertools
import numpy as np

from pyspark import since, keyword_only
from pyspark.ml import Estimator, Model
from pyspark.ml.common import _py2java
from pyspark.ml.param import Params, Param, TypeConverters
from pyspark.ml.param.shared import HasSeed
from pyspark.ml.tuning import CrossValidator, CrossValidatorModel
from pyspark.ml.util import *
from pyspark.ml.wrapper import JavaParams
from pyspark.sql.functions import rand
from functools import reduce

class StratifiedCrossValidator(CrossValidator):
    def stratify_data(self, dataset):
        """
        Returns an array of dataframes with the same ratio of passes and failures.

        Currently only supports binary classification problems.
        """

        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        nFolds = self.getOrDefault(self.numFolds)
        split_ratio = 1.0 / nFolds

        passes = dataset[dataset[self.getEstimator().getStages()[-1].getLabelCol()] == 1]
        fails = dataset[dataset[self.getEstimator().getStages()[-1].getLabelCol()] == 0]

        pass_splits = passes.randomSplit([split_ratio for i in range(nFolds)])
        fail_splits = fails.randomSplit([split_ratio for i in range(nFolds)])

        stratified_data = [pass_splits[i].unionAll(fail_splits[i]) for i in range(nFolds)]

        return stratified_data

    def _fit(self, dataset):
        est = self.getOrDefault(self.estimator)
        epm = self.getOrDefault(self.estimatorParamMaps)
        numModels = len(epm)
        eva = self.getOrDefault(self.evaluator)
        nFolds = self.getOrDefault(self.numFolds)
        seed = self.getOrDefault(self.seed)
        metrics = [0.0] * numModels

        stratified_data = self.stratify_data(dataset)

        for i in range(nFolds):
            train_arr = [x for j,x in enumerate(stratified_data) if j != i]
            train = reduce((lambda x, y: x.unionAll(y)), train_arr)
            validation = stratified_data[i]

            models = est.fit(train, epm)

            for j in range(numModels):
                model = models[j]
                metric = eva.evaluate(model.transform(validation, epm[j]))
                metrics[j] += metric/nFolds

        if eva.isLargerBetter():
            bestIndex = np.argmax(metrics)
        else:
            bestIndex = np.argmin(metrics)

        bestModel = est.fit(dataset, epm[bestIndex])
        return self._copyValues(CrossValidatorModel(bestModel, metrics))

In [4]:
df_items = spark.read.option("delimiter", "\t").option("header", True).csv("/labs/slaba03/laba03_items.csv")

In [5]:
df_items.createOrReplaceTempView("input_")

In [6]:
df_test = spark.read.option("delimiter", ",").option("header", True).csv("/labs/slaba03/laba03_test.csv")
df_test.show(1, vertical=True, truncate=False)

-RECORD 0---------
 user_id  | 1654  
 item_id  | 94814 
 purchase | null  
only showing top 1 row



In [7]:
df_train = spark.read.option("delimiter", ",").option("header", True).csv("/labs/slaba03/laba03_train.csv")
df_train.show(1, vertical=True, truncate=False)

-RECORD 0---------
 user_id  | 1654  
 item_id  | 74107 
 purchase | 0     
only showing top 1 row



In [8]:
df_views_programmes = spark.read.option("delimiter", ",").option("header", True).csv("/labs/slaba03/laba03_views_programmes.csv")
df_views_programmes.show(1, vertical=True, truncate=False)

-RECORD 0---------------
 user_id   | 0          
 item_id   | 7101053    
 ts_start  | 1491409931 
 ts_end    | 1491411600 
 item_type | live       
only showing top 1 row



In [9]:
df_items.select([f.count(f.when(f.isnan(c) | f.col(c).isNull(), c)).alias(c) for c in df_items.columns]).show()

+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+-----+------+------+---------+
|item_id|channel_id|datetime_availability_start|datetime_availability_stop|datetime_show_start|datetime_show_stop|content_type|title|  year|genres|region_id|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+-----+------+------+---------+
|      0|      3704|                          0|                         0|               3704|              3704|           0|    0|631868|    33|   362264|
+-------+----------+---------------------------+--------------------------+-------------------+------------------+------------+-----+------+------+---------+



In [10]:
@f.pandas_udf(StringType())
def wordsClean(s):
    regex = re.compile(u'[\w\d]{2,}', re.U)
    
    return s.apply(lambda i: " ".join(regex.findall(i.lower())) if i is not None else i)

df_features_items = df_items.select("item_id", f.coalesce(wordsClean("genres"), f.lit('')).alias("genres")\
                                    , wordsClean("title").alias("title")).where(f.col("content_type")==1).cache()

In [11]:
tokenizer = Tokenizer(inputCol="genres", outputCol="genres_token")

langs=['danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian'
       , 'italian', 'norwegian', 'portuguese', 'russian', 'spanish', 'swedish', 'turkish']
stop_words=[StopWordsRemover.loadDefaultStopWords(i) for i in langs]
stop_words=list(itertools.chain(*stop_words))
swr = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol="genres_filtered", stopWords=stop_words)

count_vectorizer = CountVectorizer(inputCol=swr.getOutputCol(), outputCol="word_vectorizer", binary=True)

pipeline_genres = Pipeline(stages=[
    tokenizer,
    swr,
    count_vectorizer
])

model = pipeline_genres.fit(df_features_items)
df_features = model.transform(df_features_items)

In [12]:
df_features.show(1, vertical=True, truncate=False)

-RECORD 0------------------------------------------------------
 item_id         | 65667                                       
 genres          | эротика                                     
 title           | на пробах только девушки all girl auditions 
 genres_token    | [эротика]                                   
 genres_filtered | [эротика]                                   
 word_vectorizer | (90,[20],[1.0])                             
only showing top 1 row



### Деление Train на 3 части

In [13]:
train_prep = df_train.sampleBy("purchase", fractions={'0': 0.97, '1': 0.97}, seed=42).cache()
test = df_train.join(train_prep, on=["user_id", "item_id"], how="leftanti").coalesce(10).cache()

stat = train_prep.sampleBy("purchase", fractions={'0': 0.6, '1': 0.6}, seed=42).cache()
train = train_prep.join(stat, on=["user_id", "item_id"], how="leftanti").coalesce(10).cache()

In [14]:
train_prep.count()

4882184

In [15]:
test.count()

150440

In [16]:
stat.count()

2928567

In [17]:
train.count()

1953617

In [18]:
train.show(1, vertical=True, truncate=False)

-RECORD 0----------
 user_id  | 1654   
 item_id  | 100221 
 purchase | 0      
only showing top 1 row



In [19]:
test.show(1, vertical=True, truncate=False)

-RECORD 0--------
 user_id  | 1654 
 item_id  | 5340 
 purchase | 0    
only showing top 1 row



In [20]:
stat.show(1, vertical=True, truncate=False)

-RECORD 0----------
 user_id  | 1654   
 item_id  | 100504 
 purchase | 0      
only showing top 1 row



In [21]:
stat_user_features = stat.groupBy("user_id")\
                                    .agg(f.sum("purchase").alias("users_purchase")).cache()
stat_item_features = stat.groupBy("item_id")\
                                    .agg(f.sum("purchase").alias("items_purchase")).cache()



In [22]:
df_features_user = df_views_programmes.groupBy(f.col("user_id"))\
                    .agg(f.round(f.avg(f.col("ts_end")-f.col("ts_start")), 0).alias("avg_time")\
                         , f.count(f.col("item_id")).alias("items_num"))\
                    .cache()

In [23]:
def encoding(df, cols, target, smoothing_factor=1, min_samples_leaf=1):
    prior = df.agg(f.mean(f.col(target))).collect()[0][0]
    for i in cols:
        df_encode = df.\
            groupBy(i).\
            agg(f.count(target).alias("cnt")\
                , f.mean(target).alias("target_mean"))\
            .select(i,\
                    (1 / (1 + f.exp(-(f.col("cnt") - min_samples_leaf) / smoothing_factor))).alias("smoove"), "target_mean")\
            .select(i,\
                    (prior * (1 - f.col("smoove")) + f.col("target_mean") * f.col("smoove")).alias("{0}_encode".format(i)))
        df = df.join(df_encode, on=i, how='left')
    return df

stat_encode=encoding(stat, ["user_id", "item_id"], "purchase").coalesce(10).cache()


In [24]:
train = train.join(stat_user_features, on=["user_id"], how="left")\
             .join(stat_item_features, on=["item_id"], how="left")\
             .join(stat_encode.select("user_id", "user_id_encode").distinct(), on=["user_id"], how="left")\
             .join(stat_encode.select("item_id", "item_id_encode").distinct(), on=["item_id"], how="left")\
             .join(df_features, on=["item_id"], how="left")\
             .join(df_features_user, on=["user_id"], how="left")\
             .select("user_id", "item_id", "avg_time", "items_num", \
                     f.col("purchase").cast(IntegerType()).alias("purchase"), "word_vectorizer", \
                      f.coalesce(f.col("users_purchase").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("users_purchase")\
                     , f.coalesce(f.col("items_purchase").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("items_purchase")\
                     , f.coalesce(f.col("item_id_encode").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("item_encode")\
                     , f.coalesce(f.col("user_id_encode").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("user_encode")).coalesce(10).cache()\

test = test.join(stat_user_features, on=["user_id"], how="left")\
             .join(stat_item_features, on=["item_id"], how="left")\
             .join(stat_encode.select("user_id", "user_id_encode").distinct(), on=["user_id"], how="left")\
             .join(stat_encode.select("item_id", "item_id_encode").distinct(), on=["item_id"], how="left")\
             .join(df_features, on=["item_id"], how="left")\
             .join(df_features_user, on=["user_id"], how="left")\
             .select("user_id", "item_id", "avg_time", "items_num",\
                     f.col("purchase").cast(IntegerType()).alias("purchase"), "word_vectorizer", \
                      f.coalesce(f.col("users_purchase").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("users_purchase")\
                     , f.coalesce(f.col("items_purchase").cast(IntegerType()), f.lit("-1")).cast(IntegerType()).alias("items_purchase")  
                     , f.coalesce(f.col("item_id_encode").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("item_encode")\
                     , f.coalesce(f.col("user_id_encode").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("user_encode")).coalesce(10).cache()\


In [25]:
train = train.withColumn("items_num", train["items_num"].cast(DoubleType()))
imputer = Imputer(
    inputCols=["avg_time", "items_num"], 
    outputCols=["{}".format(c) for c in ["avg_time", "items_num"]]
    ).setStrategy("mean")

train = imputer.fit(train).transform(train)

test = test.withColumn("items_num", test["items_num"].cast(DoubleType()))
imputer = Imputer(
    inputCols=["avg_time", "items_num"], 
    outputCols=["{}".format(c) for c in ["avg_time", "items_num"]]
    ).setStrategy("mean")

test = imputer.fit(test).transform(test)

In [26]:
train.show(1, vertical=True, truncate=False)

-RECORD 0-------------------------------------
 user_id         | 867363                     
 item_id         | 100110                     
 avg_time        | 2635.0                     
 items_num       | 180.0                      
 purchase        | 0                          
 word_vectorizer | (90,[0,1,8],[1.0,1.0,1.0]) 
 users_purchase  | 1                          
 items_purchase  | 1                          
 item_encode     | 0                          
 user_encode     | 0                          
only showing top 1 row



In [27]:
test.show(1, vertical=True, truncate=False)

-RECORD 0--------------------------------
 user_id         | 867363                
 item_id         | 60351                 
 avg_time        | 2635.0                
 items_num       | 180.0                 
 purchase        | 0                     
 word_vectorizer | (90,[0,13],[1.0,1.0]) 
 users_purchase  | 1                     
 items_purchase  | 0                     
 item_encode     | 0                     
 user_encode     | 0                     
only showing top 1 row



In [28]:
test.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- avg_time: double (nullable = true)
 |-- items_num: double (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- word_vectorizer: vector (nullable = true)
 |-- users_purchase: integer (nullable = true)
 |-- items_purchase: integer (nullable = true)
 |-- item_encode: integer (nullable = true)
 |-- user_encode: integer (nullable = true)



In [29]:
train.printSchema()

root
 |-- user_id: string (nullable = true)
 |-- item_id: string (nullable = true)
 |-- avg_time: double (nullable = true)
 |-- items_num: double (nullable = true)
 |-- purchase: integer (nullable = true)
 |-- word_vectorizer: vector (nullable = true)
 |-- users_purchase: integer (nullable = true)
 |-- items_purchase: integer (nullable = true)
 |-- item_encode: integer (nullable = true)
 |-- user_encode: integer (nullable = true)



In [30]:
train.rdd.getNumPartitions()

10

In [31]:
assembler = VectorAssembler(inputCols=[
                                       "word_vectorizer"
                                       , "users_purchase"
                                       , "items_purchase"
                                       , "avg_time"
                                       , "items_num"], outputCol="features")

df_train_transformed = assembler.transform(train).select("user_id", "item_id", "purchase", "features")
df_test_transformed = assembler.transform(test).select("user_id", "item_id", "purchase", "features")

In [32]:
df_train_transformed.show(1000, truncate=False)

+-------+-------+--------+-------------------------------------------------------------------------------------------------------------+
|user_id|item_id|purchase|features                                                                                                     |
+-------+-------+--------+-------------------------------------------------------------------------------------------------------------+
|867363 |100110 |0       |(94,[0,1,8,90,91,92,93],[1.0,1.0,1.0,1.0,1.0,2635.0,180.0])                                                  |
|867363 |66663  |0       |(94,[0,5,9,12,90,91,92,93],[1.0,1.0,1.0,1.0,1.0,6.0,2635.0,180.0])                                           |
|867363 |69574  |0       |(94,[0,1,59,90,92,93],[1.0,1.0,1.0,1.0,2635.0,180.0])                                                        |
|867363 |92801  |0       |(94,[1,4,11,14,90,92,93],[1.0,1.0,1.0,1.0,1.0,2635.0,180.0])                                                 |
|867363 |93525  |0       |(94,[0,2,90,92,

In [33]:
df_train_transformed_balanced = df_train_transformed.sampleBy("purchase", fractions={0: 0.008, 1: 1}, seed=5757)

In [34]:
df_train_transformed_balanced.groupby("purchase").agg(f.count("*")).show(2, False)

+--------+--------+
|purchase|count(1)|
+--------+--------+
|1       |4194    |
|0       |15524   |
+--------+--------+



In [35]:
gbt = GBTClassifier(featuresCol=assembler.getOutputCol(), labelCol="purchase")
pipeline = Pipeline(stages=[gbt])
paramGrid = ParamGridBuilder().addGrid(gbt.maxIter, [5, 10, 15])\
                              .addGrid(gbt.maxDepth, [2, 3, 5])\
                              .build()                              
evaluator = BinaryClassificationEvaluator(labelCol=gbt.getLabelCol())

In [36]:
crossval = CrossValidator(estimator=pipeline, estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=2, parallelism=2)

In [37]:
cv_model=crossval.fit(df_train_transformed_balanced)

In [38]:
predictions = cv_model.transform(df_test_transformed)

In [39]:
evaluator = BinaryClassificationEvaluator(
    labelCol="purchase", 
    metricName='areaUnderROC'
)

evaluator.evaluate(predictions)

0.911200307438139

### Тестовые

In [40]:
df_test_to_model = df_test.join(stat_user_features, on=["user_id"], how="left")\
             .join(stat_item_features, on=["item_id"], how="left")\
             .join(df_features, on=["item_id"], how="left")\
             .join(stat_encode.select("user_id", "user_id_encode").distinct(), on=["user_id"], how="left")\
             .join(stat_encode.select("item_id", "item_id_encode").distinct(), on=["item_id"], how="left")\
             .join(df_features_user, on=["user_id"], how="left")\
             .select("user_id", "item_id", "avg_time", "items_num", \
                     f.col("purchase").cast(IntegerType()).alias("purchase"), "word_vectorizer", \
                     f.coalesce(f.col("users_purchase").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("users_purchase")\
                     , f.coalesce(f.col("items_purchase").cast(IntegerType()), f.lit("-1")).cast(IntegerType()).alias("items_purchase")\
                     , f.coalesce(f.col("item_id_encode").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("item_encode")\
                     , f.coalesce(f.col("user_id_encode").cast(IntegerType()), f.lit("-1").cast(IntegerType())).alias("user_encode")).coalesce(10).cache()\

df_test_to_model = df_test_to_model.withColumn("items_num", df_test_to_model["items_num"].cast(DoubleType()))
imputer = Imputer(
    inputCols=["avg_time", "items_num"], 
    outputCols=["{}".format(c) for c in ["avg_time", "items_num"]]
    ).setStrategy("mean")

df_test_to_model = imputer.fit(df_test_to_model).transform(df_test_to_model)

assembler = VectorAssembler(inputCols=[
                                       "word_vectorizer"
                                       , "users_purchase"
                                       , "items_purchase"
                                       , "avg_time"
                                       , "items_num"], outputCol="features")


df_test_transformed_model = assembler.transform(df_test_to_model).select("user_id", "item_id", "purchase", "features")

In [41]:
df_test_to_model.drop("word_vectorizer").select([f.count(f.when(f.isnan(c) | f.col(c).isNull(), c)).alias(c) for c in df_test_to_model.drop("word_vectorizer").columns]).show()

+-------+-------+--------+---------+--------+--------------+--------------+-----------+-----------+
|user_id|item_id|avg_time|items_num|purchase|users_purchase|items_purchase|item_encode|user_encode|
+-------+-------+--------+---------+--------+--------------+--------------+-----------+-----------+
|      0|      0|       0|        0| 2156840|             0|             0|          0|          0|
+-------+-------+--------+---------+--------+--------------+--------------+-----------+-----------+



In [42]:
predictions_test = cv_model.transform(df_test_transformed_model)

In [43]:
predictions_test.where("item_id=100234").show(5 ,False)

+-------+-------+--------+------------------------------------------------+----------------------------------------+----------------------------------------+----------+
|user_id|item_id|purchase|features                                        |rawPrediction                           |probability                             |prediction|
+-------+-------+--------+------------------------------------------------+----------------------------------------+----------------------------------------+----------+
|867363 |100234 |null    |(94,[8,90,91,92,93],[1.0,1.0,1.0,2635.0,180.0]) |[1.3725218804276922,-1.3725218804276922]|[0.9396328290018002,0.06036717099819977]|0.0       |
|882935 |100234 |null    |(94,[8,90,91,92,93],[1.0,1.0,1.0,10163.0,287.0])|[1.3725218804276922,-1.3725218804276922]|[0.9396328290018002,0.06036717099819977]|0.0       |
|889974 |100234 |null    |(94,[8,91,92,93],[1.0,1.0,3592.0,715.0])        |[1.3725218804276922,-1.3725218804276922]|[0.9396328290018002,0.06036717099819977

In [60]:
predictions_test.select(secondelement(f.col("probability")).alias("purchase")).orderBy(f.col("user_id").cast(IntegerType()), "user_id", "item_id").select([f.count(f.when(f.isnan(c) | f.col(c).isNull(), c)).alias(c) for c in predictions_test.select(secondelement(f.col("probability")).alias("purchase")).orderBy(f.col("user_id").cast(IntegerType()), "user_id", "item_id").columns]).show()

+--------+
|purchase|
+--------+
|       0|
+--------+



In [70]:
secondelement=f.udf(lambda v:float(v[1]), FloatType())
df=predictions_test.select(f.col("user_id").cast(IntegerType()).alias("user_id"),\
                   f.col("item_id").cast(IntegerType()).alias("item_id"), \
                   secondelement(f.col("probability")).alias("purchase")).orderBy(f.col("user_id").cast(IntegerType()), f.col("item_id").cast(IntegerType())).toPandas()#.to_csv("lab03.csv", sep=",", header=True, index=False)

In [71]:
#df.columns=["user_id", "item_id", "purchase"]
df.to_csv("lab03.csv", sep=",", header=True, index=False)

In [72]:
df.shape

(2156840, 3)

In [54]:
predictions_test.count()

2156840

In [47]:
import pandas as pd
g = pd.read_csv("./lab03.csv")

In [48]:
g.head()

Unnamed: 0,user_id,item_id,purchase
0,1654,423,0.081012
1,1654,3481,0.074066
2,1654,4741,0.076792
3,1654,4992,0.081012
4,1654,5064,0.098024


In [49]:
#sc.stop()