In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 3 pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as f

from pyspark.ml import Pipeline, Transformer, Estimator

from pyspark.sql.window import Window
from pyspark.ml.feature import MinMaxScaler, VectorAssembler, CountVectorizer

from pyspark.ml.classification import GBTClassifier

In [3]:
spark_config = SparkConf()
spark = SparkSession.builder\
                    .config(conf=spark_config)\
                    .appName("danilov_lab03")\
                    .getOrCreate()

In [4]:
spark.conf.set("spark.sql.shuffle.partitions", 3)

In [5]:
read_interactions_schema = StructType(fields=[StructField('user_id', IntegerType()), 
                                              StructField('item_id', IntegerType()),
                                              StructField('purchase', IntegerType(), nullable=True), 
                                             ])

read_items_schema = StructType(fields=[StructField('item_id', IntegerType()), 
                                       StructField('channel_id', IntegerType()),
                                       StructField('datetime_availability_start', StringType()),
                                       StructField('datetime_availability_stop', StringType()),
                                       StructField('datetime_show_start', StringType()),
                                       StructField('datetime_show_stop', StringType()),
                                       StructField('content_type', IntegerType()),
                                       StructField('title', StringType(), nullable=True),
                                       StructField('year', FloatType(), nullable=True),
                                       StructField('genres', StringType()),
                                       StructField('region_id', IntegerType()),
                                      ])

read_users_schema = StructType(fields=[StructField('user_id', IntegerType()), 
                                       StructField('item_id', IntegerType()),
                                       StructField('ts_start', IntegerType()),
                                       StructField('ts_end', IntegerType()),
                                       StructField('item_type', StringType()),
                                      ])


train_path = '/labs/laba03/lab10_train.csv'
test_path = '/labs/laba03/lab10_test.csv'
items_path = '/labs/laba03/lab10_items.csv'
users_path = '/labs/laba03/lab10_views_programmes.csv'

In [6]:
train_sdf = spark.read\
                .format('csv')\
                .schema(read_interactions_schema)\
                .option("header", "true")\
                .load(train_path)

test_sdf = spark.read\
                .format('csv')\
                .schema(read_interactions_schema)\
                .option("header", "true")\
                .load(test_path)

items_sdf = spark.read\
                .format('csv')\
                .schema(read_items_schema)\
                .option("header", "true")\
                .option("delimiter", "\\t")\
                .load(items_path)

items_sdf = items_sdf.select(['item_id', 'content_type', 'year', 'genres'])\
                        .na.fill({'year': -999, 'genres': 'unknown'})

users_sdf =  spark.read\
                .format('csv')\
                .schema(read_users_schema)\
                .option("header", "true")\
                .load(users_path)

# Feature engineering

### target encoding

In [7]:
class MeanPurchaseByUser(Estimator):
    def __init__(self):
        Transformer.__init__(self)
        self.user_mean_purchase = None
        
    def fit(self, X):
        self.user_mean_purchase = X.select(['user_id', 'purchase'])\
                                    .groupBy('user_id')\
                                    .agg(f.mean('purchase').alias('user_purchase_mean'),
                                         f.sum('purchase').alias('user_purchase_count'))
        return self
        
    def transform(self, X):
        return X.join(self.user_mean_purchase, on='user_id', how='inner')

    
class MeanPurchaseByItem(Estimator):
    def __init__(self):
        Transformer.__init__(self)
        self.item_mean_purchase = None
        
    def fit(self, X):
        self.item_mean_purchase = X.select(['item_id', 'purchase'])\
                                    .groupBy('item_id')\
                                    .agg(f.mean('purchase').alias('item_purchase_mean'), 
                                         f.sum('purchase').alias('item_purchase_count'))
        return self
        
    def transform(self, X):
        return X.join(self.item_mean_purchase, on='item_id', how='inner')

In [8]:
target_features = Pipeline(stages=[MeanPurchaseByUser(),
                                   MeanPurchaseByItem()
                                  ])
target_features_model = target_features.fit(train_sdf)
X_train = target_features_model.transform(train_sdf)
X_test = target_features_model.transform(test_sdf)

### item features

In [9]:
class ItemFeaturesJoin(Transformer):
    def transform(self, X):
        return X.join(items_sdf, on='item_id', how='left')
    
class GenresVectorizer(Transformer):
    def transform(self, X):
        X = X.withColumn("genres_array", f.split(f.col("genres"), ','))
        return CountVectorizer(inputCol="genres_array", outputCol="genres_vector").fit(X).transform(X)

In [10]:
item_features = Pipeline(stages=[ItemFeaturesJoin(),
                                 GenresVectorizer()
                                ])
X_train = item_features.fit(X_train).transform(X_train)
X_test = item_features.fit(X_test).transform(X_test)

### user features

In [11]:
class UserWatchedTime(Estimator):
    def __init__(self):
        Transformer.__init__(self)
        self.user_watched_time = None
    
    def fit(self, users_features):
        users_features = users_features.withColumn('watched_time', f.col('ts_end') - f.col('ts_start'))
        watched_time = users_features.groupBy(['user_id']).pivot('item_type')\
                                            .agg(f.sum('watched_time').alias('sum_watched'),
                                                 f.mean('watched_time').alias('mean_watched'))
                                        
        self.user_watched_time = watched_time.na.fill(0)
        return self
    
    def transform(self, X):
        X = X.join(self.user_watched_time, on='user_id', how='left').na.fill(0)
        X = X.withColumn('watched_time', f.col('live_sum_watched') + f.col('pvr_sum_watched'))
        X = X.withColumn('watched_time_mean', (f.col('live_mean_watched') + f.col('pvr_mean_watched')) / 2.0)
        return X

In [12]:
user_features = Pipeline(stages=[UserWatchedTime()
                                ])
user_features_model = user_features.fit(users_sdf)
X_train = user_features_model.transform(X_train).cache()
X_test = user_features_model.transform(X_test).cache()

# GB Model

In [13]:
features_list = ['user_id', 'item_id', 'year', 'genres_vector', 'content_type',
                 'user_purchase_mean', 'item_purchase_mean', 'user_purchase_count', 'item_purchase_count',
                 'watched_time', 'watched_time_mean',
                 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched',  'pvr_mean_watched']

features_assembler = VectorAssembler(inputCols=features_list, outputCol="features")

gbt = GBTClassifier(featuresCol="features", labelCol='purchase', seed=23,
                   maxIter=50, maxDepth=3, minInstancesPerNode=1)

In [14]:
model_pipeline = Pipeline(stages=[features_assembler,
                                  gbt
                                 ])
  
model = model_pipeline.fit(X_train)

In [15]:
predictions = model.transform(X_test)

### Write to csv

In [16]:
answer_df = predictions.select(['user_id','item_id','probability'])\
                        .orderBy(['user_id','item_id']).toPandas()
answer_df['purchase'] = answer_df['probability'].apply(lambda x: x[1])
answer_df = answer_df.drop('probability', axis=1)
answer_df.to_csv('lab03.csv')

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [17]:
spark.stop()

# Моя кросс-валидация - лидерборд

![не надо так](kaggle_kek.png)

In [None]:
ROC AUC:          [features]
    
0.795526912524 - ['mean_user_purchase', 'mean_item_purchase']
0.831922675074 - ['user_id', 'mean_user_purchase', 'mean_item_purchase']
0.839958717247 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase']
0.837710328607 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'content_type']
0.851308185017 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year']
0.874271152159 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector']
0.856894574506 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector',
                  'content_type']
0.864672248951 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector',
                  'content_type', 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched', 'pvr_mean_watched']
0.888011066583 - ['mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector', 'content_type', 
                 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched',  'pvr_mean_watched'], max_depth=4
0.889411788109 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector', 'content_type', 
                 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched',  'pvr_mean_watched'], max_depth=4
0.887809837865 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector', 'content_type', 
                 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched',  'pvr_mean_watched'], max_depth=4,  minInstancesPerNode=10
0.883210370509 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector', 'content_type', 
                 'live_mean_watched',  'pvr_mean_watched'], max_depth=4
0.886225789312 - ['user_id', 'item_id', 'mean_user_purchase', 'mean_item_purchase', 'year', 'genres_vector', 'content_type', 
                 'live_mean_watched',  'pvr_mean_watched'], max_depth=3
0.880213380002 - ['user_id', 'item_id', 'year', 'genres_vector', 'content_type',
                 'user_purchase_mean', 'item_purchase_mean', 'user_purchase_count', 'item_purchase_count',
                 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched',  'pvr_mean_watched'], max_depth=4
0.891061583894 - ['user_id', 'item_id', 'year', 'genres_vector', 'content_type',
                 'user_purchase_mean', 'item_purchase_mean', 'user_purchase_count', 'item_purchase_count',
                 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched',  'pvr_mean_watched'], max_depth=3
0.904424383111 - ['user_id', 'item_id', 'year', 'genres_vector', 'content_type',
                 'user_purchase_mean', 'item_purchase_mean', 'user_purchase_count', 'item_purchase_count',
                 'watched_time', 'watched_time_mean',
                 'live_sum_watched', 'live_mean_watched', 'pvr_sum_watched',  'pvr_mean_watched'], max_depth=3