In [1]:
import os
import sys

# Environment consumed by the PySpark launcher: executor count, the Python
# interpreter used by the workers, and the Spark installation directory.
os.environ.update({
    "PYSPARK_SUBMIT_ARGS": '--num-executors 8 pyspark-shell',
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
})

spark_home = os.environ.get('SPARK_HOME')
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

# Put Spark's bundled Python sources and its py4j archive at the front of the
# import path (the py4j archive ends up first) so `import pyspark` resolves.
for relative_path in ('python', 'python/lib/py4j-0.10.7-src.zip'):
    sys.path.insert(0, os.path.join(spark_home, relative_path))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Name the application, then create the session (or attach to a running one).
conf = SparkConf().set("spark.app.name", "lab3 lr ALS app")

session_builder = SparkSession.builder.config(conf=conf)
spark = session_builder.getOrCreate()

In [3]:
spark

![ALS](pics/mf.png)

In [4]:
from pyspark.sql.types import StructType, StructField, IntegerType, LongType , StringType

# DATA

In [5]:
! hdfs dfs -ls /labs/slaba03/

Found 4 items
-rw-r--r--   3 hdfs hdfs   91066524 2021-02-27 22:12 /labs/slaba03/laba03_items.csv
-rw-r--r--   3 hdfs hdfs   29965581 2021-02-27 22:12 /labs/slaba03/laba03_test.csv
-rw-r--r--   3 hdfs hdfs   74949368 2021-02-27 22:12 /labs/slaba03/laba03_train.csv
-rw-r--r--   3 hdfs hdfs  871302535 2021-02-27 22:12 /labs/slaba03/laba03_views_programmes.csv


## items

In [6]:
# Read the tab-separated item catalogue (every column arrives as a string).
items_raw = spark.read.csv("/labs/slaba03/laba03_items.csv", sep="\t", header=True)

# Project and cast BEFORE repartition/cache so that only the five typed columns
# used downstream are kept in memory, instead of caching the whole raw
# all-string file and re-applying the casts on every action.
items = items_raw.select(
        items_raw.item_id.cast(IntegerType()),
        items_raw.content_type.cast(IntegerType()),
        items_raw.title.cast(StringType()),
        items_raw.year.cast(IntegerType()),
        items_raw.genres.cast(StringType())
    ).repartition(20).cache()

## train

In [7]:
# Read the comma-separated training interactions (every column arrives as a string).
train_raw = spark.read.csv("/labs/slaba03/laba03_train.csv", sep=",", header=True)

# Cast first, then repartition and cache: the cached frame holds the integer
# columns directly, so the casts are not recomputed on every action.
train = train_raw.select(
        train_raw.user_id.cast(IntegerType()),
        train_raw.item_id.cast(IntegerType()),
        train_raw.purchase.cast(IntegerType())
    ).repartition(20).cache()

In [8]:
train.show()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
| 908726|  90428|       0|
| 907172|  90013|       0|
| 900784|  74711|       0|
| 902100|  94992|       0|
| 902812|  10099|       0|
| 913192|  72902|       0|
| 910553|  74864|       0|
| 903276|  80891|       0|
| 902597|  10354|       0|
| 906738|  99743|       0|
| 908172|  93586|       0|
| 911781|  92357|       0|
| 909478|  86740|       0|
| 914327|  71588|       0|
| 910449|  94853|       0|
| 911435|  11194|       0|
| 910702|  88974|       0|
| 911072|  71673|       0|
| 906109|  98670|       0|
| 906250|  91206|       0|
+-------+-------+--------+
only showing top 20 rows



In [9]:
train.count()

5032624

## test

In [10]:
# Read the comma-separated test pairs (every column arrives as a string).
test_raw = spark.read.csv("/labs/slaba03/laba03_test.csv", sep=",", header=True)

# Keep only the two key columns (this also discards the `purchase` column the
# original code dropped), cast them, and only then repartition and cache so the
# cached frame is the narrow typed one.
test = test_raw.select(
        test_raw.user_id.cast(IntegerType()),
        test_raw.item_id.cast(IntegerType())
    ).repartition(20).cache()

In [11]:
test.show()

+-------+-------+
|user_id|item_id|
+-------+-------+
| 851090|  10936|
| 852464|  94846|
| 847696|  74282|
| 830022|   5451|
| 826835|  74287|
| 830022|   3066|
| 838366| 100759|
| 836137|  85981|
| 842105|  94864|
| 846644|  99698|
| 833086|  83024|
| 812764|  83484|
| 822559|  88808|
| 814235|   9855|
| 845644|  88636|
| 836300|  95141|
| 839066|  72625|
| 816774|  10016|
| 812764|  89257|
| 824685|  94668|
+-------+-------+
only showing top 20 rows



In [12]:
test.count()

2156840

# Data Analysis

In [13]:
items.groupBy('content_type').count().show()

+------------+------+
|content_type| count|
+------------+------+
|           1|  3704|
|           0|631864|
+------------+------+



In [14]:
# Attach item metadata to every training row and count rows per content_type,
# to see which content types actually occur in the training interactions.
qq = train.join(items , on = 'item_id', how = 'left')
qq.groupBy('content_type').count().show()

+------------+-------+
|content_type|  count|
+------------+-------+
|           1|5032624|
+------------+-------+



In [15]:
# Same content_type breakdown, this time for the test pairs.
qq = test.join(items , on = 'item_id', how = 'left')
qq.groupBy('content_type').count().show()

+------------+-------+
|content_type|  count|
+------------+-------+
|           1|2156840|
+------------+-------+



In [16]:
# left_anti keeps the test rows whose user_id has no match in train;
# an empty result means every test user also appears in train.
qq = test.join(train , on = ['user_id'], how = 'left_anti')
qq.show()

+-------+-------+
|user_id|item_id|
+-------+-------+
+-------+-------+



In [17]:
# The reverse check: train rows whose user_id never appears in test.
qq = train.join(test , on = ['user_id'], how = 'left_anti')
qq.show()

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
+-------+-------+--------+



In [18]:
# Number of distinct users in train, then in test.
print( train.select(['user_id']).distinct().count() )
print( test.select(['user_id']).distinct().count() )

1941
1941


# Data prep

In [19]:
# train = train.filter(train['user_id'].isNotNull())

# test = test.filter(test['user_id'].isNotNull())

# MODEL

In [20]:
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as f

In [21]:
# Baseline ALS factorisation of the (user_id, item_id, purchase) triples.
als = ALS(
    userCol='user_id',
    itemCol='item_id',
    ratingCol='purchase',
    rank=10,
    maxIter=5,
    regParam=0.01,
    seed=5757,
)
model = als.fit(train)

# Show the number of latent factors of the fitted model.
print(model.rank)

10


In [22]:
predictions = model.transform(test)

In [23]:
predictions.show(5)

+-------+-------+--------------+
|user_id|item_id|    prediction|
+-------+-------+--------------+
| 761341|   8389|  1.5657317E-6|
| 776188|   8389|  1.0929426E-5|
| 846231|   8389|  8.4143896E-5|
| 822709|   8389|-2.0951666E-17|
| 824008|   8389|   3.307014E-7|
+-------+-------+--------------+
only showing top 5 rows



In [24]:
# Rename the ALS score column to `purchase`, sort by (user_id, item_id) and
# collect the whole result to the driver as a pandas DataFrame.
predictions = predictions.withColumnRenamed('prediction' , 'purchase')
predictions = predictions.orderBy( f.col("user_id").asc(),f.col("item_id").asc() )
predictions_local = predictions.toPandas()

# predictions_local.to_csv("lab03.csv" , sep = ',')
# predictions_local.to_csv("../../../lab03.csv" , sep = ',')

# ROC AUC 0.740602362923 for the model ALS(rank=100, maxIter=5, seed=5757, userCol ='user_id', itemCol = 'item_id' , ratingCol = 'purchase')

# ROC AUC 0.74226086714 for the model
# ALS(rank=10, maxIter=5, seed=5757, userCol ='user_id', itemCol = 'item_id' , ratingCol = 'purchase',regParam = 0.01)

In [25]:
predictions.show()

+-------+-------+------------+
|user_id|item_id|    purchase|
+-------+-------+------------+
|   1654|    336|         0.0|
|   1654|    678|         0.0|
|   1654|    691|         0.0|
|   1654|    696| 9.717579E-6|
|   1654|    763|1.3223355E-6|
|   1654|    795|2.2608542E-4|
|   1654|    861| 1.897685E-5|
|   1654|   1137| 5.246289E-5|
|   1654|   1159|6.5571094E-6|
|   1654|   1428|1.8262921E-4|
|   1654|   1685|1.8207046E-4|
|   1654|   1686|2.1551454E-5|
|   1654|   1704|3.4678287E-5|
|   1654|   2093|         0.0|
|   1654|   2343|1.8626827E-6|
|   1654|   2451|         0.0|
|   1654|   2469| 7.666614E-5|
|   1654|   2603| 9.716289E-7|
|   1654|   2609|         0.0|
|   1654|   2621|2.3607981E-5|
+-------+-------+------------+
only showing top 20 rows



In [26]:
# Keep the learned user and item factor tables cached; they are reused later
# in the notebook as embedding features.
matrix_u = model.userFactors.cache()
matrix_v = model.itemFactors.cache()

In [27]:
matrix_u.show()

+------+--------------------+
|    id|            features|
+------+--------------------+
|523860|[-6.827328E-4, 8....|
|728960|[-8.58977E-5, 1.0...|
|731490|[0.0, 0.0, 0.0, 0...|
|736010|[-9.690348E-5, 1....|
|739230|[-8.503938E-5, 1....|
|741210|[-4.174811E-6, 3....|
|747780|[0.0, 0.0, 0.0, 0...|
|748500|[-1.094802E-6, 1....|
|752360|[-1.4963507E-4, 1...|
|752570|[-1.1249864E-6, 8...|
|754230|[-0.0022404697, 0...|
|765780|[-3.0246412E-4, 4...|
|768480|[-1.8377606E-4, 1...|
|768510|[-3.900799E-4, 3....|
|769240|[-2.7155953E-16, ...|
|780610|[-1.2636714E-6, 2...|
|781080|[0.0, 0.0, 0.0, 0...|
|781430|[-1.9749602E-4, 3...|
|785860|[-1.6113782E-4, 2...|
|793430|[-2.7821903E-7, -...|
+------+--------------------+
only showing top 20 rows



In [28]:
matrix_v.show()

+----+--------------------+
|  id|            features|
+----+--------------------+
| 400|[-4.6870817E-5, 5...|
| 430|[-1.4768392E-5, 1...|
| 540|[-9.280533E-6, 9....|
|1320|[0.0, 0.0, 0.0, 0...|
|1870|[-1.6594844E-5, -...|
|2360|[0.0, 0.0, 0.0, 0...|
|2830|[-2.550362E-5, 2....|
|2990|[-4.126096E-5, 4....|
|3110|[-4.623342E-5, 5....|
|3250|[-0.0017411183, 0...|
|3470|[0.0, 0.0, 0.0, 0...|
|3630|[-3.5064313E-5, 4...|
|3750|[-6.91573E-6, 1.5...|
|4120|[-0.0020157015, 0...|
|5170|[-0.0017182988, 0...|
|5340|[-4.953657E-5, 5....|
|5510|[-0.0036376354, 0...|
|5530|[-1.9856334E-4, 1...|
|5550|[-1.9855585E-4, 1...|
|5660|[-4.0021852E-5, 2...|
+----+--------------------+
only showing top 20 rows



In [29]:
matrix_v

DataFrame[id: int, features: array<float>]

# ПОПРОБУЕМ НА ГИПЕРПАРАМЕТРАХ ВЫЕХАТЬ

In [69]:
from pyspark.ml.param.shared import HasInputCol, HasOutputCol
from pyspark import keyword_only
from pyspark.ml import Transformer

In [72]:
class DoubleTransformer(Transformer, HasInputCol, HasOutputCol):
    """Pipeline stage that casts a single column to ``double``.

    The column named by ``inputCol`` is cast to ``double`` and written to the
    column named by ``outputCol``. In this notebook both params are set to
    "prediction" and the stage is placed after ALS in a Pipeline that is scored
    with BinaryClassificationEvaluator.
    NOTE(review): presumably needed because the evaluator does not accept the
    float score ALS emits -- confirm.
    """
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None):
        """Create the stage; each column param is set only when it is not None."""
        super(DoubleTransformer, self).__init__()
        if inputCol is not None:
            self.setInputCol(inputCol)
        if outputCol is not None:
            self.setOutputCol(outputCol)
            
    def _transform(self, dataset):
        """Return ``dataset`` with the output column holding the input column cast to double."""
        return dataset.withColumn(self.getOutputCol(), f.col(self.getInputCol()).cast("double"))

In [73]:
dt = DoubleTransformer(inputCol = "prediction" , outputCol="prediction")

In [74]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [75]:
from pyspark.ml import Pipeline

In [76]:
# ALS estimator tuned by the cross-validation below.
# coldStartStrategy='drop': inside a validation fold some users/items are absent
# from the fold's training part; with the default strategy ('nan') ALS emits NaN
# predictions for them, which makes the evaluator's metric unreliable. Dropping
# those rows keeps the cross-validated AUC well defined.
als = ALS(seed=5757, userCol ='user_id', itemCol = 'item_id' , ratingCol = 'purchase',
          coldStartStrategy='drop')

In [77]:
# Two-stage pipeline: ALS scoring followed by the cast of `prediction` to double.
cv_stages = [als, dt]
pipeline = Pipeline(stages=cv_stages)

In [78]:
# Full 4 x 4 grid over the ALS rank and the regularisation strength.
grid_builder = ParamGridBuilder()
grid_builder = grid_builder.addGrid(als.rank, [10, 50, 100, 150])
grid_builder = grid_builder.addGrid(als.regParam, [.01, .05, .1, .15])
param_grid = grid_builder.build()

In [79]:
# Score candidates by area under the ROC curve, using `purchase` as the label
# and the (double-cast) ALS `prediction` column as the raw score.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="prediction",
    labelCol="purchase",
    metricName="areaUnderROC",
)
n_candidate_models = len(param_grid)
print ("Num models to be tested: ", n_candidate_models)

Num models to be tested:  16


In [83]:
# 5-fold cross-validation over the parameter grid, fitting up to 4 candidates in parallel.
# An explicit seed fixes the random fold assignment so that the per-candidate
# metrics (and therefore the selected model) are reproducible between runs.
cv = CrossValidator(estimator=pipeline, estimatorParamMaps=param_grid, evaluator=evaluator,
                    numFolds=5, parallelism=4, seed=5757)

In [None]:
model = cv.fit(train)

In [None]:
best_model = model.bestModel

In [None]:
model.bestModel

In [None]:
# Reuse the CrossValidatorModel already fitted in the `model = cv.fit(train)` cell
# above; fitting again here would repeat the entire grid search for no benefit.
best_model = model.bestModel
test_predictions = best_model.transform(test)

# `test` was loaded without its `purchase` column, so the evaluator has no label
# to score test_predictions against (evaluating them would raise). Report the best
# mean cross-validated ROC AUC instead.
roc_auc = max(model.avgMetrics)
print(roc_auc)

In [None]:
# best_model is the fitted Pipeline (ALS model, then DoubleTransformer). A
# PipelineModel has no `_java_obj`; the ALS hyper-parameters are reachable through
# the fitted ALS stage, whose Java parent is the estimator that produced it.
best_als_estimator = best_model.stages[0]._java_obj.parent()

print("**Best Model**")
# Print "Rank"
print("  Rank:", best_als_estimator.getRank())
# Print "MaxIter"
print("  MaxIter:", best_als_estimator.getMaxIter())
# Print "RegParam"
print("  RegParam:", best_als_estimator.getRegParam())

In [28]:
# Building the boosting model takes forever

In [34]:
# The computation has finished

In [None]:
model.getEstimatorParamMaps()

In [None]:
model.avgMetrics

In [None]:
import numpy as np
# Print the parameter map whose average cross-validation metric is the highest.
print(model.getEstimatorParamMaps()[np.argmax(model.avgMetrics)])

# Строим Бустинг

## Частота покупок user и покупаемость item

In [30]:
# Per-user purchase statistics computed from the training interactions:
# total number of purchases and mean purchase rate.
user_purchases = train.select(['user_id', 'purchase'])
sdf_user_agg = user_purchases.groupBy('user_id').agg(
    f.sum('purchase').alias('user_purchase_sum'),
    f.mean('purchase').alias('user_purchase_mean'),
)

# Attach the user statistics to both frames.
# NOTE(review): the inner join drops any row whose user has no statistics;
# the earlier anti-join check shows every test user is present in train.
X_train = train.join(sdf_user_agg, on='user_id', how='inner')
X_test = test.join(sdf_user_agg, on='user_id', how='inner')

In [31]:
# Per-item purchase statistics computed from the training interactions:
# total number of purchases and mean purchase rate.
item_purchases = train.select(['item_id', 'purchase'])
sdf_item_agg = item_purchases.groupBy('item_id').agg(
    f.sum('purchase').alias('item_purchase_sum'),
    f.mean('purchase').alias('item_purchase_mean'),
)

# Attach the item statistics to both frames.
# NOTE(review): the inner join silently removes test rows whose item never
# occurs in train; the notebook does not check item overlap -- confirm that
# no test rows are lost here before building a submission from X_test.
X_train = X_train.join(sdf_item_agg, on='item_id', how='inner')
X_test = X_test.join(sdf_item_agg, on='item_id', how='inner')

## CountVectorizer для жанров

In [32]:
from pyspark.ml.feature import CountVectorizer

In [33]:
# Replace missing genre strings with the placeholder 'unknown' so the later
# split into a genre list always has a string to work on.
items = items.na.fill({ 'genres': 'unknown'})

In [34]:
# Attach the item metadata columns (content_type, title, year, genres) to both frames.
X_train = X_train.join(items, on='item_id', how='left')
X_test = X_test.join(items, on='item_id', how='left')

In [142]:
# X_train.show()

In [143]:
# X_test.show()

In [37]:
# Split the comma-separated genres string into an array column for CountVectorizer.
X_train = X_train.withColumn("genres_list", f.split(f.col("genres"), ','))

In [144]:
# X_train.show()

In [39]:
# Learn the genre vocabulary from the training rows only ...
cv_train = CountVectorizer(inputCol="genres_list", outputCol="genres_cv")

cv_model = cv_train.fit(X_train)

# ... and add the resulting genre count vector as column `genres_cv`.
X_train = cv_model.transform(X_train)

In [145]:
# X_train.show()

In [44]:
# Same genre split for the test frame.
X_test = X_test.withColumn("genres_list", f.split(f.col("genres"), ','))

In [146]:
# X_test.show()

In [46]:
# Encode the test genres with the vocabulary that was fitted on X_train.
X_test = cv_model.transform(X_test)

In [48]:
# X_test.show()

+-------+-------+-----------------+--------------------+-----------------+--------------------+------------+--------------------+----+--------------------+--------------------+--------------------+
|item_id|user_id|user_purchase_sum|  user_purchase_mean|item_purchase_sum|  item_purchase_mean|content_type|               title|year|              genres|         genres_list|           genres_cv|
+-------+-------+-----------------+--------------------+-----------------+--------------------+------------+--------------------+----+--------------------+--------------------+--------------------+
|   8389| 901323|                1|3.846153846153846E-4|                8|0.005979073243647235|           1|пес в сапогах (су...|1981|Мультфильмы,Детск...|[Мультфильмы, Дет...|(84,[6,14,19,22],...|
|   8389| 928231|                2|7.584376185058779E-4|                8|0.005979073243647235|           1|пес в сапогах (су...|1981|Мультфильмы,Детск...|[Мультфильмы, Дет...|(84,[6,14,19,22],...|
|   8389| 

## Эмбеддинги из ALS 

In [49]:
matrix_u.show()

+------+--------------------+
|    id|            features|
+------+--------------------+
|523860|[-6.827328E-4, 8....|
|728960|[-8.58977E-5, 1.0...|
|731490|[0.0, 0.0, 0.0, 0...|
|736010|[-9.690348E-5, 1....|
|739230|[-8.503938E-5, 1....|
|741210|[-4.174811E-6, 3....|
|747780|[0.0, 0.0, 0.0, 0...|
|748500|[-1.094802E-6, 1....|
|752360|[-1.4963507E-4, 1...|
|752570|[-1.1249864E-6, 8...|
|754230|[-0.0022404697, 0...|
|765780|[-3.0246412E-4, 4...|
|768480|[-1.8377606E-4, 1...|
|768510|[-3.900799E-4, 3....|
|769240|[-2.7155953E-16, ...|
|780610|[-1.2636714E-6, 2...|
|781080|[0.0, 0.0, 0.0, 0...|
|781430|[-1.9749602E-4, 3...|
|785860|[-1.6113782E-4, 2...|
|793430|[-2.7821903E-7, -...|
+------+--------------------+
only showing top 20 rows



In [50]:
matrix_v.show()

+----+--------------------+
|  id|            features|
+----+--------------------+
| 400|[-4.6870817E-5, 5...|
| 430|[-1.4768392E-5, 1...|
| 540|[-9.280533E-6, 9....|
|1320|[0.0, 0.0, 0.0, 0...|
|1870|[-1.6594844E-5, -...|
|2360|[0.0, 0.0, 0.0, 0...|
|2830|[-2.550362E-5, 2....|
|2990|[-4.126096E-5, 4....|
|3110|[-4.623342E-5, 5....|
|3250|[-0.0017411183, 0...|
|3470|[0.0, 0.0, 0.0, 0...|
|3630|[-3.5064313E-5, 4...|
|3750|[-6.91573E-6, 1.5...|
|4120|[-0.0020157015, 0...|
|5170|[-0.0017182988, 0...|
|5340|[-4.953657E-5, 5....|
|5510|[-0.0036376354, 0...|
|5530|[-1.9856334E-4, 1...|
|5550|[-1.9855585E-4, 1...|
|5660|[-4.0021852E-5, 2...|
+----+--------------------+
only showing top 20 rows



In [51]:
# Rename the ALS factor tables so they can be joined onto the feature frames:
# `id` becomes the matching key column and `features` gets a per-side name.
# (The original cell repeated the `features` renames a second time; renaming a
# column that no longer exists is a no-op, so the duplicate lines were removed.)
matrix_u = matrix_u.withColumnRenamed('id', 'user_id')
matrix_v = matrix_v.withColumnRenamed('id', 'item_id')

matrix_u = matrix_u.withColumnRenamed('features', 'features_u')
matrix_v = matrix_v.withColumnRenamed('features', 'features_v')

In [52]:
# Attach the ALS user factors (features_u) and item factors (features_v) to the training rows.
X_train = X_train.join(matrix_u , on = ['user_id'] , how = 'left')
X_train = X_train.join(matrix_v , on = ['item_id'] , how = 'left')

In [53]:
# Attach the same ALS user and item factors to the test rows.
X_test = X_test.join(matrix_u , on = ['user_id'] , how = 'left')
X_test = X_test.join(matrix_v , on = ['item_id'] , how = 'left')

#  БУСТИНГ

In [56]:
from pyspark.ml.feature import VectorAssembler , StandardScaler
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier , LogisticRegression

In [126]:
# Keys, label, aggregate features and the genre vector are taken as they are ...
train_plain_columns = [
    "item_id",
    "user_id",
    "purchase",
    "user_purchase_sum",
    "user_purchase_mean",
    "item_purchase_sum",
    "item_purchase_mean",
    "genres_cv",
]
# ... while the first 10 entries of each factor array become separate columns
# (they come out named `features_u[0]` ... `features_v[9]`).
train_user_factor_columns = [X_train["features_u"][i] for i in range(10)]
train_item_factor_columns = [X_train["features_v"][i] for i in range(10)]

X_train_exploded = X_train.select(
    *[X_train[column_name] for column_name in train_plain_columns],
    *train_user_factor_columns,
    *train_item_factor_columns
)

In [127]:
# Same layout as the training frame, minus the `purchase` label.
test_plain_columns = [
    "item_id",
    "user_id",
    "user_purchase_sum",
    "user_purchase_mean",
    "item_purchase_sum",
    "item_purchase_mean",
    "genres_cv",
]
# The first 10 entries of each factor array become separate columns.
test_user_factor_columns = [X_test["features_u"][i] for i in range(10)]
test_item_factor_columns = [X_test["features_v"][i] for i in range(10)]

X_test_exploded = X_test.select(
    *[X_test[column_name] for column_name in test_plain_columns],
    *test_user_factor_columns,
    *test_item_factor_columns
)

In [147]:
# X_train_exploded.columns

In [130]:
# Input columns for the VectorAssembler: the four aggregate statistics, the
# genre count vector, then the 10 user factors and the 10 item factors.
# The label `purchase` is deliberately not part of the feature list.
features_list = (
    [
        'user_purchase_sum',
        'user_purchase_mean',
        'item_purchase_sum',
        'item_purchase_mean',
        'genres_cv',
    ]
    + ['features_u[{}]'.format(position) for position in range(10)]
    + ['features_v[{}]'.format(position) for position in range(10)]
)

In [131]:
va = VectorAssembler(inputCols = features_list, outputCol = "features")

In [132]:
# Standardise the assembled `features` vector (centre and scale) into `features_scaled`.
# NOTE(review): the LogisticRegression defined in this notebook reads `features`,
# not `features_scaled`, so this scaled column is not consumed by the classifier.
ss = StandardScaler(withMean = True, withStd = True , inputCol="features", outputCol="features_scaled")

In [133]:
# Regularised logistic regression on the assembled `features` vector, with
# `purchase` as the label and at most 15 optimiser iterations.
lr = LogisticRegression( featuresCol = 'features', labelCol = 'purchase', maxIter = 15,
                       regParam=0.1)

In [134]:
# Assemble the features, then fit the logistic regression.
# The StandardScaler stage `ss` was removed from the pipeline: it writes
# `features_scaled`, but `lr` reads `features`, so the scaler never influenced the
# model. It only cost an extra pass over the data plus a dense copy of every
# feature vector (withMean=True densifies the sparse genre counts). Predictions
# are unchanged. LogisticRegression also standardises features internally by
# default (`standardization=True`).
pipe = Pipeline(stages = [va, lr])

In [136]:
model_lr = pipe.fit(X_train_exploded)

In [137]:
predictions = model_lr.transform(X_test_exploded)

In [138]:
# Keep the key columns and the probability vector, sort by (user_id, item_id)
# and collect everything to the driver as a pandas DataFrame.
res_predictions = predictions.select(['user_id','item_id','probability'])
res_predictions = res_predictions.orderBy( f.col("user_id").asc(),f.col("item_id").asc() )
res_predictions_local = res_predictions.toPandas()
# Take element 1 of each probability vector as the purchase score
# (presumably the probability of class 1 -- confirm), then drop the vector column.
res_predictions_local['purchase'] = res_predictions_local['probability'].apply(lambda x: x[1])
res_predictions_local = res_predictions_local.drop('probability', axis=1)


# Write the result twice: next to the notebook and three directories up.
res_predictions_local.to_csv("lab03_lr.csv" , sep = ',')
res_predictions_local.to_csv("../../../lab03.csv" , sep = ',')
# ROC AUC came out at 0.846554609141

  Unsupported type in conversion to Arrow: VectorUDT
Attempting non-optimization as 'spark.sql.execution.arrow.fallback.enabled' is set to true.


In [141]:
res_predictions_local

Unnamed: 0,user_id,item_id,purchase
0,1654,336,0.001707
1,1654,678,0.001702
2,1654,691,0.001787
3,1654,696,0.001800
4,1654,763,0.001797
5,1654,795,0.003303
6,1654,861,0.001835
7,1654,1137,0.002096
8,1654,1159,0.001787
9,1654,1428,0.002624


# бесконечный бустинг

In [201]:
va = VectorAssembler(inputCols = features_list, outputCol = "features")

In [202]:
# Gradient-boosted trees on the assembled `features` vector:
# 50 boosting iterations of depth-3 trees, with a fixed seed.
gbt = GBTClassifier(
    labelCol='purchase',
    featuresCol="features",
    maxDepth=3,
    maxIter=50,
    minInstancesPerNode=1,
    seed=42,
)

In [203]:
pipe = Pipeline(stages = [va, gbt])

In [None]:
model_gbt = pipe.fit(X_train_exploded)

In [None]:
# Score the test features with the fitted GBT pipeline. The original line used
# `model`, which at this point refers to an ALS / cross-validation model fitted
# earlier in the notebook, not to the boosting pipeline fitted just above.
predictions = model_gbt.transform(X_test_exploded)

In [None]:
predictions.show()

In [None]:
# Persist the fitted GBT pipeline. The original cell only referenced the bound
# method `model.save` without calling it (and on the wrong model), so nothing was
# ever written.
# NOTE(review): the target path is a placeholder relative to the default
# filesystem -- adjust it to the desired location.
model_gbt.write().overwrite().save("lab03_gbt_model")