In [1]:
import os
import sys
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 16 --executor-memory 4g --executor-cores 8 --driver-memory 4g pyspark-shell'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')

sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))
exec(open(os.path.join(spark_home, 'python/pyspark/shell.py')).read())

Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 2.4.7
      /_/

Using Python version 3.6.5 (default, Apr 29 2018 16:14:56)
SparkSession available as 'spark'.


In [2]:
PARTITIONS = 256

In [3]:
import pyspark.sql.functions as f

In [4]:
from pyspark.sql.types import ArrayType, StructType, StructField, DataType, StringType, LongType, TimestampType, BooleanType, IntegerType
schema = StructType(fields=[
                         StructField("item_id",IntegerType(), True),\
                         StructField("channel_id",IntegerType(), True),\
                         StructField("datetime_availability_start",TimestampType(), True),\
                         StructField("datetime_availability_stop",TimestampType(), True),\
                         StructField("datetime_show_start",TimestampType(), True),\
                         StructField("datetime_show_stop",TimestampType(), True),\
                         StructField("content_type",IntegerType(), True),\
                         StructField("title",StringType(), True),\
                         StructField("year", StringType(), True),\
                         StructField('genres',StringType(), True),\
                         StructField("region_id",IntegerType(), True),\
                         StructField("broken",StringType(), True)])

In [9]:
raw_items = spark.read.csv("/labs/slaba03/laba03_items.csv",
                       sep='\t',
                       header=True,
                       nullValue = '',
                       multiLine=True,
                       enforceSchema=True,
                       escape='"',
                       dateFormat="yyyy.dd",
                       columnNameOfCorruptRecord='broken',
                       schema=schema)

raw_items = raw_items.drop("datetime_availability_start",
                     "datetime_availability_stop", "datetime_show_start",
                     "datetime_show_stop",
                     "channel_id", "region_id", "broken")
raw_items = raw_items.na.fill({'title': ' ', 'genres': '', 'year': 'empty'})
items = raw_items.filter("content_type == 1").withColumn("category_words", f.split("genres", ",")).drop("genres")

In [7]:
items.printSchema()

root
 |-- item_id: integer (nullable = true)
 |-- content_type: integer (nullable = true)
 |-- title: string (nullable = false)
 |-- year: string (nullable = false)
 |-- category_words: array (nullable = false)
 |    |-- element: string (containsNull = true)



In [67]:
from pyspark.ml.feature import Tokenizer, VectorAssembler,\
    RegexTokenizer, StopWordsRemover, MinMaxScaler,\
    HashingTF, StringIndexer, CountVectorizer, IDF, OneHotEncoder, Normalizer, PCA

In [14]:
#tokenizer = Tokenizer(inputCol="title")
tokenizer = RegexTokenizer(inputCol="title", pattern=u'[\p{L}+]{2,}', gaps=False)

stop_words = list(set.union(set(StopWordsRemover.loadDefaultStopWords("russian")),
set(StopWordsRemover.loadDefaultStopWords("english"))))

remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol = "words")

tf = HashingTF(inputCol=remover.getOutputCol(), outputCol="title_tf_idf", numFeatures=10000, binary=True)

#tf = CountVectorizer(inputCol=remover.getOutputCol())
#idf = IDF(inputCol=tf.getOutputCol())
#scaler = MinMaxScaler(inputCol=idf.getOutputCol(), outputCol="title_tf_idf")

tf_c = HashingTF(inputCol="category_words", outputCol="cat", numFeatures=100, binary=True)

si = StringIndexer(inputCol="year", outputCol="si_years")
ohe_y = OneHotEncoder(inputCol=si.getOutputCol(), outputCol="years")

va = VectorAssembler(inputCols = ["title_tf_idf", "cat", "years"], handleInvalid = "keep")
nrm = Normalizer(inputCol = va.getOutputCol(), outputCol="features",)

from pyspark.ml import Pipeline
pipeline = Pipeline(stages=[
    tokenizer,
    remover,
    tf,
    #idf,
    #scaler,
    tf_c,
    si,
    ohe_y,
    va,
    nrm
])

pipeline_model = pipeline.fit(items)
df = pipeline_model.transform(items).select("item_id","features").repartition(PARTITIONS).distinct().cache()
df.show()

+-------+--------------------+
|item_id|            features|
+-------+--------------------+
|    578|(10180,[7684,1003...|
| 100452|(10180,[665,3873,...|
|  92471|(10180,[2911,4493...|
|  75516|(10180,[2073,4549...|
| 100500|(10180,[9543,1003...|
|  11153|(10180,[5715,1001...|
|   7679|(10180,[1188,2386...|
|  69922|(10180,[5490,9632...|
| 100218|(10180,[1752,6654...|
|   7632|(10180,[1249,4068...|
|   4711|(10180,[4061,1001...|
|  86748|(10180,[7243,1000...|
|  87546|(10180,[3420,7636...|
|  91995|(10180,[665,715,1...|
|  93505|(10180,[6127,9160...|
|  92528|(10180,[8274,1000...|
|  72961|(10180,[3857,7325...|
|  99900|(10180,[665,1646,...|
| 100188|(10180,[4083,4225...|
|  88648|(10180,[1466,9269...|
+-------+--------------------+
only showing top 20 rows



In [17]:
schema = StructType(fields=[
                         StructField("user_id",LongType(), True),\
                         StructField("item_id",LongType(), True),\
                         StructField("ts_start",LongType(), True),\
                         StructField("ts_end",LongType(), True),\
                         StructField("item_type",StringType(), True)])
raw_views = spark.read.csv("/labs/slaba03/laba03_views_programmes.csv", sep=',', header=True, schema=schema)
raw_views = raw_views.withColumn("ts_start", f.hour(f.from_unixtime("ts_start"))).\
                      withColumn("ts_end", f.hour(f.from_unixtime("ts_end"))).\
                      withColumn("item_type", f.regexp_replace('item_type', 'live', '1')).\
                      withColumn("item_type", f.regexp_replace('item_type', 'pvr', '0')).\
                      withColumn("is_live", f.col('item_type').cast(IntegerType())).\
                      drop("item_type")


In [18]:
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

arr_sum = udf(lambda x: sum(x), IntegerType())   

views = raw_views.groupBy("user_id").agg(f.collect_list("item_id").alias("item_ids"),\
                                 f.collect_list("ts_start").alias("h_starts"),\
                                 f.collect_list("ts_end").alias("h_ends"),\
                                 f.sum("is_live").alias("sum_is_live"),\
                                 f.count("is_live").alias("count_is_live"))
views = views.withColumn("live_per", f.col("sum_is_live") / f.col("count_is_live") ).\
              drop("count_is_live", "sum_is_live").repartition(PARTITIONS).cache()

In [22]:
tf_item = HashingTF(inputCol="item_ids", numFeatures=10000, binary=True)
tf_h_start = HashingTF(inputCol="h_starts", numFeatures=24)
tf_h_end = HashingTF(inputCol="h_ends", numFeatures=24)

scaler_h_start = MinMaxScaler(inputCol=tf_h_start.getOutputCol())
scaler_h_end = MinMaxScaler(inputCol=tf_h_end.getOutputCol())


va = VectorAssembler(inputCols = [tf_item.getOutputCol(),
                                  scaler_h_start.getOutputCol(),
                                  scaler_h_end.getOutputCol(),
                                  "live_per"])

nrm = Normalizer(inputCol=va.getOutputCol(), outputCol="u_features")



pipeline = Pipeline(stages=[tf_item, tf_h_start, tf_h_end, scaler_h_start, scaler_h_end, va, nrm])

pipeline_model = pipeline.fit(views)
df_views = pipeline_model.transform(views).select("user_id", "u_features").repartition(PARTITIONS).distinct().cache()


In [23]:
df_views.show()

+-------+--------------------+
|user_id|          u_features|
+-------+--------------------+
| 778092|(10049,[1753,2233...|
| 878654|(10049,[14,69,84,...|
| 932242|(10049,[37,1263,3...|
| 897562|(10049,[8,18,25,4...|
| 872344|(10049,[6635,7958...|
| 927038|(10049,[146,149,1...|
| 741217|(10049,[1,2,19,28...|
| 920307|(10049,[389,896,1...|
| 857007|(10049,[18,74,152...|
| 844660|(10049,[0,12,17,2...|
| 822408|(10049,[44,48,69,...|
| 858175|(10049,[17,23,26,...|
| 852348|(10049,[130,242,2...|
| 825660|(10049,[19,31,53,...|
| 927510|(10049,[198,229,2...|
| 906692|(10049,[6,7,19,22...|
| 864958|(10049,[6,10,31,4...|
| 849009|(10049,[71,110,12...|
| 866205|(10049,[613,3286,...|
| 920380|(10049,[525,671,7...|
+-------+--------------------+
only showing top 20 rows



In [89]:
schema = StructType(fields=[
                         StructField("user_id",LongType(), True),\
                         StructField("item_id",LongType(), True),\
                         StructField("purchase",IntegerType(), True)])
raw_train = spark.read.csv("/labs/slaba03/laba03_train.csv", sep=',', header=True, schema=schema)

In [90]:
data = raw_train.sampleBy("purchase", fractions={0: 0.005, 1: 1}, seed=5757).repartition(PARTITIONS)

In [91]:
va = VectorAssembler(inputCols = ["u_features", "features"], outputCol="all_features", handleInvalid="skip")

In [92]:
data = va.transform(data.join(df_views, "user_id", "left").join(df, "item_id", "left")).drop("u_features", "features").repartition(PARTITIONS).cache()
data.show()

+-------+-------+--------+--------------------+
|item_id|user_id|purchase|        all_features|
+-------+-------+--------+--------------------+
|  11214| 625638|       0|(20229,[370,734,8...|
|  88768| 875711|       1|(20229,[35,58,230...|
|  94689| 747028|       1|(20229,[0,1,2,3,5...|
|  10065| 747028|       1|(20229,[0,1,2,3,5...|
|  96398| 831574|       0|(20229,[5,6,20,37...|
|  95416| 865208|       0|(20229,[22,24,47,...|
|  74569| 921908|       1|(20229,[114,333,3...|
|   7573| 852632|       0|(20229,[24,63,113...|
| 102441| 894940|       0|(20229,[154,184,2...|
|  74570| 871411|       1|(20229,[74,158,17...|
| 100410| 902914|       0|(20229,[17,57,89,...|
| 101992| 937376|       1|(20229,[64,160,17...|
|  80416| 776138|       1|(20229,[2,46,47,4...|
|   8588| 742324|       1|(20229,[19,71,104...|
| 100100| 875739|       0|(20229,[192,274,2...|
|  73659| 811663|       0|(20229,[7,18,23,2...|
|  89636| 851412|       0|(20229,[51,63,78,...|
|   4302| 899297|       0|(20229,[84,100

# Обучение модели

In [93]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr = LogisticRegression(featuresCol='all_features', labelCol="purchase")

In [94]:
train = data.sampleBy("purchase", fractions={0: 0.9, 1: 0.9}, seed=5757).repartition(PARTITIONS)
test = data.join(train, (data.user_id == train.user_id) & (data.item_id == train.item_id), how="leftanti").repartition(PARTITIONS)

In [95]:
lr_model = lr.fit(train)
predictions = lr_model.transform(test)

In [96]:
evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="purchase", metricName='areaUnderROC')
evaluator.evaluate(predictions)

0.8234304875099727

In [240]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
paramGrid = ParamGridBuilder().addGrid(lr.maxIter, [15, 30, 50, 100])\
                              .addGrid(lr.regParam, [0.01, 0.05, 0.1])\
                              .addGrid(lr.threshold, [0.05, 0.1, 0.5])\
                              .build()

In [242]:
crossval = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid,
                              evaluator=evaluator, numFolds=5, parallelism=5)

In [243]:
cv_model = crossval.fit(train)

In [360]:
cv_model.getEstimatorParamMaps()[-3]

{Param(parent='LogisticRegression_7930e0157b5e', name='maxIter', doc='max number of iterations (>= 0).'): 100,
 Param(parent='LogisticRegression_7930e0157b5e', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
 Param(parent='LogisticRegression_7930e0157b5e', name='threshold', doc='Threshold in binary classification prediction, in range [0, 1]. If threshold and thresholds are both set, they must match.e.g. if threshold is p, then thresholds must be equal to [1-p, p].'): 0.05}

In [245]:
best_model = lr.fit(data)

# Отправка результатов

In [415]:
schema = StructType(fields=[
                         StructField("user_id", LongType(), True),\
                         StructField("item_id", LongType(), True)])
raw_test = spark.read.csv("/labs/slaba03/laba03_test.csv", sep=',', header=True, schema=schema)

In [416]:
va = VectorAssembler(inputCols = ["u_features", "features"], outputCol="all_features", handleInvalid="keep")
t = va.transform(raw_test.join(df_views, "user_id", "left").join(df, "item_id", "left")).drop("u_features", "features").repartition(PARTITIONS).cache()

In [430]:
p = best_model.transform(t)

In [431]:
import pyspark.sql.types as T

et_1 = f.udf(lambda v: v.toArray().tolist()[1], T.FloatType())

p = p.drop("all_features").withColumn("purchase", et_1("probability"))
p = p.drop('rawPrediction', 'probability', 'prediction').repartition(PARTITIONS)

In [432]:
output = p.select('user_id', 'item_id', 'purchase').toPandas()

In [433]:
output['item_id'].count() == 2156840

True

In [434]:
output = output.fillna(0.0)

In [435]:
output = output.sort_values(['user_id', 'item_id'])

In [436]:
output.to_csv("lab03.csv")

In [97]:
spark.stop()