In [48]:
# NOTE(review): this cell carries execution count 48 but sits before cell 1,
# so it looks like a leftover from an interactive session. Running it during a
# top-to-bottom run would presumably end the session before any later cell
# executes - confirm and consider deleting the cell.
exit()

In [1]:
import os
import sys
# Environment read by PySpark at start-up: the Python interpreter to use,
# the Spark installation directory and the arguments handed to spark-submit.
os.environ.update({
    "PYSPARK_PYTHON": '/opt/anaconda/envs/bd9/bin/python',
    "SPARK_HOME": '/usr/hdp/current/spark2-client',
    "PYSPARK_SUBMIT_ARGS": '--num-executors 9 pyspark-shell',
})

spark_home = os.environ.get('SPARK_HOME')

# Prepend the bundled py4j archive and Spark's Python sources to the import
# path. The resulting order (py4j first, then python/) is the same as two
# successive insert(0, ...) calls made in the opposite order.
sys.path[:0] = [
    os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'),
    os.path.join(spark_home, 'python'),
]

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# Session configuration: set the application name and pass -1 as the
# automatic broadcast-join threshold, then create (or reuse) the session.
conf = (
    SparkConf()
    .set("spark.app.name", "korneev")
    .set('spark.sql.autoBroadcastJoinThreshold', -1)
)
spark = SparkSession.builder.config(conf=conf).getOrCreate()

In [3]:
import pandas as pd
import numpy as np

In [4]:
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, DoubleType

In [5]:
# Column layout used when reading the train and test CSV files:
# two integer ids followed by a floating-point purchase flag.
list_fields = [
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("user_id", IntegerType()),
        ("item_id", IntegerType()),
        ("purchase", DoubleType()),
    )
]

In [6]:
# Explicit schema built from the field list above.
schema = StructType(fields=list_fields)

In [7]:
# Load the four lab data sets. Train and test share the explicit schema;
# the views and the tab-separated items file are read with headers only.
_typed_csv_options = dict(header=True, schema=schema)

df_train = spark.read.csv('/labs/slaba03/laba03_train.csv', **_typed_csv_options)
test = spark.read.csv('/labs/slaba03/laba03_test.csv', **_typed_csv_options)
df_views = spark.read.csv('/labs/slaba03/laba03_views_programmes.csv', header=True)
df_items = spark.read.csv('/labs/slaba03/laba03_items.csv', sep='\t', header=True)

In [8]:
# Preview the first five rows of the training data.
df_train.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  74107|     0.0|
|   1654|  89249|     0.0|
|   1654|  99982|     0.0|
|   1654|  89901|     0.0|
|   1654| 100504|     0.0|
+-------+-------+--------+
only showing top 5 rows



In [9]:
# Number of partitions of the RDD behind the training DataFrame.
df_train.rdd.getNumPartitions()

9

In [10]:
# Preview the first five rows of the test data.
test.show(5)

+-------+-------+--------+
|user_id|item_id|purchase|
+-------+-------+--------+
|   1654|  94814|    null|
|   1654|  93629|    null|
|   1654|   9980|    null|
|   1654|  95099|    null|
|   1654|  11265|    null|
+-------+-------+--------+
only showing top 5 rows



In [11]:
# Preview the first five rows of the programme-views data.
df_views.show(5)

+-------+-------+----------+----------+---------+
|user_id|item_id|  ts_start|    ts_end|item_type|
+-------+-------+----------+----------+---------+
|      0|7101053|1491409931|1491411600|     live|
|      0|7101054|1491412481|1491451571|     live|
|      0|7101054|1491411640|1491412481|     live|
|      0|6184414|1486191290|1486191640|     live|
|    257|4436877|1490628499|1490630256|     live|
+-------+-------+----------+----------+---------+
only showing top 5 rows



In [12]:
# Show one item record vertically (one field per line), without truncating values.
df_items.show(1, vertical=True, truncate=False)

-RECORD 0--------------------------------------------------------------------
 item_id                     | 65667                                         
 channel_id                  | null                                          
 datetime_availability_start | 1970-01-01T00:00:00Z                          
 datetime_availability_stop  | 2018-01-01T00:00:00Z                          
 datetime_show_start         | null                                          
 datetime_show_stop          | null                                          
 content_type                | 1                                             
 title                       | на пробах только девушки (all girl auditions) 
 year                        | 2013.0                                        
 genres                      | Эротика                                       
 region_id                   | null                                          
only showing top 1 row



In [13]:
# Set the Spark SQL shuffle partition count to 200.
# NOTE(review): 200 appears to be Spark's default for this setting already -
# confirm whether the explicit override is needed.
spark.conf.set("spark.sql.shuffle.partitions", 200)

In [14]:
# split

In [15]:
# Stratified hold-out: sample a 0.8 fraction of each purchase class into
# `train` (fixed seed), then put every (user_id, item_id) pair that was not
# sampled into `valid` via a left anti join. Both frames are cached.
_fraction_by_label = {0: 0.8, 1: 0.8}
train = df_train.sampleBy("purchase", fractions=_fraction_by_label, seed=42).cache()

_pair_key = ["user_id", "item_id"]
valid = df_train.join(train, on=_pair_key, how="leftanti").cache()

In [16]:
# Target distribution: number of rows for each value of `purchase`.
df_train.groupBy("purchase").count().collect()

[Row(purchase=0.0, count=5021720), Row(purchase=1.0, count=10904)]

In [17]:
import pyspark.sql.functions as f

In [18]:
# Purchases per user: the sum of the purchase flag over the user's train rows.
user_purchase = (
    train.groupBy('user_id')
    .agg(f.sum('purchase').alias('user_purchase'))
    .select('user_purchase', 'user_id')
    .cache()
)
user_purchase.show(2)

+-------------+-------+
|user_purchase|user_id|
+-------------+-------+
|         53.0| 754230|
|          1.0| 761341|
+-------------+-------+
only showing top 2 rows



In [19]:
# Share of all purchases: each user's purchase total divided by the grand total.
all_purchase = user_purchase.agg(f.sum("user_purchase")).first()[0]
user_purchase = user_purchase.withColumn(
    "user_purchase_per",
    f.col("user_purchase") / all_purchase,
)
user_purchase.show(2)

+-------------+-------+--------------------+
|user_purchase|user_id|   user_purchase_per|
+-------------+-------+--------------------+
|         53.0| 754230|0.006083562901744...|
|          1.0| 761341|1.147842056932966E-4|
+-------------+-------+--------------------+
only showing top 2 rows



In [20]:
# Views per user: the number of train rows for each user_id.
user_purchase_v = (
    train.groupBy('user_id')
    .count()
    .withColumnRenamed('count', 'user_count')
    .select('user_count', 'user_id')
    .cache()
)
user_purchase_v.show(2)

+----------+-------+
|user_count|user_id|
+----------+-------+
|      2053| 825061|
|      2041| 833685|
+----------+-------+
only showing top 2 rows



In [21]:
# Purchases per item: the sum of the purchase flag over the item's train rows.
item_purchase = (
    train.groupBy('item_id')
    .agg(f.sum('purchase').alias('item_purchase'))
    .select('item_purchase', 'item_id')
    .cache()
)
item_purchase.show(2)

+-------------+-------+
|item_purchase|item_id|
+-------------+-------+
|          3.0|  90019|
|          1.0|  72820|
+-------------+-------+
only showing top 2 rows



In [22]:
# Share of all purchases: each item's purchase total divided by the grand total.
all_purchase_i = item_purchase.agg(f.sum("item_purchase")).first()[0]
item_purchase = item_purchase.withColumn(
    "item_purchase_per",
    f.col("item_purchase") / all_purchase_i,
)
item_purchase.show(2)

+-------------+-------+--------------------+
|item_purchase|item_id|   item_purchase_per|
+-------------+-------+--------------------+
|          3.0|  90019|3.443526170798898E-4|
|          1.0|  72820|1.147842056932966E-4|
+-------------+-------+--------------------+
only showing top 2 rows



In [23]:
# Top-N most purchased items.
# An item whose id is in this list later gets a 1 in a dedicated column.

N = 10
_top_rows = (
    item_purchase.orderBy(f.desc("item_purchase"))
    .select('item_id')
    .head(N)
)
list_topN = [row['item_id'] for row in _top_rows]
list_topN

[67036, 90762, 8661, 10585, 77442, 8658, 93666, 9919, 9911, 89637]

In [24]:
def check_topN(item, list_topN):
    """Return 1 if ``item`` is contained in ``list_topN``, otherwise 0."""
    return int(item in list_topN)
# Spark UDF wrapper around check_topN, bound to the current list_topN; returns an integer column.
udf_check_topN = f.udf(lambda item_id: check_topN(item_id, list_topN),
                       returnType=IntegerType())

In [25]:
# Flag top-N items with a built-in column expression instead of a Python UDF,
# so the membership test does not have to call back into Python for each row.
# The result is still an integer 1/0; a null item_id does not satisfy the
# `when` condition and therefore gets 0, matching what the UDF returned.
item_purchase = item_purchase.withColumn(
    "item_id_topN",
    f.when(f.col("item_id").isin(list_topN), 1).otherwise(0),
)

In [26]:
# Preview two rows of the per-item purchase statistics, including the top-N flag.
item_purchase.show(2)

+-------------+-------+--------------------+------------+
|item_purchase|item_id|   item_purchase_per|item_id_topN|
+-------------+-------+--------------------+------------+
|          3.0|  90019|3.443526170798898E-4|           0|
|          1.0|  72820|1.147842056932966E-4|           0|
+-------------+-------+--------------------+------------+
only showing top 2 rows



In [27]:
# Views per item: the number of train rows for each item_id.
item_purchase_v = (
    train.groupBy('item_id')
    .count()
    .withColumnRenamed('count', 'item_count')
    .select('item_count', 'item_id')
    .cache()
)
item_purchase_v.show(2)

+----------+-------+
|item_count|item_id|
+----------+-------+
|      1075|   8638|
|      1096|  74757|
+----------+-------+
only showing top 2 rows



In [28]:
# Item metadata used for features: title, genres and year.
# Missing titles and genres are replaced with "_", a missing year with "0000",
# and the year column is then cast to an integer.
items_genres_years = (
    df_items.select('item_id', 'title', 'genres', 'year')
    .na.fill({'title': u"_", 'genres': u"_", 'year': u"0000"})
    .withColumn('year', f.col('year').cast(IntegerType()))
)

In [29]:


from pyspark.ml.feature import Tokenizer, RegexTokenizer, HashingTF, IDF
import re
# Compiled once instead of on every call. The raw string keeps ``\w`` and
# ``\d`` from being parsed as (invalid) string escape sequences.
_TOKEN_RE = re.compile(r'[\w\d]{2,}', re.U)


def text_regexp_filter(string):
    """Lower-case ``string`` and keep only its tokens of two or more word characters.

    Args:
        string: text to normalise; ``None`` is treated as empty text.

    Returns:
        The matched tokens joined by single spaces, or ``""`` when nothing
        matches or ``string`` is ``None``.
    """
    if string is None:
        # A null value would otherwise fail on .lower().
        return ""
    return " ".join(_TOKEN_RE.findall(string.lower()))
# Spark UDF applying text_regexp_filter to a string column.
udf_text_regexp_filter = f.udf(text_regexp_filter, returnType=StringType())

In [30]:
# ---- genres -> TF-IDF ----
# Clean the text, split it into words, hash the words into 300
# term-frequency buckets, then rescale with a fitted IDF model.
items_genres_years = items_genres_years.withColumn(
    "genres_filter",
    udf_text_regexp_filter(items_genres_years["genres"]),
)

tokenizer = Tokenizer(inputCol="genres_filter", outputCol="words")
ht = HashingTF(inputCol="words", outputCol="features", numFeatures=300)
items_genres_years = ht.transform(tokenizer.transform(items_genres_years))

idf = IDF(inputCol="features", outputCol="genres_tfidf").fit(items_genres_years)
items_genres_years = idf.transform(items_genres_years)


# ---- title -> TF-IDF ----
# Same steps as for genres, with separate intermediate column names.
items_genres_years = items_genres_years.withColumn(
    "title_filter",
    udf_text_regexp_filter(items_genres_years["title"]),
)

tokenizer = Tokenizer(inputCol="title_filter", outputCol="words_title")
ht = HashingTF(inputCol="words_title", outputCol="features_title", numFeatures=300)
items_genres_years = ht.transform(tokenizer.transform(items_genres_years))

idf = IDF(inputCol="features_title", outputCol="title_tfidf").fit(items_genres_years)
items_genres_years = idf.transform(items_genres_years)

# Keep only the columns needed downstream; the intermediate ones are dropped.
items_genres_years = items_genres_years.select('item_id', 'year', 'genres_tfidf', 'title_tfidf')



def _with_aggregate_features(df):
    """Attach the user, item and item-metadata feature columns to ``df``.

    The same sequence is applied to the train, validation and test frames,
    so it lives in one place to keep the three in sync. All joins are left
    joins: rows of ``df`` are kept even when a user or item has no entry in
    the joined table, in which case the added columns are null.

    Args:
        df: DataFrame with ``user_id`` and ``item_id`` columns.

    Returns:
        ``df`` with the aggregate, ratio and metadata columns added.
    """
    # user: purchase total and share, then row count
    df = df.join(user_purchase, on='user_id', how='left')
    df = df.join(user_purchase_v, on='user_id', how='left')

    # item: purchase total, share and top-N flag, then row count
    df = df.join(item_purchase, on='item_id', how='left')
    df = df.join(item_purchase_v, on='item_id', how='left')

    # ratios: purchases divided by row count, per user and per item
    df = df.withColumn('user_addict', f.col('user_purchase') / f.col('user_count'))
    df = df.withColumn('item_addict', f.col('item_purchase') / f.col('item_count'))

    # item year and TF-IDF vectors for genres and title
    return df.join(items_genres_years, on='item_id', how='left')


train = _with_aggregate_features(train)
valid = _with_aggregate_features(valid)
test = _with_aggregate_features(test)

In [31]:
# Replace nulls with 0 in all three frames.
train, valid, test = (frame.na.fill(0) for frame in (train, valid, test))

In [32]:
# (Copied snippet.)
# Add a user-history vector: keep the 250 items with the most train rows and
# give each one a row id to use as its position in the vector.
from pyspark.sql.functions import monotonically_increasing_id

items_count = (
    train.groupBy('item_id')
    .count()
    .withColumnRenamed('count', 'item_count')
)

# The frame is collapsed to a single partition before ids are assigned.
# NOTE(review): presumably so the generated ids run consecutively from 0 - confirm.
items_desc_count = (
    items_count.orderBy(f.desc('item_count'))
    .limit(250)
    .coalesce(1)
    .withColumn("item_row_id", monotonically_increasing_id())
)

items_desc_count.cache()

# Train rows restricted to those 250 items, keyed by user and item row id.
train_truncated = (
    train.join(items_desc_count, on='item_id', how='inner')
    .select('user_id', 'item_row_id', 'purchase')
    .cache()
)

train_truncated.show(2)

+-------+-----------+--------+
|user_id|item_row_id|purchase|
+-------+-----------+--------+
| 761341|        124|     0.0|
| 780033|        124|     0.0|
+-------+-----------+--------+
only showing top 2 rows



In [33]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry
from pyspark.ml.linalg import VectorUDT

from pyspark.sql.functions import udf

# UDF that calls .asML() on each value of a vector column.
as_ml = udf(lambda vec: vec.asML(), VectorUDT())

# One matrix entry per truncated train row, built from its three columns in
# order: user_id (row), item_row_id (column), purchase (value).
_entries = train_truncated.rdd.map(lambda row: MatrixEntry(row[0], row[1], row[2]))
train_matrix = CoordinateMatrix(_entries)

train_row_mat_i = train_matrix.toIndexedRowMatrix()

# One row per user: the matrix row index becomes user_id and the row vector
# is converted into the `history_vec` column.
train_mat_df = (
    train_row_mat_i.rows.toDF()
    .withColumnRenamed('index', 'user_id')
    .withColumn("history_vec", as_ml("vector"))
)

In [34]:
# Attach each user's history vector to all three frames (left join on user_id).
train, valid, test = (
    frame.join(train_mat_df, 'user_id', 'left')
    for frame in (train, valid, test)
)

In [35]:
# Preview three rows of the training frame with all feature columns attached.
train.show(3)

+-------+-------+--------+-------------+-----------------+----------+-------------+--------------------+------------+----------+-----------+--------------------+----+--------------------+--------------------+--------------------+--------------------+
|user_id|item_id|purchase|user_purchase|user_purchase_per|user_count|item_purchase|   item_purchase_per|item_id_topN|item_count|user_addict|         item_addict|year|        genres_tfidf|         title_tfidf|              vector|         history_vec|
+-------+-------+--------+-------------+-----------------+----------+-------------+--------------------+------------+----------+-----------+--------------------+----+--------------------+--------------------+--------------------+--------------------+
| 728960|   8389|     0.0|          0.0|              0.0|      2081|          6.0|6.887052341597796E-4|           0|      1063|        0.0|0.005644402634054563|1981|(300,[222,246,266...|(300,[25,258,284]...|(250,[1,2,3,5,9,1...|(250,[1,2,3,5,9,1.

In [None]:
# one model 

In [36]:
from pyspark.ml.feature import VectorAssembler

In [37]:

# Model inputs, grouped by origin; the concatenation order is the order in
# which the columns are handed to the assembler.
_user_stat_cols = ["user_purchase", "user_purchase_per", "user_count"]
_item_stat_cols = ["item_purchase", "item_purchase_per", "item_count"]
_item_meta_cols = ["year", "genres_tfidf", "title_tfidf", "item_id_topN"]
_derived_cols = ['user_addict', 'item_addict', 'history_vec']

features_col = _user_stat_cols + _item_stat_cols + _item_meta_cols + _derived_cols
target_col = "purchase"

In [38]:
# Combine the feature columns into a single "features" vector column for each
# frame; only the training frame is cached.
assembler = VectorAssembler(inputCols=features_col, outputCol="features")

train_data, valid_data, test_data = (
    assembler.transform(frame) for frame in (train, valid, test)
)
train_data = train_data.cache()

In [39]:
from pyspark.ml.classification import GBTClassifier

In [40]:
# Gradient-boosted trees classifier: fit on the training data, then score the
# validation data.
_gbt_params = dict(
    maxDepth=5,
    minInstancesPerNode=3,
    maxBins=50,
    labelCol=target_col,
)
gbt = GBTClassifier(**_gbt_params)
gbt_m = gbt.fit(train_data)
pred_valid = gbt_m.transform(valid_data)

In [41]:
# Preview three validation rows together with the model's output columns.
pred_valid.show(3)

+-------+-------+--------+-------------+-----------------+----------+-------------+--------------------+------------+----------+-----------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|user_id|item_id|purchase|user_purchase|user_purchase_per|user_count|item_purchase|   item_purchase_per|item_id_topN|item_count|user_addict|         item_addict|year|        genres_tfidf|         title_tfidf|              vector|         history_vec|            features|       rawPrediction|         probability|prediction|
+-------+-------+--------+-------------+-----------------+----------+-------------+--------------------+------------+----------+-----------+--------------------+----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| 728960|  93486|     0.0

In [42]:
# Feature importances of the fitted model, as a sparse vector over the assembled features.
gbt_m.featureImportances

SparseVector(860, {0: 0.2849, 1: 0.0032, 2: 0.014, 3: 0.2939, 4: 0.0138, 5: 0.0046, 6: 0.0028, 11: 0.0, 17: 0.0066, 66: 0.0035, 79: 0.0174, 80: 0.0007, 99: 0.0025, 104: 0.0011, 138: 0.0118, 144: 0.0003, 148: 0.0082, 188: 0.0023, 229: 0.0005, 248: 0.0001, 253: 0.0047, 273: 0.0025, 307: 0.0018, 311: 0.0014, 331: 0.0009, 341: 0.0, 350: 0.0009, 357: 0.0004, 359: 0.0059, 399: 0.0011, 439: 0.0009, 474: 0.0006, 537: 0.0, 545: 0.0011, 589: 0.0002, 608: 0.1226, 609: 0.0934, 617: 0.0013, 619: 0.0027, 625: 0.0214, 628: 0.0004, 644: 0.0074, 648: 0.0088, 649: 0.0061, 658: 0.0006, 662: 0.0017, 678: 0.0014, 687: 0.0006, 695: 0.001, 696: 0.0034, 697: 0.0131, 747: 0.0015, 753: 0.013, 769: 0.0004, 808: 0.0003, 816: 0.0003, 831: 0.001, 835: 0.0024, 859: 0.0006})

In [43]:
# evaluation 
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# Area under the ROC curve on the validation predictions.
evaluator = BinaryClassificationEvaluator(labelCol=target_col,
                                          metricName='areaUnderROC')
score = evaluator.evaluate(pred_valid)
# End the cell with the value so the metric is actually displayed; previously
# it was computed but never shown.
score

In [44]:
# Apply the fitted model to the assembled test data.
pred = gbt_m.transform(test_data)

In [46]:
# Build the submission file: collect the ids and the probability column to
# pandas (the probability is stored under the target column name), sort by
# user and item, keep element 1 of each probability vector, and write the
# CSV without the index.
_submission_cols = ["user_id", "item_id"]
pred_pd = (
    pred.select(*_submission_cols, f.col("probability").alias(target_col))
    .toPandas()
    .sort_values(by=_submission_cols)
)
pred_pd[target_col] = pred_pd[target_col].apply(lambda prob: prob[1])
pred_pd.to_csv("lab03.csv", index=False)